calebnwokocha committed on
Commit fa50005 · verified · 1 Parent(s): f5b46d0

Upload 5 files

Files changed (5)
  1. GPT2.cbp +0 -5
  2. GPT2.cscope_file_list +11 -15
  3. GPT2.depend +6 -5
  4. GPT2.layout +29 -29
  5. main-ctx.cpp +1213 -839
GPT2.cbp CHANGED
@@ -34,11 +34,6 @@
 		</Compiler>
 		<Unit filename="GPT2.cbp" />
 		<Unit filename="GPT2.layout" />
-		<Unit filename="common-ggml.cpp" />
-		<Unit filename="common-ggml.h" />
-		<Unit filename="common.cpp" />
-		<Unit filename="common.h" />
-		<Unit filename="dr_wav.h" />
 		<Unit filename="ggml-aarch64.c">
 			<Option compilerVar="CC" />
 		</Unit>
GPT2.cscope_file_list CHANGED
@@ -1,22 +1,18 @@
-"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-alloc.h"
-"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-alloc.c"
-"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-impl.h"
-"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common-ggml.cpp"
-"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-quants.c"
 "C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-aarch64.h"
 "C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common.cpp"
-"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-quants.h"
-"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-backend-impl.h"
 "C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml.h"
-"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-common.h"
+"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-quants.c"
+"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-impl.h"
+"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common-ggml.cpp"
+"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-aarch64.c"
+"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\GPT2.layout"
 "C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml.c"
-"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-backend.h"
 "C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common.h"
-"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\GPT2.cbp"
-"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common-ggml.h"
+"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\main-ctx.cpp"
+"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-quants.h"
 "C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-cpu-impl.h"
-"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-aarch64.c"
-"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\GPT2.layout"
-"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-backend.cpp"
 "C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\dr_wav.h"
-"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\main-ctx.cpp"
+"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common-ggml.h"
+"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\GPT2.cbp"
+"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-common.h"
+"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\quantize.cpp"
GPT2.depend CHANGED
@@ -222,12 +222,8 @@
 
 1730683892 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\main-alloc.cpp
 
-1730737838 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\main-ctx.cpp
-
-1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\quantize.cpp
+1731878749 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\main-ctx.cpp
 	"ggml.h"
-	"common.h"
-	"common-ggml.h"
 	<cassert>
 	<cmath>
 	<cstdio>
@@ -236,5 +232,10 @@
 	<map>
 	<string>
 	<vector>
+	<thread>
+	<ctime>
+	<random>
 	<regex>
 
+1730831644 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\quantize.cpp
+
GPT2.layout CHANGED
@@ -2,47 +2,42 @@
 <CodeBlocks_layout_file>
 	<FileVersion major="1" minor="0" />
 	<ActiveTarget name="Debug" />
-	<File name="ggml-impl.h" open="1" top="0" tabpos="10" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="ggml-quants.c" open="1" top="0" tabpos="11" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="6388" topLine="0" />
+			<Cursor1 position="2705" topLine="0" />
 		</Cursor>
 	</File>
-	<File name="common-ggml.cpp" open="1" top="0" tabpos="4" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="ggml-quants.h" open="1" top="0" tabpos="13" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="223" topLine="135" />
+			<Cursor1 position="0" topLine="128" />
 		</Cursor>
 	</File>
-	<File name="ggml-quants.c" open="1" top="0" tabpos="11" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="quantize.cpp" open="1" top="0" tabpos="15" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="2705" topLine="0" />
-		</Cursor>
-	</File>
-	<File name="ggml-aarch64.h" open="1" top="0" tabpos="8" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
-		<Cursor>
-			<Cursor1 position="1519" topLine="0" />
+			<Cursor1 position="4241" topLine="139" />
 		</Cursor>
 	</File>
-	<File name="common.cpp" open="1" top="0" tabpos="5" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="common.h" open="1" top="0" tabpos="2" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="152" topLine="0" />
+			<Cursor1 position="0" topLine="0" />
 		</Cursor>
 	</File>
-	<File name="ggml-quants.h" open="1" top="0" tabpos="13" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="ggml-aarch64.h" open="1" top="0" tabpos="8" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="0" topLine="128" />
+			<Cursor1 position="1519" topLine="0" />
 		</Cursor>
 	</File>
-	<File name="quantize.cpp" open="1" top="1" tabpos="15" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="common-ggml.cpp" open="1" top="0" tabpos="4" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="4241" topLine="139" />
+			<Cursor1 position="223" topLine="135" />
 		</Cursor>
 	</File>
-	<File name="ggml.h" open="1" top="0" tabpos="1" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="main-ctx.cpp" open="1" top="1" tabpos="12" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="8069" topLine="212" />
+			<Cursor1 position="26874" topLine="722" />
 		</Cursor>
 	</File>
-	<File name="ggml-common.h" open="1" top="0" tabpos="14" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="ggml-cpu-impl.h" open="1" top="0" tabpos="9" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
 			<Cursor1 position="0" topLine="0" />
 		</Cursor>
@@ -52,29 +47,34 @@
 			<Cursor1 position="522" topLine="0" />
 		</Cursor>
 	</File>
-	<File name="common.h" open="1" top="0" tabpos="2" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="ggml-common.h" open="1" top="0" tabpos="14" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
 			<Cursor1 position="0" topLine="0" />
 		</Cursor>
 	</File>
-	<File name="common-ggml.h" open="1" top="0" tabpos="3" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="ggml-aarch64.c" open="1" top="0" tabpos="7" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="141" topLine="0" />
+			<Cursor1 position="442" topLine="0" />
 		</Cursor>
 	</File>
-	<File name="ggml-cpu-impl.h" open="1" top="0" tabpos="9" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="common.cpp" open="1" top="0" tabpos="5" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="0" topLine="0" />
+			<Cursor1 position="152" topLine="0" />
 		</Cursor>
 	</File>
-	<File name="ggml-aarch64.c" open="1" top="0" tabpos="7" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="ggml.h" open="1" top="0" tabpos="1" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="442" topLine="0" />
+			<Cursor1 position="26839" topLine="760" />
+		</Cursor>
+	</File>
+	<File name="ggml-impl.h" open="1" top="0" tabpos="10" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="6388" topLine="0" />
 		</Cursor>
 	</File>
-	<File name="main-ctx.cpp" open="1" top="0" tabpos="12" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="common-ggml.h" open="1" top="0" tabpos="3" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="114" topLine="659" />
+			<Cursor1 position="141" topLine="0" />
 		</Cursor>
 	</File>
 </CodeBlocks_layout_file>
main-ctx.cpp CHANGED
@@ -1,841 +1,1215 @@
1
- //#include "ggml.h"
2
- //
3
- //#include "common.h"
4
- //#include "common-ggml.h"
5
- //
6
- //#include <cassert>
7
- //#include <cmath>
8
- //#include <cstdio>
9
- //#include <cstring>
10
- //#include <fstream>
11
- //#include <map>
12
- //#include <string>
13
- //#include <vector>
14
- //
15
- //#if defined(_MSC_VER)
16
- //#pragma warning(disable: 4244 4267) // possible loss of data
17
- //#endif
18
- //
19
- //// default hparams (GPT-2 117M)
20
- //struct gpt2_hparams {
21
- // int32_t n_vocab = 50257; // Vocabulary size remains the same
22
- // int32_t n_ctx = 1024; // Maximum context length (sequence length)
23
- // int32_t n_embd = 1024; // Embedding dimensionality
24
- // int32_t n_head = 16; // Number of attention heads
25
- // int32_t n_layer = 24; // Number of transformer layers
26
- // int32_t ftype = 1; // Set to 1 for FP16 precision (optional)
27
- // float eps = 1e-5f; // Small constant for numerical stability
28
- //};
29
- //
30
- //struct gpt2_layer {
31
- // // normalization
32
- // struct ggml_tensor * ln_1_g;
33
- // struct ggml_tensor * ln_1_b;
34
- //
35
- // struct ggml_tensor * ln_2_g;
36
- // struct ggml_tensor * ln_2_b;
37
- //
38
- // // attention
39
- // struct ggml_tensor * c_attn_attn_w;
40
- // struct ggml_tensor * c_attn_attn_b;
41
- //
42
- // struct ggml_tensor * c_attn_proj_w;
43
- // struct ggml_tensor * c_attn_proj_b;
44
- //
45
- // // mlp
46
- // struct ggml_tensor * c_mlp_fc_w;
47
- // struct ggml_tensor * c_mlp_fc_b;
48
- //
49
- // struct ggml_tensor * c_mlp_proj_w;
50
- // struct ggml_tensor * c_mlp_proj_b;
51
- //};
52
- //
53
- //struct gpt2_model {
54
- // gpt2_hparams hparams;
55
- //
56
- // // normalization
57
- // struct ggml_tensor * ln_f_g;
58
- // struct ggml_tensor * ln_f_b;
59
- //
60
- // struct ggml_tensor * wte; // position embedding
61
- // struct ggml_tensor * wpe; // token embedding
62
- // struct ggml_tensor * lm_head; // language model head
63
- //
64
- // std::vector<gpt2_layer> layers;
65
- //
66
- // // key + value memory
67
- // struct ggml_tensor * memory_k;
68
- // struct ggml_tensor * memory_v;
69
- //
70
- // //
71
- // struct ggml_context * ctx_w;
72
- // std::map<std::string, struct ggml_tensor *> tensors;
73
- //};
74
- //
75
- //// load the model's weights from a file
76
- //bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
77
- // printf("%s: loading model from '%s'\n", __func__, fname.c_str());
78
- //
79
- // auto fin = std::ifstream(fname, std::ios::binary);
80
- // if (!fin) {
81
- // fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
82
- // return false;
83
- // }
84
- //
85
- // // verify magic
86
- // {
87
- // uint32_t magic;
88
- // fin.read((char *) &magic, sizeof(magic));
89
- // if (magic != GGML_FILE_MAGIC) {
90
- // fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
91
- // return false;
92
- // }
93
- // }
94
- //
95
- // // load hparams
96
- // {
97
- // auto & hparams = model.hparams;
98
- //
99
- // fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
100
- // fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
101
- // fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
102
- // fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
103
- // fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
104
- // fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
105
- //
106
- // const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
107
- //
108
- // printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
109
- // printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
110
- // printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
111
- // printf("%s: n_head = %d\n", __func__, hparams.n_head);
112
- // printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
113
- // printf("%s: ftype = %d\n", __func__, hparams.ftype);
114
- // printf("%s: qntvr = %d\n", __func__, qntvr);
115
- //
116
- // hparams.ftype %= GGML_QNT_VERSION_FACTOR;
117
- // }
118
- //
119
- // // load vocab
120
- // {
121
- // int32_t n_vocab = 0;
122
- // fin.read((char *) &n_vocab, sizeof(n_vocab));
123
- //
124
- // if (n_vocab != model.hparams.n_vocab) {
125
- // fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
126
- // __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
127
- // return false;
128
- // }
129
- //
130
- // std::string word;
131
- // std::vector<char> buf(128);
132
- //
133
- // for (int i = 0; i < n_vocab; i++) {
134
- // uint32_t len;
135
- // fin.read((char *) &len, sizeof(len));
136
- //
137
- // buf.resize(len);
138
- // fin.read((char *) buf.data(), len);
139
- // word.assign(buf.data(), len);
140
- //
141
- // vocab.token_to_id[word] = i;
142
- // vocab.id_to_token[i] = word;
143
- // }
144
- // }
145
- //
146
- // // for the big tensors, we have the option to store the data in 16-bit floats or quantized
147
- // // in order to save memory and also to speed up the computation
148
- // ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
149
- // if (wtype == GGML_TYPE_COUNT) {
150
- // fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
151
- // __func__, fname.c_str(), model.hparams.ftype);
152
- // return false;
153
- // }
154
- //
155
- // auto & ctx = model.ctx_w;
156
- //
157
- // size_t ctx_size = 0;
158
- //
159
- // {
160
- // const auto & hparams = model.hparams;
161
- //
162
- // const int n_embd = hparams.n_embd;
163
- // const int n_layer = hparams.n_layer;
164
- // const int n_ctx = hparams.n_ctx;
165
- // const int n_vocab = hparams.n_vocab;
166
- //
167
- // ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
168
- // ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
169
- //
170
- // ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // wte
171
- // ctx_size += ggml_row_size(GGML_TYPE_F32, n_ctx*n_embd); // wpe
172
- // ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // lm_head
173
- //
174
- // ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
175
- // ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
176
- //
177
- // ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
178
- // ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
179
- //
180
- // ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
181
- // ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b
182
- //
183
- // ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
184
- // ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b
185
- //
186
- // ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
187
- // ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
188
- //
189
- // ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
190
- // ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_proj_b
191
- //
192
- // ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
193
- // ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
194
- //
195
- // ctx_size += (6 + 12*n_layer)*512; // object overhead
196
- //
197
- // printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
198
- // printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
199
- // }
200
- //
201
- // // create the ggml context
202
- // {
203
- // struct ggml_init_params params = {
204
- // /*.mem_size =*/ ctx_size,
205
- // /*.mem_buffer =*/ NULL,
206
- // /*.no_alloc =*/ false,
207
- // };
208
- //
209
- // model.ctx_w = ggml_init(params);
210
- // if (!model.ctx_w) {
211
- // fprintf(stderr, "%s: ggml_init() failed\n", __func__);
212
- // return false;
213
- // }
214
- // }
215
- //
216
- // // prepare memory for the weights
217
- // {
218
- // const auto & hparams = model.hparams;
219
- //
220
- // const int n_embd = hparams.n_embd;
221
- // const int n_layer = hparams.n_layer;
222
- // const int n_ctx = hparams.n_ctx;
223
- // const int n_vocab = hparams.n_vocab;
224
- //
225
- // model.layers.resize(n_layer);
226
- //
227
- // model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
228
- // model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
229
- //
230
- // model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
231
- // model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
232
- // model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
233
- //
234
- // // map by name
235
- // model.tensors["model/ln_f/g"] = model.ln_f_g;
236
- // model.tensors["model/ln_f/b"] = model.ln_f_b;
237
- //
238
- // model.tensors["model/wte"] = model.wte;
239
- // model.tensors["model/wpe"] = model.wpe;
240
- // model.tensors["model/lm_head"] = model.lm_head;
241
- //
242
- // for (int i = 0; i < n_layer; ++i) {
243
- // auto & layer = model.layers[i];
244
- //
245
- // layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
246
- // layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
247
- //
248
- // layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
249
- // layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
250
- //
251
- // layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
252
- // layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
253
- //
254
- // layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
255
- // layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
256
- //
257
- // layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
258
- // layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
259
- //
260
- // layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
261
- // layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
262
- //
263
- // // map by name
264
- // model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
265
- // model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
266
- //
267
- // model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
268
- // model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
269
- //
270
- // model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
271
- // model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
272
- //
273
- // model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
274
- // model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
275
- //
276
- // model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
277
- // model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
278
- //
279
- // model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
280
- // model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
281
- // }
282
- // }
283
- //
284
- // // key + value memory
285
- // {
286
- // const auto & hparams = model.hparams;
287
- //
288
- // const int n_embd = hparams.n_embd;
289
- // const int n_layer = hparams.n_layer;
290
- // const int n_ctx = hparams.n_ctx;
291
- //
292
- // const int n_mem = n_layer*n_ctx;
293
- // const int n_elements = n_embd*n_mem;
294
- //
295
- // model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
296
- // model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
297
- //
298
- // const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
299
- //
300
- // printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
301
- // }
302
- //
303
- // // load weights
304
- // {
305
- // size_t total_size = 0;
306
- //
307
- // bool has_lm_head = false;
308
- //
309
- // while (true) {
310
- // int32_t n_dims;
311
- // int32_t length;
312
- // int32_t ttype;
313
- //
314
- // fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
315
- // fin.read(reinterpret_cast<char *>(&length), sizeof(length));
316
- // fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
317
- //
318
- // if (fin.eof()) {
319
- // break;
320
- // }
321
- //
322
- // int32_t nelements = 1;
323
- // int32_t ne[2] = { 1, 1 };
324
- // for (int i = 0; i < n_dims; ++i) {
325
- // fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
326
- // nelements *= ne[i];
327
- // }
328
- //
329
- // std::string name(length, 0);
330
- // fin.read(&name[0], length);
331
- //
332
- // if (model.tensors.find(name) == model.tensors.end()) {
333
- // fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
334
- // return false;
335
- // }
336
- //
337
- // auto tensor = model.tensors[name];
338
- // if (ggml_nelements(tensor) != nelements) {
339
- // fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
340
- // return false;
341
- // }
342
- //
343
- // if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
344
- // fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
345
- // __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
346
- // return false;
347
- // }
348
- //
349
- // // for debugging
350
- // if (0) {
351
- // printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
352
- // }
353
- //
354
- // const size_t bpe = ggml_type_size(ggml_type(ttype));
355
- //
356
- // if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
357
- // fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
358
- // __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
359
- // return false;
360
- // }
361
- //
362
- // fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
363
- //
364
- // // GPT-2 models share the WTE tensor as the LM head
365
- // if (name == "model/wte" && has_lm_head == false) {
366
- // memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
367
- // }
368
- //
369
- // if (name == "model/lm_head") {
370
- // has_lm_head = true;
371
- // }
372
- //
373
- // total_size += ggml_nbytes(tensor);
374
- // }
375
- //
376
- // printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
377
- // }
378
- //
379
- // fin.close();
380
- //
381
- // return true;
382
- //}
383
- //
384
- //// evaluate the transformer
385
- ////
386
- //// - model: the model
387
- //// - n_threads: number of threads to use
388
- //// - n_past: the context size so far
389
- //// - embd_inp: the embeddings of the tokens in the context
390
- //// - embd_w: the predicted logits for the next token
391
- ////
392
- //bool gpt2_eval(
393
- // const gpt2_model & model,
394
- // const int n_threads,
395
- // const int n_past,
396
- // const std::vector<gpt_vocab::id> & embd_inp,
397
- // std::vector<float> & embd_w,
398
- // size_t & mem_per_token) {
399
- // const int N = embd_inp.size();
400
- //
401
- // const auto & hparams = model.hparams;
402
- //
403
- // const int n_embd = hparams.n_embd;
404
- // const int n_layer = hparams.n_layer;
405
- // const int n_ctx = hparams.n_ctx;
406
- // const int n_head = hparams.n_head;
407
- // const int n_vocab = hparams.n_vocab;
408
- //
409
- // static size_t buf_size = 256u*1024*1024;
410
- // static void * buf = malloc(buf_size);
411
- //
412
- // if (mem_per_token > 0 && mem_per_token*N > buf_size) {
413
- // const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
414
- // //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
415
- //
416
- // // reallocate
417
- // buf_size = buf_size_new;
418
- // buf = realloc(buf, buf_size);
419
- // if (buf == nullptr) {
420
- // fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
421
- // return false;
422
- // }
423
- // }
424
- //
425
- // struct ggml_init_params params = {
426
- // /*.mem_size =*/ buf_size,
427
- // /*.mem_buffer =*/ buf,
428
- // /*.no_alloc =*/ false,
429
- // };
430
- //
431
- // struct ggml_context * ctx0 = ggml_init(params);
432
- // struct ggml_cgraph * gf = ggml_new_graph(ctx0);
433
- //
434
- // struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
435
- // memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
436
- //
437
- // struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
438
- // for (int i = 0; i < N; ++i) {
439
- // ((int32_t *) position->data)[i] = n_past + i;
440
- // }
441
- //
442
- // // wte + wpe
443
- // struct ggml_tensor * inpL =
444
- // ggml_add(ctx0,
445
- // ggml_get_rows(ctx0, model.wte, embd),
446
- // ggml_get_rows(ctx0, model.wpe, position));
447
- //
448
- // for (int il = 0; il < n_layer; ++il) {
449
- // struct ggml_tensor * cur;
450
- //
451
- // // norm
452
- // {
453
- // // [ 768, N]
454
- // cur = ggml_norm(ctx0, inpL, hparams.eps);
455
- //
456
- // // cur = ln_1_g*cur + ln_1_b
457
- // // [ 768, N]
458
- // cur = ggml_add(ctx0,
459
- // ggml_mul(ctx0,
460
- // ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
461
- // cur),
462
- // ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
463
- // }
464
- //
465
- // // attn
466
- // // [2304, 768] - model.layers[il].c_attn_attn_w
467
- // // [2304, 1] - model.layers[il].c_attn_attn_b
468
- // // [ 768, N] - cur (in)
469
- // // [2304, N] - cur (out)
470
- // //
471
- // // cur = attn_w*cur + attn_b
472
- // // [2304, N]
473
- // {
474
- // cur = ggml_mul_mat(ctx0,
475
- // model.layers[il].c_attn_attn_w,
476
- // cur);
477
- //
478
- // cur = ggml_add(ctx0,
479
- // ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
480
- // cur);
481
- // }
482
- //
483
- // // self-attention
484
- // {
485
- // struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
486
- // struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
487
- // struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
488
- //
489
- // // store key and value to memory
490
- // if (N >= 1) {
491
- // struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
492
- // struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
493
- //
494
- // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
495
- // ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
496
- // }
497
- //
498
- // // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
499
- // // [64, N, 12]
500
- // struct ggml_tensor * Q =
501
- // ggml_permute(ctx0,
502
- // ggml_cpy(ctx0,
503
- // Qcur,
504
- // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
505
- // 0, 2, 1, 3);
506
- //
507
- // // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
508
- // // [64, n_past + N, 12]
509
- // struct ggml_tensor * K =
510
- // ggml_permute(ctx0,
511
- // ggml_reshape_3d(ctx0,
512
- // ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
513
- // n_embd/n_head, n_head, n_past + N),
514
- // 0, 2, 1, 3);
515
- //
516
- // // GG: flash attention
517
- // //struct ggml_tensor * V =
518
- // // ggml_cpy(ctx0,
519
- // // ggml_permute(ctx0,
520
- // // ggml_reshape_3d(ctx0,
521
- // // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
522
- // // n_embd/n_head, n_head, n_past + N),
523
- // // 1, 2, 0, 3),
524
- // // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
525
- //
526
- // //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
527
- //
528
- // // K * Q
529
- // // [n_past + N, N, 12]
530
- // struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
531
- //
532
- // // KQ_scaled = KQ / sqrt(n_embd/n_head)
533
- // // [n_past + N, N, 12]
534
- // struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));
535
- //
536
- // // KQ_masked = mask_past(KQ_scaled)
537
- // // [n_past + N, N, 12]
538
- // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
539
- //
540
- // // KQ = soft_max(KQ_masked)
541
- // // [n_past + N, N, 12]
542
- // struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
543
- //
544
- // // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
545
- // // [n_past + N, 64, 12]
546
- // struct ggml_tensor * V_trans =
547
- // ggml_cpy(ctx0,
548
- // ggml_permute(ctx0,
549
- // ggml_reshape_3d(ctx0,
550
- // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
551
- // n_embd/n_head, n_head, n_past + N),
552
- // 1, 2, 0, 3),
553
- // ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
554
- //
555
- // // KQV = transpose(V) * KQ_soft_max
556
- // // [64, N, 12]
557
- // struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
558
- //
559
- // // KQV_merged = KQV.permute(0, 2, 1, 3)
560
- // // [64, 12, N]
561
- // struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
562
- //
563
- // // cur = KQV_merged.contiguous().view(n_embd, N)
564
- // // [768, N]
565
- // cur = ggml_cpy(ctx0,
566
- // KQV_merged,
567
- // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
568
- // }
569
- //
570
- // // projection
571
- // // [ 768, 768] - model.layers[il].c_attn_proj_w
572
- // // [ 768, 1] - model.layers[il].c_attn_proj_b
573
- // // [ 768, N] - cur (in)
574
- // // [ 768, N] - cur (out)
575
- // //
576
- // // cur = proj_w*cur + proj_b
577
- // // [768, N]
578
- // {
579
- // cur = ggml_mul_mat(ctx0,
580
- // model.layers[il].c_attn_proj_w,
581
- // cur);
582
- //
583
- // cur = ggml_add(ctx0,
584
- // ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
585
- // cur);
586
- // }
587
- //
588
- // // add the input
589
- // cur = ggml_add(ctx0, cur, inpL);
590
- //
591
- // struct ggml_tensor * inpFF = cur;
592
- //
593
- // // feed-forward network
594
- // {
595
- // // norm
596
- // {
597
- // cur = ggml_norm(ctx0, inpFF, hparams.eps);
598
- //
599
- // // cur = ln_2_g*cur + ln_2_b
600
- // // [ 768, N]
601
- // cur = ggml_add(ctx0,
602
- // ggml_mul(ctx0,
603
- // ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
604
- // cur),
605
- // ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
606
- // }
607
- //
608
- // // fully connected
609
- // // [3072, 768] - model.layers[il].c_mlp_fc_w
610
- // // [3072, 1] - model.layers[il].c_mlp_fc_b
611
- // // [ 768, N] - cur (in)
612
- // // [3072, N] - cur (out)
613
- // //
614
- // // cur = fc_w*cur + fc_b
615
- // // [3072, N]
616
- // cur = ggml_mul_mat(ctx0,
617
- // model.layers[il].c_mlp_fc_w,
618
- // cur);
619
- //
620
- // cur = ggml_add(ctx0,
621
- // ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
622
- // cur);
623
- //
624
- // // GELU activation
625
- // // [3072, N]
626
- // cur = ggml_gelu(ctx0, cur);
627
- //
628
- // // projection
629
- // // [ 768, 3072] - model.layers[il].c_mlp_proj_w
630
- // // [ 768, 1] - model.layers[il].c_mlp_proj_b
631
- // // [3072, N] - cur (in)
632
- // // [ 768, N] - cur (out)
633
- // //
634
- // // cur = proj_w*cur + proj_b
635
- // // [768, N]
636
- // cur = ggml_mul_mat(ctx0,
637
- // model.layers[il].c_mlp_proj_w,
638
- // cur);
639
- //
640
- // cur = ggml_add(ctx0,
641
- // ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
642
- // cur);
643
- // }
644
- //
645
- // // input for next layer
646
- // inpL = ggml_add(ctx0, cur, inpFF);
647
- // }
648
- //
649
- // // norm
650
- // {
651
- // // [ 768, N]
652
- // inpL = ggml_norm(ctx0, inpL, hparams.eps);
653
- //
654
- // // inpL = ln_f_g*inpL + ln_f_b
655
- // // [ 768, N]
656
- // inpL = ggml_add(ctx0,
657
- // ggml_mul(ctx0,
658
- // ggml_repeat(ctx0, model.ln_f_g, inpL),
659
- // inpL),
660
- // ggml_repeat(ctx0, model.ln_f_b, inpL));
661
- // }
662
- //
663
- // // inpL = WTE * inpL
664
- // // [ 768, 50257] - model.lm_head
665
- // // [ 768, N] - inpL
666
- // inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
667
- //
668
- // // logits -> probs
669
- // //inpL = ggml_soft_max_inplace(ctx0, inpL);
670
- //
671
- // // run the computation
672
- // ggml_build_forward_expand(gf, inpL);
673
- // ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
674
- //
675
- // //if (n_past%100 == 0) {
676
- // // ggml_graph_print (&gf);
677
- // // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
678
- // //}
679
- //
680
- // //embd_w.resize(n_vocab*N);
681
- // //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
682
- //
683
- // // return result just for the last token
684
- // embd_w.resize(n_vocab);
685
- // memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
686
- //
687
- // if (mem_per_token == 0) {
688
- // mem_per_token = ggml_used_mem(ctx0)/N;
689
- // }
690
- // //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
691
- //
692
- // ggml_free(ctx0);
693
- //
694
- // return true;
695
- //}
696
- //
697
- //int main(int argc, char ** argv) {
698
- // ggml_time_init();
699
- //
700
- // const int64_t t_main_start_us = ggml_time_us();
701
- //
702
- // gpt_params params;
703
- // params.model = "ggml-model-gpt-2-774M.bin";
704
- //
705
- // if (gpt_params_parse(argc, argv, params) == false) {
706
- // return 1;
707
- // }
708
- //
709
- // if (params.seed < 0) {
710
- // params.seed = time(NULL);
711
- // }
712
- //
713
- // printf("%s: seed = %d\n", __func__, params.seed);
714
- //
715
- // std::mt19937 rng(params.seed);
716
- // if (params.prompt.empty()) {
717
- // params.prompt = gpt_random_prompt(rng);
718
- // }
719
- //
720
- // int64_t t_load_us = 0;
721
- //
722
- // gpt_vocab vocab;
723
- // gpt2_model model;
724
- //
725
- // // load the model
726
- // {
727
- // const int64_t t_start_us = ggml_time_us();
728
- //
729
- // if (!gpt2_model_load(params.model, model, vocab)) {
730
- // fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
731
- // return 1;
732
- // }
733
- //
734
- // t_load_us = ggml_time_us() - t_start_us;
735
- //
736
- // test_gpt_tokenizer(vocab, params.token_test);
737
- // }
738
  //
739
- // while(true) {
740
- // int n_past = 0;
 
 
 
741
  //
742
- // int64_t t_sample_us = 0;
743
- // int64_t t_predict_us = 0;
744
- //
745
- // std::vector<float> logits;
746
- //
747
- // // tokenize the prompt
748
- // std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
749
- //
750
- // params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
751
- //
752
- // printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
753
- // printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
754
- // for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
755
- // printf("%d ", embd_inp[i]);
756
- // }
757
- // printf("\n\n");
758
- //
759
- // // submit the input prompt token-by-token
760
- // // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
761
- // std::vector<gpt_vocab::id> embd;
762
- //
763
- // // determine the required inference memory per token:
764
- // size_t mem_per_token = 0;
765
- // gpt2_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
766
- //
767
- // for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
768
- // // predict
769
- // if (embd.size() > 0) {
770
- // const int64_t t_start_us = ggml_time_us();
771
- //
772
- // if (!gpt2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
773
- // printf("Failed to predict\n");
774
- // return 1;
775
- // }
776
- //
777
- // t_predict_us += ggml_time_us() - t_start_us;
778
- // }
779
- //
780
- // n_past += embd.size();
781
- // embd.clear();
782
- //
783
- // if (i >= embd_inp.size()) {
784
- // // sample next token
785
- // const int top_k = params.top_k;
786
- // const float top_p = params.top_p;
787
- // const float temp = params.temp;
788
- //
789
- // const int n_vocab = model.hparams.n_vocab;
790
- //
791
- // gpt_vocab::id id = 0;
792
- //
793
- // {
794
- // const int64_t t_start_sample_us = ggml_time_us();
795
- //
796
- // id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
797
- //
798
- // t_sample_us += ggml_time_us() - t_start_sample_us;
799
- // }
800
- //
801
- // // add it to the context
802
- // embd.push_back(id);
803
- // } else {
804
- // // if here, it means we are still processing the input prompt
805
- // for (size_t k = i; k < embd_inp.size(); k++) {
806
- // embd.push_back(embd_inp[k]);
807
- // if (int32_t(embd.size()) >= params.n_batch) {
808
- // break;
809
- // }
810
- // }
811
- // i += embd.size() - 1;
812
- // }
813
- //
814
- // // display text
815
- // for (auto id : embd) {
816
- // printf("%s", vocab.id_to_token[id].c_str());
817
- // }
818
- // fflush(stdout);
819
- //
820
- // // end of text token
821
- // if (embd.back() == 50256) {
822
- // // report timing
823
- // {
824
- // const int64_t t_main_end_us = ggml_time_us();
825
- //
826
- // printf("\n\n");
827
- // printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
828
- // printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
829
- // printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
830
- // printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
831
- // printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
832
- // }
833
- // break;
834
- // }
835
- // }
836
- // }
837
- //
838
- // ggml_free(model.ctx_w);
839
- //
840
- // return 0;
841
- //}
1
+ #include "ggml.h"
2
+
3
+ #include <cassert>
4
+ #include <cmath>
5
+ #include <cstdio>
6
+ #include <cstring>
7
+ #include <fstream>
8
+ #include <map>
9
+ #include <string>
10
+ #include <vector>
11
+ #include <thread>
12
+ #include <ctime>
13
+ #include <random>
14
+ #include <regex>
15
+
16
+ #if defined(_MSC_VER)
17
+ #pragma warning(disable: 4244 4267) // possible loss of data
18
+ #endif
19
+
20
+ // default hparams (GPT-2 117M)
21
+ struct gpt_hparams {
22
+ int32_t n_vocab = 50257; // Vocabulary size remains the same
23
+ //int32_t n_ctx = 1024; // Maximum context length (sequence length)
24
+ int32_t n_embd = 1024; // Embedding dimensionality
25
+ int32_t n_head = 16; // Number of attention heads
26
+ int32_t n_layer = 24; // Number of transformer layers
27
+ int32_t ftype = 1; // Set to 1 for FP16 precision (optional)
28
+ float eps = 1e-5f; // Small constant for numerical stability
29
+
30
+ int32_t seed = -1; // RNG seed
31
+ int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
32
+ int32_t n_predict = 200; // new tokens to predict
33
+ int32_t n_parallel = 1; // number of parallel streams
34
+ int32_t n_batch = 32; // batch size for prompt processing
35
+ int32_t n_ctx = 2048; // context size (this is the KV cache max size)
36
+ int32_t n_gpu_layers = 0; // number of layers to offload to the GPU
37
+
38
+ bool ignore_eos = false; // ignore EOS token when generating text
39
+
40
+ // sampling parameters
41
+ int32_t top_k = 40;
42
+ float top_p = 0.9f;
43
+ float temp = 0.9f;
44
+ int32_t repeat_last_n = 64;
45
+ float repeat_penalty = 1.00f;
46
+
47
+ std::string model = "ggml-model-gpt-2-774M.bin"; // model path
48
+ std::string prompt = "";
49
+ std::string token_test = "";
50
+
51
+ bool interactive = false;
52
+ int32_t interactive_port = -1;
53
+ };
54
+
55
+ struct gpt_vocab {
56
+ using id = int32_t;
57
+ using token = std::string;
58
+
59
+ std::map<token, id> token_to_id;
60
+ std::map<id, token> id_to_token;
61
+ std::vector<std::string> special_tokens;
62
+
63
+ void add_special_token(const std::string & token);
64
+ };
65
+
66
+ struct gpt_layer {
67
+ // normalization
68
+ struct ggml_tensor * ln_1_g;
69
+ struct ggml_tensor * ln_1_b;
70
+
71
+ struct ggml_tensor * ln_2_g;
72
+ struct ggml_tensor * ln_2_b;
73
+
74
+ // attention
75
+ struct ggml_tensor * c_attn_attn_w;
76
+ struct ggml_tensor * c_attn_attn_b;
77
+
78
+ struct ggml_tensor * c_attn_proj_w;
79
+ struct ggml_tensor * c_attn_proj_b;
80
+
81
+ // mlp
82
+ struct ggml_tensor * c_mlp_fc_w;
83
+ struct ggml_tensor * c_mlp_fc_b;
84
+
85
+ struct ggml_tensor * c_mlp_proj_w;
86
+ struct ggml_tensor * c_mlp_proj_b;
87
+ };
88
+
89
+ struct gpt_model {
90
+ gpt_hparams hparams;
91
+
92
+ // normalization
93
+ struct ggml_tensor * ln_f_g;
94
+ struct ggml_tensor * ln_f_b;
95
+
96
+ struct ggml_tensor * wte; // token embedding
97
+ struct ggml_tensor * wpe; // position embedding
98
+ struct ggml_tensor * lm_head; // language model head
99
+
100
+ std::vector<gpt_layer> layers;
101
+
102
+ // key + value memory
103
+ struct ggml_tensor * memory_k;
104
+ struct ggml_tensor * memory_v;
105
+
106
+ //
107
+ struct ggml_context * ctx_w;
108
+ std::map<std::string, struct ggml_tensor *> tensors;
109
+ };
110
+
111
+ // load the model's weights from a file
112
+ bool gpt_model_load(const std::string & fname, gpt_model & model, gpt_vocab & vocab) {
113
+ printf("%s: loading model from '%s'\n", __func__, fname.c_str());
114
+
115
+ auto fin = std::ifstream(fname, std::ios::binary);
116
+ if (!fin) {
117
+ fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
118
+ return false;
119
+ }
120
+
121
+ // verify magic
122
+ {
123
+ uint32_t magic;
124
+ fin.read((char *) &magic, sizeof(magic));
125
+ if (magic != GGML_FILE_MAGIC) {
126
+ fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
127
+ return false;
128
+ }
129
+ }
130
+
131
+ // load hparams
132
+ {
133
+ auto & hparams = model.hparams;
134
+
135
+ fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
136
+ fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
137
+ fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
138
+ fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
139
+ fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
140
+ fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
141
+
142
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
143
+
144
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
145
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
146
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
147
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
148
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
149
+ printf("%s: ftype = %d\n", __func__, hparams.ftype);
150
+ printf("%s: qntvr = %d\n", __func__, qntvr);
151
+
152
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
153
+ }
154
+
155
+ // load vocab
156
+ {
157
+ int32_t n_vocab = 0;
158
+ fin.read((char *) &n_vocab, sizeof(n_vocab));
159
+
160
+ if (n_vocab != model.hparams.n_vocab) {
161
+ fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
162
+ __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
163
+ return false;
164
+ }
165
+
166
+ std::string word;
167
+ std::vector<char> buf(128);
168
+
169
+ for (int i = 0; i < n_vocab; i++) {
170
+ uint32_t len;
171
+ fin.read((char *) &len, sizeof(len));
172
+
173
+ buf.resize(len);
174
+ fin.read((char *) buf.data(), len);
175
+ word.assign(buf.data(), len);
176
+
177
+ vocab.token_to_id[word] = i;
178
+ vocab.id_to_token[i] = word;
179
+ }
180
+ }
181
+
182
+ // for the big tensors, we have the option to store the data in 16-bit floats or quantized
183
+ // in order to save memory and also to speed up the computation
184
+ ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
185
+ if (wtype == GGML_TYPE_COUNT) {
186
+ fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
187
+ __func__, fname.c_str(), model.hparams.ftype);
188
+ return false;
189
+ }
190
+
191
+ auto & ctx = model.ctx_w;
192
+
193
+ size_t ctx_size = 0;
194
+
195
+ {
196
+ const auto & hparams = model.hparams;
197
+
198
+ const int n_embd = hparams.n_embd;
199
+ const int n_layer = hparams.n_layer;
200
+ const int n_ctx = hparams.n_ctx;
201
+ const int n_vocab = hparams.n_vocab;
202
+
203
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
204
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
205
+
206
+ ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // wte
207
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_ctx*n_embd); // wpe
208
+ ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // lm_head
209
+
210
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
211
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
212
+
213
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
214
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
215
+
216
+ ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
217
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b
218
+
219
+ ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
220
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b
221
+
222
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
223
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
224
+
225
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
226
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_proj_b
227
+
228
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
229
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
230
+
231
+ ctx_size += (6 + 12*n_layer)*512; // object overhead
232
+
233
+ printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
234
+ printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
235
+ }
236
+
237
+ // create the ggml context
238
+ {
239
+ struct ggml_init_params params = {
240
+ /*.mem_size =*/ ctx_size,
241
+ /*.mem_buffer =*/ NULL,
242
+ /*.no_alloc =*/ false,
243
+ };
244
+
245
+ model.ctx_w = ggml_init(params);
246
+ if (!model.ctx_w) {
247
+ fprintf(stderr, "%s: ggml_init() failed\n", __func__);
248
+ return false;
249
+ }
250
+ }
251
+
252
+ // prepare memory for the weights
253
+ {
254
+ const auto & hparams = model.hparams;
255
+
256
+ const int n_embd = hparams.n_embd;
257
+ const int n_layer = hparams.n_layer;
258
+ const int n_ctx = hparams.n_ctx;
259
+ const int n_vocab = hparams.n_vocab;
260
+
261
+ model.layers.resize(n_layer);
262
+
263
+ model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
264
+ model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
265
+
266
+ model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
267
+ model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
268
+ model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
269
+
270
+ // map by name
271
+ model.tensors["model/ln_f/g"] = model.ln_f_g;
272
+ model.tensors["model/ln_f/b"] = model.ln_f_b;
273
+
274
+ model.tensors["model/wte"] = model.wte;
275
+ model.tensors["model/wpe"] = model.wpe;
276
+ model.tensors["model/lm_head"] = model.lm_head;
277
+
278
+ for (int i = 0; i < n_layer; ++i) {
279
+ auto & layer = model.layers[i];
280
+
281
+ layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
282
+ layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
283
+
284
+ layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
285
+ layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
286
+
287
+ layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
288
+ layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
289
+
290
+ layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
291
+ layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
292
+
293
+ layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
294
+ layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
295
+
296
+ layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
297
+ layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
298
+
299
+ // map by name
300
+ model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
301
+ model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
302
+
303
+ model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
304
+ model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
305
+
306
+ model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
307
+ model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
308
+
309
+ model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
310
+ model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
311
+
312
+ model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
313
+ model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
314
+
315
+ model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
316
+ model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
317
+ }
318
+ }
319
+
320
+ // key + value memory
321
+ {
322
+ const auto & hparams = model.hparams;
323
+
324
+ const int n_embd = hparams.n_embd;
325
+ const int n_layer = hparams.n_layer;
326
+ const int n_ctx = hparams.n_ctx;
327
+
328
+ const int n_mem = n_layer*n_ctx;
329
+ const int n_elements = n_embd*n_mem;
330
+
331
+ model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
332
+ model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
333
+
334
+ const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
335
+
336
+ printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
337
+ }
338
+
339
+ // load weights
340
+ {
341
+ size_t total_size = 0;
342
+
343
+ bool has_lm_head = false;
344
+
345
+ while (true) {
346
+ int32_t n_dims;
347
+ int32_t length;
348
+ int32_t ttype;
349
+
350
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
351
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
352
+ fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
353
+
354
+ if (fin.eof()) {
355
+ break;
356
+ }
357
+
358
+ int32_t nelements = 1;
359
+ int32_t ne[2] = { 1, 1 };
360
+ for (int i = 0; i < n_dims; ++i) {
361
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
362
+ nelements *= ne[i];
363
+ }
364
+
365
+ std::string name(length, 0);
366
+ fin.read(&name[0], length);
367
+
368
+ if (model.tensors.find(name) == model.tensors.end()) {
369
+ fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
370
+ return false;
371
+ }
372
+
373
+ auto tensor = model.tensors[name];
374
+ if (ggml_nelements(tensor) != nelements) {
375
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
376
+ return false;
377
+ }
378
+
379
+ if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
380
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
381
+ __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
382
+ return false;
383
+ }
384
+
385
+ // for debugging
386
+ if (0) {
387
+ printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
388
+ }
389
+
390
+ const size_t bpe = ggml_type_size(ggml_type(ttype));
391
+
392
+ if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
393
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
394
+ __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
395
+ return false;
396
+ }
397
+
398
+ fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
399
+
400
+ // GPT-2 models share the WTE tensor as the LM head
401
+ if (name == "model/wte" && has_lm_head == false) {
402
+ memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
403
+ }
404
+
405
+ if (name == "model/lm_head") {
406
+ has_lm_head = true;
407
+ }
408
+
409
+ total_size += ggml_nbytes(tensor);
410
+ }
411
+
412
+ printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
413
+ }
414
+
415
+ fin.close();
416
+
417
+ return true;
418
+ }
419
+
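The "load weights" loop above consumes one tensor record at a time: n_dims, the name length and the ggml type id as int32 values, followed by ne[0..n_dims-1], the name bytes, and the raw tensor data. A sketch of a matching writer, where write_tensor_record is a hypothetical helper and not part of this file:

#include <cstddef>
#include <cstdint>
#include <fstream>
#include <string>
#include <vector>

// Writes one tensor record in the layout expected by the reader above.
static void write_tensor_record(std::ofstream & fout, const std::string & name,
                                int32_t ttype, const std::vector<int32_t> & ne,
                                const void * data, size_t nbytes) {
    const int32_t n_dims = (int32_t) ne.size();
    const int32_t length = (int32_t) name.size();
    fout.write((const char *) &n_dims, sizeof(n_dims));
    fout.write((const char *) &length, sizeof(length));
    fout.write((const char *) &ttype,  sizeof(ttype));
    for (const int32_t d : ne) {
        fout.write((const char *) &d, sizeof(d));
    }
    fout.write(name.data(), length);
    fout.write((const char *) data, nbytes);
}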
420
+ void gpt_split_words(std::string str, std::vector<std::string>& words) {
421
+ const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
422
+ const std::regex re(pattern);
423
+ std::smatch m;
424
+
425
+ while (std::regex_search(str, m, re)) {
426
+ for (auto x : m) {
427
+ words.push_back(x);
428
+ }
429
+ str = m.suffix();
430
+ }
431
+ }
432
+
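The pattern above mirrors the GPT-2 pre-tokenization regex: contraction suffixes, optional-space-prefixed letter or digit runs, punctuation runs, and whitespace. As a rough illustration of the expected split (a sketch, not an exhaustive trace of the pattern):

// Illustrative call, relying on gpt_split_words defined above:
std::vector<std::string> words;
gpt_split_words("Hello, world!", words);
// words is expected to contain: "Hello", ",", " world", "!"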
433
+ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
434
+ std::vector<std::string> words;
435
+
436
+ // first split the text into words
437
+ {
438
+ std::string str = text;
439
+
440
+ // Generate the subpattern from the special_tokens vector if it's not empty
441
+ if (!vocab.special_tokens.empty()) {
442
+ const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
443
+ std::string special_tokens_subpattern;
444
+ for (const auto & token : vocab.special_tokens) {
445
+ if (!special_tokens_subpattern.empty()) {
446
+ special_tokens_subpattern += "|";
447
+ }
448
+ special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
449
+ }
450
+
451
+ std::regex re(special_tokens_subpattern);
452
+ std::smatch m;
453
+ // Split the text by special tokens.
454
+ while (std::regex_search(str, m, re)) {
455
+ // Split the substrings in-between special tokens into words.
456
+ gpt_split_words(m.prefix(), words);
457
+ // Add matched special tokens as words.
458
+ for (auto x : m) {
459
+ words.push_back(x);
460
+ }
461
+ str = m.suffix();
462
+ }
463
+ // Remaining text without special tokens will be handled below.
464
+ }
465
+
466
+ gpt_split_words(str, words);
467
+ }
468
+
469
+ // find the longest token that forms each word in words:
470
+ std::vector<gpt_vocab::id> tokens;
471
+ for (const auto & word : words) {
472
+ for (int i = 0; i < (int) word.size(); ){
473
+ for (int j = word.size() - 1; j >= i; j--){
474
+ auto cand = word.substr(i, j-i+1);
475
+ auto it = vocab.token_to_id.find(cand);
476
+ if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
477
+ tokens.push_back(it->second);
478
+ i = j + 1;
479
+ break;
480
+ }
481
+ else if (j == i){ // word.substr(i, 1) has no matching
482
+ fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
483
+ i++;
484
+ }
485
+ }
486
+ }
487
+ }
488
+
489
+ return tokens;
490
+ }
491
+
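Each word is then resolved by greedy longest-prefix matching against the vocabulary. A toy, self-contained illustration of that inner loop, using a hypothetical mini-vocabulary rather than real GPT-2 ids:

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    // hypothetical mini-vocabulary (real GPT-2 ids differ)
    const std::map<std::string, int> token_to_id = {
        {"un", 0}, {"related", 1}, {"rel", 2}, {"ated", 3},
    };

    const std::string word = "unrelated";
    std::vector<int> ids;

    // greedy longest-prefix matching, same shape as the loop in gpt_tokenize
    for (int i = 0; i < (int) word.size(); ) {
        for (int j = (int) word.size() - 1; j >= i; j--) {
            const auto it = token_to_id.find(word.substr(i, j - i + 1));
            if (it != token_to_id.end()) {
                ids.push_back(it->second);
                i = j + 1;
                break;
            }
            if (j == i) {
                i++; // no entry even for a single character: skip it
            }
        }
    }

    for (const int id : ids) {
        printf("%d ", id); // prints: 0 1  ("un" + "related")
    }
    printf("\n");
    return 0;
}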
492
+ static std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
493
+ std::vector<gpt_vocab::id> output;
494
+ std::stringstream ss(input);
495
+ std::string token;
496
+
497
+ while (std::getline(ss, token, delimiter)) {
498
+ output.push_back(std::stoi(token));
499
+ }
500
+
501
+ return output;
502
+ }
503
+
504
+ static std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
505
+ if (fpath_test.empty()){
506
+ fprintf(stderr, "%s : No test file found.\n", __func__);
507
+ return std::map<std::string, std::vector<gpt_vocab::id>>();
508
+ }
509
+
510
+ std::map<std::string, std::vector<gpt_vocab::id>> tests;
511
+
512
+ auto fin = std::ifstream(fpath_test, std::ios_base::in);
513
+ const char * delimiter = " => ";
514
+ const char del_tok = ',';
515
+ std::string line;
516
+ while (std::getline(fin, line)) {
517
+ size_t delimiterPos = line.find(delimiter);
518
+ if (delimiterPos != std::string::npos) {
519
+ std::string text = line.substr(0, delimiterPos);
520
+ std::string s_tokens = line.substr(delimiterPos + std::strlen(delimiter));
521
+ tests[text] = parse_tokens_from_string(s_tokens, del_tok);
522
+ }
523
+ }
524
+ return tests;
525
+ }
526
+
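For reference, the token-test file parsed above is plain text with one case per line: the input text, the literal delimiter " => ", then the expected ids separated by commas; it is passed to the program via -tt / --token_test. A hypothetical example file, with illustrative rather than verified GPT-2 ids:

Hello world => 15496,995
Once upon a time => 7454,2402,257,640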
527
+ void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test){
528
+ std::map<std::string, std::vector<gpt_vocab::id>> tests = extract_tests_from_file(fpath_test);
529
+
530
+ size_t n_fails = 0;
531
+
532
+ for (const auto & test : tests) {
533
+ std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, test.first);
534
+
535
+ if (tokens != test.second){
536
+ n_fails++;
537
+
538
+ // print out failure cases
539
+ fprintf(stderr, "%s : failed test: '%s'\n", __func__, test.first.c_str());
540
+ fprintf(stderr, "%s : tokens in hf: ", __func__);
541
+ for (const auto & t : test.second) {
542
+ fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
543
+ }
544
+ fprintf(stderr, "\n");
545
+ fprintf(stderr, "%s : tokens in ggml: ", __func__);
546
+ for (const auto & t : tokens) {
547
+ fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
548
+ }
549
+ fprintf(stderr, "\n");
550
+ }
551
+ }
552
+
553
+ fprintf(stderr, "%s : %zu tests failed out of %zu tests.\n", __func__, n_fails, tests.size());
554
+ }
555
+
556
+ gpt_vocab::id gpt_sample_top_k_top_p(
557
+ const gpt_vocab & vocab,
558
+ const float * logits,
559
+ int top_k,
560
+ double top_p,
561
+ double temp,
562
+ std::mt19937 & rng) {
563
+ int n_logits = vocab.id_to_token.size();
564
+
565
+ std::vector<std::pair<double, gpt_vocab::id>> logits_id;
566
+ logits_id.reserve(n_logits);
567
+
568
+ {
569
+ const double scale = 1.0/temp;
570
+ for (int i = 0; i < n_logits; ++i) {
571
+ logits_id.push_back(std::make_pair(logits[i]*scale, i));
572
+ }
573
+ }
574
+
575
+ // find the top K tokens
576
+ std::partial_sort(
577
+ logits_id.begin(),
578
+ logits_id.begin() + top_k, logits_id.end(),
579
+ [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
580
+ return a.first > b.first;
581
+ });
582
+
583
+ logits_id.resize(top_k);
584
+
585
+ double maxl = -INFINITY;
586
+ for (const auto & kv : logits_id) {
587
+ maxl = std::max(maxl, kv.first);
588
+ }
589
+
590
+ // compute probs for the top K tokens
591
+ std::vector<double> probs;
592
+ probs.reserve(logits_id.size());
593
+
594
+ double sum = 0.0;
595
+ for (const auto & kv : logits_id) {
596
+ double p = exp(kv.first - maxl);
597
+ probs.push_back(p);
598
+ sum += p;
599
+ }
600
+
601
+ // normalize the probs
602
+ for (auto & p : probs) {
603
+ p /= sum;
604
+ }
605
+
606
+ if (top_p < 1.0f) {
607
+ double cumsum = 0.0f;
608
+ for (int i = 0; i < top_k; i++) {
609
+ cumsum += probs[i];
610
+ if (cumsum >= top_p) {
611
+ top_k = i + 1;
612
+ probs.resize(top_k);
613
+ logits_id.resize(top_k);
614
+ break;
615
+ }
616
+ }
617
+
618
+ cumsum = 1.0/cumsum;
619
+ for (int i = 0; i < (int) probs.size(); i++) {
620
+ probs[i] *= cumsum;
621
+ }
622
+ }
623
+
624
+ //printf("\n");
625
+ //for (int i = 0; i < (int) probs.size(); i++) {
626
+ // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
627
+ //}
628
+ //exit(0);
629
+
630
+ std::discrete_distribution<> dist(probs.begin(), probs.end());
631
+ int idx = dist(rng);
632
+
633
+ return logits_id[idx].second;
634
+ }
635
+
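The top_p branch above keeps the smallest prefix of the (already sorted) top-k candidates whose cumulative probability reaches top_p, then renormalizes what is left. A minimal numeric sketch of that truncation step:

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    std::vector<double> probs = {0.5, 0.3, 0.1, 0.1}; // hypothetical sorted top-k probs
    const double top_p = 0.8;

    double cumsum = 0.0;
    std::size_t keep = probs.size();
    for (std::size_t i = 0; i < probs.size(); i++) {
        cumsum += probs[i];
        if (cumsum >= top_p) { keep = i + 1; break; }
    }

    probs.resize(keep);                  // keep {0.5, 0.3}
    for (auto & p : probs) p /= cumsum;  // renormalize to {0.625, 0.375}

    for (const double p : probs) printf("%.3f ", p);
    printf("\n");
    return 0;
}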
636
+ // evaluate the transformer
637
  //
638
+ // - model: the model
639
+ // - n_threads: number of threads to use
640
+ // - n_past: the context size so far
641
+ // - embd_inp: the embeddings of the tokens in the context
642
+ // - embd_w: the predicted logits for the next token
643
  //
644
+ bool gpt_eval(
645
+ const gpt_model & model,
646
+ const int n_threads,
647
+ const int n_past,
648
+ const std::vector<gpt_vocab::id> & embd_inp,
649
+ std::vector<float> & embd_w,
650
+ size_t & mem_per_token) {
651
+ const int N = embd_inp.size();
652
+
653
+ const auto & hparams = model.hparams;
654
+
655
+ const int n_embd = hparams.n_embd;
656
+ const int n_layer = hparams.n_layer;
657
+ const int n_ctx = hparams.n_ctx;
658
+ const int n_head = hparams.n_head;
659
+ const int n_vocab = hparams.n_vocab;
660
+
661
+ static size_t buf_size = 256u*1024*1024;
662
+ static void * buf = malloc(buf_size);
663
+
664
+ if (mem_per_token > 0 && mem_per_token*N > buf_size) {
665
+ const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
666
+ //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
667
+
668
+ // reallocate
669
+ buf_size = buf_size_new;
670
+ buf = realloc(buf, buf_size);
671
+ if (buf == nullptr) {
672
+ fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
673
+ return false;
674
+ }
675
+ }
676
+
677
+ struct ggml_init_params params = {
678
+ /*.mem_size =*/ buf_size,
679
+ /*.mem_buffer =*/ buf,
680
+ /*.no_alloc =*/ false,
681
+ };
682
+
683
+ struct ggml_context * ctx0 = ggml_init(params);
684
+ struct ggml_cgraph * gf = ggml_new_graph(ctx0);
685
+
686
+ struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
687
+ memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
688
+
689
+ struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
690
+ for (int i = 0; i < N; ++i) {
691
+ ((int32_t *) position->data)[i] = n_past + i;
692
+ }
693
+
694
+ // wte + wpe
695
+ struct ggml_tensor * inpL =
696
+ ggml_add(ctx0,
697
+ ggml_get_rows(ctx0, model.wte, embd),
698
+ ggml_get_rows(ctx0, model.wpe, position));
699
+
700
+ for (int il = 0; il < n_layer; ++il) {
701
+ struct ggml_tensor * cur;
702
+
703
+ // norm
704
+ {
705
+ // [ 768, N]
706
+ cur = ggml_norm(ctx0, inpL, hparams.eps);
707
+
708
+ // cur = ln_1_g*cur + ln_1_b
709
+ // [ 768, N]
710
+ cur = ggml_add(ctx0,
711
+ ggml_mul(ctx0,
712
+ ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
713
+ cur),
714
+ ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
715
+ }
716
+
717
+ // attn
718
+ // [2304, 768] - model.layers[il].c_attn_attn_w
719
+ // [2304, 1] - model.layers[il].c_attn_attn_b
720
+ // [ 768, N] - cur (in)
721
+ // [2304, N] - cur (out)
722
+ //
723
+ // cur = attn_w*cur + attn_b
724
+ // [2304, N]
725
+ {
726
+ cur = ggml_mul_mat(ctx0,
727
+ model.layers[il].c_attn_attn_w,
728
+ cur);
729
+
730
+ cur = ggml_add(ctx0,
731
+ ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
732
+ cur);
733
+ }
734
+
735
+ // self-attention
736
+ {
737
+ struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
738
+ struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
739
+ struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
740
+
741
+ // store key and value to memory
742
+ if (N >= 1) {
743
+ struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
744
+ struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
745
+
746
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
747
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
748
+ }
749
+
750
+ // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
751
+ // [64, N, 12]
752
+ struct ggml_tensor * Q =
753
+ ggml_permute(ctx0,
754
+ ggml_cpy(ctx0,
755
+ Qcur,
756
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
757
+ 0, 2, 1, 3);
758
+
759
+ // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
760
+ // [64, n_past + N, 12]
761
+ struct ggml_tensor * K =
762
+ ggml_permute(ctx0,
763
+ ggml_reshape_3d(ctx0,
764
+ ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
765
+ n_embd/n_head, n_head, n_past + N),
766
+ 0, 2, 1, 3);
767
+
768
+ // GG: flash attention
769
+ //struct ggml_tensor * V =
770
+ // ggml_cpy(ctx0,
771
+ // ggml_permute(ctx0,
772
+ // ggml_reshape_3d(ctx0,
773
+ // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
774
+ // n_embd/n_head, n_head, n_past + N),
775
+ // 1, 2, 0, 3),
776
+ // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
777
+
778
+ //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
779
+
780
+ // K * Q
781
+ // [n_past + N, N, 12]
782
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
783
+
784
+ // KQ_scaled = KQ / sqrt(n_embd/n_head)
785
+ // [n_past + N, N, 12]
786
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));
787
+
788
+ // KQ_masked = mask_past(KQ_scaled)
789
+ // [n_past + N, N, 12]
790
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
791
+
792
+ // KQ = soft_max(KQ_masked)
793
+ // [n_past + N, N, 12]
794
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
795
+
796
+ // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
797
+ // [n_past + N, 64, 12]
798
+ struct ggml_tensor * V_trans =
799
+ ggml_cpy(ctx0,
800
+ ggml_permute(ctx0,
801
+ ggml_reshape_3d(ctx0,
802
+ ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
803
+ n_embd/n_head, n_head, n_past + N),
804
+ 1, 2, 0, 3),
805
+ ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
806
+
807
+ // KQV = transpose(V) * KQ_soft_max
808
+ // [64, N, 12]
809
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
810
+
811
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
812
+ // [64, 12, N]
813
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
814
+
815
+ // cur = KQV_merged.contiguous().view(n_embd, N)
816
+ // [768, N]
817
+ cur = ggml_cpy(ctx0,
818
+ KQV_merged,
819
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
820
+ }
821
+
822
+ // projection
823
+ // [ 768, 768] - model.layers[il].c_attn_proj_w
824
+ // [ 768, 1] - model.layers[il].c_attn_proj_b
825
+ // [ 768, N] - cur (in)
826
+ // [ 768, N] - cur (out)
827
+ //
828
+ // cur = proj_w*cur + proj_b
829
+ // [768, N]
830
+ {
831
+ cur = ggml_mul_mat(ctx0,
832
+ model.layers[il].c_attn_proj_w,
833
+ cur);
834
+
835
+ cur = ggml_add(ctx0,
836
+ ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
837
+ cur);
838
+ }
839
+
840
+ // add the input
841
+ cur = ggml_add(ctx0, cur, inpL);
842
+
843
+ struct ggml_tensor * inpFF = cur;
844
+
845
+ // feed-forward network
846
+ {
847
+ // norm
848
+ {
849
+ cur = ggml_norm(ctx0, inpFF, hparams.eps);
850
+
851
+ // cur = ln_2_g*cur + ln_2_b
852
+ // [ 768, N]
853
+ cur = ggml_add(ctx0,
854
+ ggml_mul(ctx0,
855
+ ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
856
+ cur),
857
+ ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
858
+ }
859
+
860
+ // fully connected
861
+ // [3072, 768] - model.layers[il].c_mlp_fc_w
862
+ // [3072, 1] - model.layers[il].c_mlp_fc_b
863
+ // [ 768, N] - cur (in)
864
+ // [3072, N] - cur (out)
865
+ //
866
+ // cur = fc_w*cur + fc_b
867
+ // [3072, N]
868
+ cur = ggml_mul_mat(ctx0,
869
+ model.layers[il].c_mlp_fc_w,
870
+ cur);
871
+
872
+ cur = ggml_add(ctx0,
873
+ ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
874
+ cur);
875
+
876
+ // GELU activation
877
+ // [3072, N]
878
+ cur = ggml_gelu(ctx0, cur);
879
+
880
+ // projection
881
+ // [ 768, 3072] - model.layers[il].c_mlp_proj_w
882
+ // [ 768, 1] - model.layers[il].c_mlp_proj_b
883
+ // [3072, N] - cur (in)
884
+ // [ 768, N] - cur (out)
885
+ //
886
+ // cur = proj_w*cur + proj_b
887
+ // [768, N]
888
+ cur = ggml_mul_mat(ctx0,
889
+ model.layers[il].c_mlp_proj_w,
890
+ cur);
891
+
892
+ cur = ggml_add(ctx0,
893
+ ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
894
+ cur);
895
+ }
896
+
897
+ // input for next layer
898
+ inpL = ggml_add(ctx0, cur, inpFF);
899
+ }
900
+
901
+ // norm
902
+ {
903
+ // [ 768, N]
904
+ inpL = ggml_norm(ctx0, inpL, hparams.eps);
905
+
906
+ // inpL = ln_f_g*inpL + ln_f_b
907
+ // [ 768, N]
908
+ inpL = ggml_add(ctx0,
909
+ ggml_mul(ctx0,
910
+ ggml_repeat(ctx0, model.ln_f_g, inpL),
911
+ inpL),
912
+ ggml_repeat(ctx0, model.ln_f_b, inpL));
913
+ }
914
+
915
+ // inpL = WTE * inpL
916
+ // [ 768, 50257] - model.lm_head
917
+ // [ 768, N] - inpL
918
+ inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
919
+
920
+ // logits -> probs
921
+ //inpL = ggml_soft_max_inplace(ctx0, inpL);
922
+
923
+ // run the computation
924
+ ggml_build_forward_expand(gf, inpL);
925
+ ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
926
+
927
+ //if (n_past%100 == 0) {
928
+ // ggml_graph_print (&gf);
929
+ // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
930
+ //}
931
+
932
+ //embd_w.resize(n_vocab*N);
933
+ //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
934
+
935
+ // return result just for the last token
936
+ embd_w.resize(n_vocab);
937
+ memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
938
+
939
+ if (mem_per_token == 0) {
940
+ mem_per_token = ggml_used_mem(ctx0)/N;
941
+ }
942
+ //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
943
+
944
+ ggml_free(ctx0);
945
+
946
+ return true;
947
+ }
948
+
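In practice gpt_eval is called once with a small dummy batch to estimate mem_per_token, and then once per batch with the running n_past; only the logits for the last position are returned. The main loop below follows this pattern; a condensed sketch of the calling sequence (assumes a loaded gpt_model named model and a tokenized prompt embd_inp):

size_t mem_per_token = 0;
std::vector<float> logits;

// warm-up: measure the memory needed per token on a dummy 4-token batch
gpt_eval(model, /*n_threads=*/8, /*n_past=*/0, {0, 1, 2, 3}, logits, mem_per_token);

int n_past = 0;
std::vector<gpt_vocab::id> batch = embd_inp; // e.g. the whole tokenized prompt
if (gpt_eval(model, 8, n_past, batch, logits, mem_per_token)) {
    n_past += batch.size();
    // logits now holds hparams.n_vocab values for the last token of the batch
}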
949
+ void gpt_print_usage(int argc, char ** argv, const gpt_hparams & params) {
950
+ fprintf(stderr, "usage: %s [options]\n", argv[0]);
951
+ fprintf(stderr, "\n");
952
+ fprintf(stderr, "options:\n");
953
+ fprintf(stderr, " -h, --help show this help message and exit\n");
954
+ fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
955
+ fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
956
+ fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
957
+ fprintf(stderr, " prompt to start generation with (default: random)\n");
958
+ fprintf(stderr, " -f FNAME, --file FNAME\n");
959
+ fprintf(stderr, " load prompt from a file\n");
960
+ fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
961
+ fprintf(stderr, " test tokenization\n");
962
+ fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
963
+ fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
964
+ fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
965
+ fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
966
+ fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
967
+ fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
968
+ fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
969
+ fprintf(stderr, " -c N, --context N context / KV cache size (default: %d)\n", params.n_ctx);
970
+ fprintf(stderr, " --ignore-eos ignore EOS token during generation\n");
971
+ fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
972
+ fprintf(stderr, " -m FNAME, --model FNAME\n");
973
+ fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
974
+ fprintf(stderr, "\n");
975
+ }
976
+
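A typical invocation combining the options listed above might look like the following; the binary name and model path are illustrative:

./gpt-2 -m models/gpt-2-117M/ggml-model.bin -p "Once upon a time" -n 64 -t 8 --top_k 40 --top_p 0.9 --temp 0.8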
977
+ // Function to check if the next argument exists
978
+ static std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_hparams& params) {
979
+ if (i + 1 < argc && argv[i + 1][0] != '-') {
980
+ return argv[++i];
981
+ } else {
982
+ fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
983
+ gpt_print_usage(argc, argv, params);
984
+ exit(0);
985
+ }
986
+ }
987
+
988
+ bool gpt_params_parse(int argc, char ** argv, gpt_hparams & params) {
989
+ for (int i = 1; i < argc; i++) {
990
+ std::string arg = argv[i];
991
+
992
+ if (arg == "-s" || arg == "--seed") {
993
+ params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
994
+ } else if (arg == "-t" || arg == "--threads") {
995
+ params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
996
+ } else if (arg == "-p" || arg == "--prompt") {
997
+ params.prompt = get_next_arg(i, argc, argv, arg, params);
998
+ } else if (arg == "-n" || arg == "--n_predict") {
999
+ params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params));
1000
+ } else if (arg == "-np" || arg == "--n_parallel") {
1001
+ params.n_parallel = std::stoi(get_next_arg(i, argc, argv, arg, params));
1002
+ } else if (arg == "--top_k") {
1003
+ params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params));
1004
+ } else if (arg == "--top_p") {
1005
+ params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
1006
+ } else if (arg == "--temp") {
1007
+ params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
1008
+ } else if (arg == "--repeat-last-n") {
1009
+ params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
1010
+ } else if (arg == "--repeat-penalty") {
1011
+ params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params));
1012
+ } else if (arg == "-b" || arg == "--batch_size") {
1013
+ params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params));
1014
+ } else if (arg == "-c" || arg == "--context") {
1015
+ params.n_ctx= std::stoi(get_next_arg(i, argc, argv, arg, params));
1016
+ } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
1017
+ params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
1018
+ } else if (arg == "--ignore-eos") {
1019
+ params.ignore_eos = true;
1020
+ } else if (arg == "-m" || arg == "--model") {
1021
+ params.model = get_next_arg(i, argc, argv, arg, params);
1022
+ } else if (arg == "-i" || arg == "--interactive") {
1023
+ params.interactive = true;
1024
+ } else if (arg == "-ip" || arg == "--interactive-port") {
1025
+ params.interactive = true;
1026
+ params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params));
1027
+ } else if (arg == "-h" || arg == "--help") {
1028
+ gpt_print_usage(argc, argv, params);
1029
+ exit(0);
1030
+ } else if (arg == "-f" || arg == "--file") {
1031
+ get_next_arg(i, argc, argv, arg, params);
1032
+ std::ifstream file(argv[i]);
1033
+ if (!file) {
1034
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
1035
+ break;
1036
+ }
1037
+ std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
1038
+ if (!params.prompt.empty() && params.prompt.back() == '\n') {
1039
+ params.prompt.pop_back();
1040
+ }
1041
+ } else if (arg == "-tt" || arg == "--token_test") {
1042
+ params.token_test = get_next_arg(i, argc, argv, arg, params);
1043
+ }
1044
+ else {
1045
+ fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
1046
+ gpt_print_usage(argc, argv, params);
1047
+ exit(0);
1048
+ }
1049
+ }
1050
+
1051
+ return true;
1052
+ }
1053
+
1054
+ std::string gpt_random_prompt(std::mt19937 & rng) {
1055
+ const int r = rng() % 10;
1056
+ switch (r) {
1057
+ case 0: return "So";
1058
+ case 1: return "Once upon a time";
1059
+ case 2: return "When";
1060
+ case 3: return "The";
1061
+ case 4: return "After";
1062
+ case 5: return "If";
1063
+ case 6: return "import";
1064
+ case 7: return "He";
1065
+ case 8: return "She";
1066
+ case 9: return "They";
1067
+ }
1068
+
1069
+ return "The";
1070
+ }
1071
+
1072
+ int main(int argc, char ** argv) {
1073
+ ggml_time_init();
1074
+
1075
+ const int64_t t_main_start_us = ggml_time_us();
1076
+
1077
+ gpt_hparams params;
1078
+
1079
+ if (gpt_params_parse(argc, argv, params) == false) {
1080
+ return 1;
1081
+ }
1082
+
1083
+ if (params.seed < 0) {
1084
+ params.seed = time(NULL);
1085
+ }
1086
+
1087
+ printf("%s: seed = %d\n", __func__, params.seed);
1088
+
1089
+ std::mt19937 rng(params.seed);
1090
+ if (params.prompt.empty()) {
1091
+ params.prompt = gpt_random_prompt(rng);
1092
+ }
1093
+
1094
+ int64_t t_load_us = 0;
1095
+
1096
+ gpt_vocab vocab;
1097
+ gpt_model model;
1098
+
1099
+ // load the model
1100
+ {
1101
+ const int64_t t_start_us = ggml_time_us();
1102
+
1103
+ if (!gpt_model_load(params.model, model, vocab)) {
1104
+ fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
1105
+ return 1;
1106
+ }
1107
+
1108
+ t_load_us = ggml_time_us() - t_start_us;
1109
+
1110
+ test_gpt_tokenizer(vocab, params.token_test);
1111
+ }
1112
+
1113
+ while(true) {
1114
+ int n_past = 0;
1115
+
1116
+ int64_t t_sample_us = 0;
1117
+ int64_t t_predict_us = 0;
1118
+
1119
+ std::vector<float> logits;
1120
+
1121
+ // tokenize the prompt
1122
+ std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
1123
+
1124
+ params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
1125
+
1126
+ printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
1127
+ printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
1128
+ for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
1129
+ printf("%d ", embd_inp[i]);
1130
+ }
1131
+ printf("\n\n");
1132
+
1133
+ // submit the input prompt in batches of up to n_batch tokens
1134
+ // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
1135
+ std::vector<gpt_vocab::id> embd;
1136
+
1137
+ // determine the required inference memory per token:
1138
+ size_t mem_per_token = 0;
1139
+ gpt_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
1140
+
1141
+ for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
1142
+ // predict
1143
+ if (embd.size() > 0) {
1144
+ const int64_t t_start_us = ggml_time_us();
1145
+
1146
+ if (!gpt_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
1147
+ printf("Failed to predict\n");
1148
+ return 1;
1149
+ }
1150
+
1151
+ t_predict_us += ggml_time_us() - t_start_us;
1152
+ }
1153
+
1154
+ n_past += embd.size();
1155
+ embd.clear();
1156
+
1157
+ if (i >= embd_inp.size()) {
1158
+ // sample next token
1159
+ const int top_k = params.top_k;
1160
+ const float top_p = params.top_p;
1161
+ const float temp = params.temp;
1162
+
1163
+ const int n_vocab = model.hparams.n_vocab;
1164
+
1165
+ gpt_vocab::id id = 0;
1166
+
1167
+ {
1168
+ const int64_t t_start_sample_us = ggml_time_us();
1169
+
1170
+ id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
1171
+
1172
+ t_sample_us += ggml_time_us() - t_start_sample_us;
1173
+ }
1174
+
1175
+ // add it to the context
1176
+ embd.push_back(id);
1177
+ } else {
1178
+ // if here, it means we are still processing the input prompt
1179
+ for (size_t k = i; k < embd_inp.size(); k++) {
1180
+ embd.push_back(embd_inp[k]);
1181
+ if (int32_t(embd.size()) >= params.n_batch) {
1182
+ break;
1183
+ }
1184
+ }
1185
+ i += embd.size() - 1;
1186
+ }
1187
+
1188
+ // display text
1189
+ for (auto id : embd) {
1190
+ printf("%s", vocab.id_to_token[id].c_str());
1191
+ }
1192
+ fflush(stdout);
1193
+
1194
+ // end of text token
1195
+ if (embd.back() == 50256) {
1196
+ // report timing
1197
+ {
1198
+ const int64_t t_main_end_us = ggml_time_us();
1199
+
1200
+ printf("\n\n");
1201
+ printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
1202
+ printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
1203
+ printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
1204
+ printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
1205
+ printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
1206
+ }
1207
+ break;
1208
+ }
1209
+ }
1210
+ }
1211
+
1212
+ ggml_free(model.ctx_w);
1213
+
1214
+ return 0;
1215
+ }