/*
Inference for Llama-2 Transformer model in pure C.
This version will spit out story blocks as fast as possible to a folder called inbox.
Metrics are shown per story; no doubt this could be faster.
Output using -O3 and no -fopenmp, with token-by-token reporting on the test machine, gave 6-8 tok/s.
Compiling as outlined below and foregoing constant screen output nets between 80-330 tok/s
on the same machine, i.e. roughly 10-55 times faster (80/8 = 10, 330/6 = 55).
Example compile: (see README for more details)
$ gcc -o run run_blocks.c -lm -fopenmp -Wall -Wpedantic -Wformat=2 -Wcast-align -Wnull-dereference -g3 -Ofast
Then run with:
$ ./run <checkpoint_file> [temperature] [seed]
*/
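/*
A sketch of the multi-threaded build (an assumption on my part: the code targets
Windows via <conio.h>/<windows.h>, so "gcc" here means a MinGW-style toolchain).
The matmul and attention loops are guarded by the MULTI_THREADED macro, which
defaults to 0 below, so define it on the command line alongside -fopenmp:
$ gcc -o run run_blocks.c -lm -fopenmp -DMULTI_THREADED=1 -Ofast
*/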
#include <conio.h>   // for _kbhit
#include <direct.h>  // for creating directory
#include <errno.h>   // for the EEXIST check after _mkdir
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <windows.h>
#define MALLOC(ptr, size) \
    do { \
        ptr = malloc(size); \
        if (!ptr) { \
            printf("Failed to allocate memory.\n"); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)
#ifndef MULTI_THREADED
#define MULTI_THREADED 0
#endif
int fread_check(void *ptr, size_t size, size_t nmemb, FILE *stream) {
    size_t result = fread(ptr, size, nmemb, stream);
    if (result != nmemb) {
        printf("Error reading file.\n");
        return 1;
    }
    return 0;
}
#if MULTI_THREADED // set to 1 (e.g. -DMULTI_THREADED=1) to enable the OpenMP pragmas
#include <omp.h>
#endif
#define CHUNK_BY_STORY
// ----------------------------------------------------------------------------
// Transformer and RunState structs, and related memory management
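// note: main() freads the Config struct below in one shot straight from the
// checkpoint header, so the field order and types must match the export layout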
typedef struct {
    int dim; // transformer dimension
    int hidden_dim; // for ffn layers
    int n_layers; // number of layers
    int n_heads; // number of query heads
    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
    int vocab_size; // vocabulary size, usually 256 (byte-level)
    int seq_len; // max sequence length
} Config;
typedef struct {
    // token embedding table
    float* token_embedding_table; // (vocab_size, dim)
    // weights for rmsnorms
    float* rms_att_weight; // (layer, dim) rmsnorm weights
    float* rms_ffn_weight; // (layer, dim)
    // weights for matmuls
    float* wq; // (layer, dim, dim)
    float* wk; // (layer, dim, dim)
    float* wv; // (layer, dim, dim)
    float* wo; // (layer, dim, dim)
    // weights for ffn
    float* w1; // (layer, hidden_dim, dim)
    float* w2; // (layer, dim, hidden_dim)
    float* w3; // (layer, hidden_dim, dim)
    // final rmsnorm
    float* rms_final_weight; // (dim,)
    // freq_cis for RoPE relative positional embeddings
    float* freq_cis_real; // (seq_len, dim/2)
    float* freq_cis_imag; // (seq_len, dim/2)
} TransformerWeights;
typedef struct {
    // current wave of activations
    float *x; // activation at current time stamp (dim,)
    float *xb; // same, but inside a residual branch (dim,)
    float *xb2; // an additional buffer just for convenience (dim,)
    float *hb; // buffer for hidden dimension in the ffn (hidden_dim,)
    float *hb2; // buffer for hidden dimension in the ffn (hidden_dim,)
    float *q; // query (dim,)
    float *k; // key (dim,)
    float *v; // value (dim,)
    float *att; // buffer for scores/attention values (n_heads, seq_len)
    float *logits; // output logits
    // kv cache
    float* key_cache; // (layer, seq_len, dim)
    float* value_cache; // (layer, seq_len, dim)
} RunState;
void malloc_run_state(RunState* s, Config* p) {
    // we calloc instead of malloc to keep valgrind happy
    s->x = calloc(p->dim, sizeof(float));
    s->xb = calloc(p->dim, sizeof(float));
    s->xb2 = calloc(p->dim, sizeof(float));
    s->hb = calloc(p->hidden_dim, sizeof(float));
    s->hb2 = calloc(p->hidden_dim, sizeof(float));
    s->q = calloc(p->dim, sizeof(float));
    s->k = calloc(p->dim, sizeof(float));
    s->v = calloc(p->dim, sizeof(float));
    s->att = calloc(p->n_heads * p->seq_len, sizeof(float));
    s->logits = calloc(p->vocab_size, sizeof(float));
    s->key_cache = calloc(p->n_layers * p->seq_len * p->dim, sizeof(float));
    s->value_cache = calloc(p->n_layers * p->seq_len * p->dim, sizeof(float));
    // ensure all mallocs went fine
    if (!s->x || !s->xb || !s->xb2 || !s->hb || !s->hb2 || !s->q
        || !s->k || !s->v || !s->att || !s->logits || !s->key_cache
        || !s->value_cache) {
        printf("malloc failed!\n");
        exit(1);
    }
}
void free_run_state(RunState* s) {
    free(s->x);
    free(s->xb);
    free(s->xb2);
    free(s->hb);
    free(s->hb2);
    free(s->q);
    free(s->k);
    free(s->v);
    free(s->att);
    free(s->logits);
    free(s->key_cache);
    free(s->value_cache);
}
void malloc_weights(TransformerWeights* w, Config* p) {
    // we calloc instead of malloc to keep valgrind happy
    w->token_embedding_table = calloc(p->vocab_size * p->dim, sizeof(float));
    w->rms_att_weight = calloc(p->n_layers * p->dim, sizeof(float));
    w->rms_ffn_weight = calloc(p->n_layers * p->dim, sizeof(float));
    w->wq = calloc(p->n_layers * p->dim * p->dim, sizeof(float));
    w->wk = calloc(p->n_layers * p->dim * p->dim, sizeof(float));
    w->wv = calloc(p->n_layers * p->dim * p->dim, sizeof(float));
    w->wo = calloc(p->n_layers * p->dim * p->dim, sizeof(float));
    w->w1 = calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float));
    w->w2 = calloc(p->n_layers * p->dim * p->hidden_dim, sizeof(float));
    w->w3 = calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float));
    w->rms_final_weight = calloc(p->dim, sizeof(float));
    w->freq_cis_real = calloc(p->seq_len * p->dim / 2, sizeof(float));
    w->freq_cis_imag = calloc(p->seq_len * p->dim / 2, sizeof(float));
    // ensure all mallocs went fine
    if (!w->token_embedding_table || !w->rms_att_weight || !w->rms_ffn_weight
        || !w->wq || !w->wk || !w->wv || !w->wo || !w->w1 || !w->w2 || !w->w3 ||
        !w->rms_final_weight || !w->freq_cis_real || !w->freq_cis_imag) {
        printf("malloc failed!\n");
        exit(1);
    }
}
void free_weights(TransformerWeights* w) {
    free(w->token_embedding_table);
    free(w->rms_att_weight);
    free(w->rms_ffn_weight);
    free(w->wq);
    free(w->wk);
    free(w->wv);
    free(w->wo);
    free(w->w1);
    free(w->w2);
    free(w->w3);
    free(w->rms_final_weight);
    free(w->freq_cis_real);
    free(w->freq_cis_imag);
}
// ----------------------------------------------------------------------------
// initialization: read from checkpoint
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
    if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != p->vocab_size * p->dim) return 1;
    if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != p->n_layers * p->dim) return 1;
    if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != p->n_layers * p->dim * p->dim) return 1;
    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != p->n_layers * p->dim * p->dim) return 1;
    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != p->n_layers * p->dim * p->dim) return 1;
    if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != p->n_layers * p->dim * p->dim) return 1;
    if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != p->n_layers * p->dim) return 1;
    if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != p->n_layers * p->dim * p->hidden_dim) return 1;
    if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != p->n_layers * p->hidden_dim * p->dim) return 1;
    if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != p->n_layers * p->dim * p->hidden_dim) return 1;
    if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != p->dim) return 1;
    int head_size = p->dim / p->n_heads;
    if (fread(w->freq_cis_real, sizeof(float), p->seq_len * head_size / 2, f) != p->seq_len * head_size / 2) return 1;
    if (fread(w->freq_cis_imag, sizeof(float), p->seq_len * head_size / 2, f) != p->seq_len * head_size / 2) return 1;
    return 0;
}
// ----------------------------------------------------------------------------
// neural net blocks
void accum(float *a, float *b, int size) {
    for (int i = 0; i < size; i++) {
        a[i] += b[i];
    }
}
void rmsnorm(float* o, float* x, float* weight, int size) {
    // calculate sum of squares
    float ss = 0.0f;
    for (int j = 0; j < size; j++) {
        ss += x[j] * x[j];
    }
    ss /= size;
    ss += 1e-5f;
    ss = 1.0f / sqrtf(ss);
    // normalize and scale
    for (int j = 0; j < size; j++) {
        o[j] = weight[j] * (ss * x[j]);
    }
}
void softmax(float* x, int size) {
    // find max value (for numerical stability)
    float max_val = x[0];
    for (int i = 1; i < size; i++) {
        if (x[i] > max_val) {
            max_val = x[i];
        }
    }
    // exp and sum
    float sum = 0.0f;
    for (int i = 0; i < size; i++) {
        x[i] = expf(x[i] - max_val);
        sum += x[i];
    }
    // normalize
    for (int i = 0; i < size; i++) {
        x[i] /= sum;
    }
}
void matmul(float* xout, float* x, float* w, int n, int d) {
    // W (d,n) @ x (n,) -> xout (d,)
    #if MULTI_THREADED
    #pragma omp parallel for
    #endif
    for (int i = 0; i < d; i++) {
        float val = 0.0f;
        for (int j = 0; j < n; j++) {
            val += w[i * n + j] * x[j];
        }
        xout[i] = val;
    }
}
void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights* w) {
    // a few convenience variables
    float *x = s->x;
    int dim = p->dim;
    int hidden_dim = p->hidden_dim;
    int head_size = dim / p->n_heads;
    // copy the token embedding into x
    float* content_row = &(w->token_embedding_table[token * dim]);
    memcpy(x, content_row, dim*sizeof(*x));
    // pluck out the "pos" row of freq_cis_real and freq_cis_imag
    float* freq_cis_real_row = w->freq_cis_real + pos * head_size / 2;
    float* freq_cis_imag_row = w->freq_cis_imag + pos * head_size / 2;
    // forward all the layers
    for (int l = 0; l < p->n_layers; l++) {
        // attention rmsnorm
        rmsnorm(s->xb, x, w->rms_att_weight + l*dim, dim);
        // qkv matmuls for this position
        matmul(s->q, s->xb, w->wq + l*dim*dim, dim, dim);
        matmul(s->k, s->xb, w->wk + l*dim*dim, dim, dim);
        matmul(s->v, s->xb, w->wv + l*dim*dim, dim, dim);
        // apply RoPE rotation to the q and k vectors for each head
        for (int h = 0; h < p->n_heads; h++) {
            // get the q and k vectors for this head
            float* q = s->q + h * head_size;
            float* k = s->k + h * head_size;
            // rotate q and k by the freq_cis_real and freq_cis_imag
            for (int i = 0; i < head_size; i+=2) {
                float q0 = q[i];
                float q1 = q[i+1];
                float k0 = k[i];
                float k1 = k[i+1];
                float fcr = freq_cis_real_row[i/2];
                float fci = freq_cis_imag_row[i/2];
                q[i]   = q0 * fcr - q1 * fci;
                q[i+1] = q0 * fci + q1 * fcr;
                k[i]   = k0 * fcr - k1 * fci;
                k[i+1] = k0 * fci + k1 * fcr;
            }
        }
        // save key,value at this time step (pos) to our kv cache
        int loff = l * p->seq_len * dim; // kv cache layer offset for convenience
        float* key_cache_row = s->key_cache + loff + pos * dim;
        float* value_cache_row = s->value_cache + loff + pos * dim;
        memcpy(key_cache_row, s->k, dim*sizeof(*key_cache_row));
        memcpy(value_cache_row, s->v, dim*sizeof(*value_cache_row));
        // multihead attention. iterate over all heads
        #if MULTI_THREADED
        #pragma omp parallel for
        #endif
        for (int h = 0; h < p->n_heads; h++) {
            // get the query vector for this head
            float* q = s->q + h * head_size;
            // attention scores for this head
            float* att = s->att + h * p->seq_len;
            // iterate over all timesteps, including the current one
            for (int t = 0; t <= pos; t++) {
                // get the key vector for this head and at this timestep
                float* k = s->key_cache + loff + t * dim + h * head_size;
                // calculate the attention score as the dot product of q and k
                float score = 0.0f;
                for (int i = 0; i < head_size; i++) {
                    score += q[i] * k[i];
                }
                score /= sqrtf(head_size);
                // save the score to the attention buffer
                att[t] = score;
            }
            // softmax the scores to get attention weights, from 0..pos inclusively
            softmax(att, pos + 1);
            // weighted sum of the values, store back into xb
            for (int i = 0; i < head_size; i++) {
                float val = 0.0f;
                for (int t = 0; t <= pos; t++) {
                    val += att[t] * s->value_cache[loff + t * dim + h * head_size + i]; // note bad locality
                }
                s->xb[h * head_size + i] = val;
            }
        }
        // final matmul to get the output of the attention
        matmul(s->xb2, s->xb, w->wo + l*dim*dim, dim, dim);
        // residual connection back into x
        accum(x, s->xb2, dim);
        // ffn rmsnorm
        rmsnorm(s->xb, x, w->rms_ffn_weight + l*dim, dim);
        // Now for FFN in PyTorch we have: self.w2(F.silu(self.w1(x)) * self.w3(x))
        // first calculate self.w1(x) and self.w3(x)
        matmul(s->hb, s->xb, w->w1 + l*dim*hidden_dim, dim, hidden_dim);
        matmul(s->hb2, s->xb, w->w3 + l*dim*hidden_dim, dim, hidden_dim);
        // F.silu; silu(x)=x*σ(x), where σ(x) is the logistic sigmoid
        for (int i = 0; i < hidden_dim; i++) {
            s->hb[i] = s->hb[i] * (1.0f / (1.0f + expf(-s->hb[i])));
        }
        // elementwise multiply with w3(x)
        for (int i = 0; i < hidden_dim; i++) {
            s->hb[i] = s->hb[i] * s->hb2[i];
        }
        // final matmul to get the output of the ffn
        matmul(s->xb, s->hb, w->w2 + l*dim*hidden_dim, hidden_dim, dim);
        // residual connection
        accum(x, s->xb, dim);
    }
    // final rmsnorm
    rmsnorm(x, x, w->rms_final_weight, dim);
    // classifier into logits
    matmul(s->logits, x, w->token_embedding_table, p->dim, p->vocab_size);
}
int sample(float* probabilities, int n) {
    // sample index from probabilities, they must sum to 1
    float r = (float)rand() / (float)RAND_MAX;
    float cdf = 0.0f;
    for (int i = 0; i < n; i++) {
        cdf += probabilities[i];
        if (r < cdf) {
            return i;
        }
    }
    return n - 1; // in case of rounding errors
}
int argmax(float* v, int n) {
    // return argmax of v in elements 0..n
    int max_i = 0;
    float max_p = v[0];
    for (int i = 1; i < n; i++) {
        if (v[i] > max_p) {
            max_i = i;
            max_p = v[i];
        }
    }
    return max_i;
}
// ----------------------------------------------------------------------------
long long time_in_ms() {
    FILETIME filetime;
    GetSystemTimeAsFileTime(&filetime);
    // A FILETIME contains a 64-bit value representing the number of 100-nanosecond intervals since 1601-01-01T00:00:00Z.
    ULARGE_INTEGER large_integer;
    large_integer.LowPart = filetime.dwLowDateTime;
    large_integer.HighPart = filetime.dwHighDateTime;
    ULONGLONG time = large_integer.QuadPart;
    // Convert from 100-nanosecond intervals to milliseconds
    time /= 10000;
    // Subtract the Unix epoch (1601-01-01T00:00:00Z to 1970-01-01T00:00:00Z)
    // The offset in FILETIME format is 11644473600000 milliseconds
    time -= 11644473600000;
    return (long long)time;
}
int main(int argc, char *argv[]) {
    char *checkpoint = NULL;
    float temperature = 0.0f;
    time_t current_time;
    Config config;
    TransformerWeights weights;
    char** vocab;
    if (argc < 2) {
        printf("Usage: %s <checkpoint_file> [temperature] [seed]\n", argv[0]);
        return 1;
    }
    checkpoint = argv[1];
    if (argc >= 3) {
        temperature = atof(argv[2]);
    }
    if (argc >= 4) {
        unsigned int seed = atoi(argv[3]);
        srand(seed);
    } else {
        time(&current_time);
        srand((unsigned int)current_time);
    }
    if (_mkdir("inbox") == -1 && errno != EEXIST) {
        printf("Error creating directory 'inbox'!\n");
        return 1;
    }
    // Setup config, weights, and vocab outside of loop
    FILE *file = fopen(checkpoint, "rb");
    if (!file) {
        printf("Unable to open the checkpoint file %s!\n", checkpoint);
        return 1;
    }
    if (fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
    malloc_weights(&weights, &config);
    if (checkpoint_init_weights(&weights, &config, file)) { return 1; }
    fclose(file);
    MALLOC(vocab, config.vocab_size * sizeof(char*));
    file = fopen("tokenizer.bin", "rb");
    if (!file) {
        printf("Unable to open the tokenizer file tokenizer.bin! Run python tokenizer.py to convert tokenizer.model -> tokenizer.bin\n");
        return 1;
    }
    int len;
    for (int i = 0; i < config.vocab_size; i++) {
        if (fread(&len, sizeof(int), 1, file) != 1) { return 1; }
        MALLOC(vocab[i], len + 1);
        if (fread(vocab[i], len, 1, file) != 1) { return 1; }
        vocab[i][len] = '\0';
    }
    fclose(file);
    RunState state;
    malloc_run_state(&state, &config);
    while (!_kbhit()) {
        // note: the rand() state carries across stories, so a fixed [seed]
        // argument stays reproducible and same-second stories still differ
        long long start = time_in_ms();
        int next;
        int token = 1; // token 1 is the sentencepiece BOS token, which kicks off a story
        int pos = 0;
        time_t t = time(NULL);
        struct tm *tm_info = localtime(&t);
        char timestamp[26];
        strftime(timestamp, 26, "%Y%m%d%H%M%S", tm_info);
        char filename[50];
        snprintf(filename, 50, "inbox/%s.txt", timestamp);
        FILE *output_file = fopen(filename, "w");
        if (!output_file) {
            printf("Unable to open the inbox file %s!\n", filename);
            return 1;
        }
        #define MAX_STRING_SIZE 2000
        char tokens_so_far[MAX_STRING_SIZE] = "";
        int story_done = 0;
        while (pos < config.seq_len) {
            transformer(token, pos, &config, &state, &weights);
            if (temperature == 0.0f) {
                next = argmax(state.logits, config.vocab_size);
            } else {
                for (int q = 0; q < config.vocab_size; q++) { state.logits[q] /= temperature; }
                softmax(state.logits, config.vocab_size);
                next = sample(state.logits, config.vocab_size);
            }
            if (strlen(tokens_so_far) + strlen(vocab[next]) < MAX_STRING_SIZE) {
                if (next > 1) {
                    strcat(tokens_so_far, vocab[next]);
                }
            }
            // token ids below 2 (<unk>/BOS in the llama vocab) mark the end of a story block
            if (next < 2) {
                fprintf(output_file, "%s", tokens_so_far);
                fclose(output_file);
                story_done = 1;
                break;
            }
            token = next;
            pos++;
        }
        // if the context filled up before an end-of-story token, flush what we have
        if (!story_done) {
            fprintf(output_file, "%s", tokens_so_far);
            fclose(output_file);
        }
        long long end = time_in_ms();
        // note: the tok/s figure assumes a full seq_len worth of tokens, so it
        // overestimates throughput for stories that ended early
        printf("achieved tok/s: %04.1f for %s \n", config.seq_len / (double)(end - start) * 1000, filename);
        fflush(stdout);
    }
    free_run_state(&state);
    free_weights(&weights);
    for (int i = 0; i < config.vocab_size; i++) { free(vocab[i]); }
    free(vocab);
    return 0;
}
Sample story output:

Once upon a time, there was a jolly clown named Bobo. Bobo loved to make people laugh and smile. He wore a big red nose and had colorful feathers.
One day, Bobo went to a party where a group of children were playing. They were having fun and making a lot of noise. But then, Bobo got a call on his phone. It was time to go
At 328.9 tok/s, the best story in this run is around a 55x speedup over the vanilla build on the test box; the run average is around a 20x speedup.
run_factored ../out/model44m.bin 1 23458
achieved tok/s: 197.2 for inbox/20230728042127.txt
achieved tok/s: 171.5 for inbox/20230728042132.txt
achieved tok/s: 168.0 for inbox/20230728042138.txt
achieved tok/s: 120.0 for inbox/20230728042144.txt
achieved tok/s: 138.0 for inbox/20230728042153.txt
achieved tok/s: 117.9 for inbox/20230728042200.txt
achieved tok/s: 182.3 for inbox/20230728042209.txt
achieved tok/s: 279.1 for inbox/20230728042215.txt
achieved tok/s: 159.9 for inbox/20230728042218.txt
achieved tok/s: 95.0 for inbox/20230728042225.txt
achieved tok/s: 180.4 for inbox/20230728042235.txt
achieved tok/s: 328.9 for inbox/20230728042241.txt
achieved tok/s: 168.9 for inbox/20230728042244.txt
achieved tok/s: 150.6 for inbox/20230728042250.txt
achieved tok/s: 155.7 for inbox/20230728042257.txt
achieved tok/s: 158.4 for inbox/20230728042304.txt
achieved tok/s: 164.8 for inbox/20230728042310.txt
achieved tok/s: 150.4 for inbox/20230728042316.txt
achieved tok/s: 169.4 for inbox/20230728042323.txt
achieved tok/s: 125.4 for inbox/20230728042329.txt
achieved tok/s: 108.1 for inbox/20230728042337.txt
achieved tok/s: 158.4 for inbox/20230728042347.txt
achieved tok/s: 88.9 for inbox/20230728042353.txt
achieved tok/s: 105.8 for inbox/20230728042405.txt
achieved tok/s: 149.4 for inbox/20230728042415.txt
achieved tok/s: 155.3 for inbox/20230728042421.txt
achieved tok/s: 104.3 for inbox/20230728042428.txt
achieved tok/s: 187.8 for inbox/20230728042438.txt
achieved tok/s: 86.8 for inbox/20230728042443.txt
achieved tok/s: 226.7 for inbox/20230728042455.txt
achieved tok/s: 160.0 for inbox/20230728042500.txt
achieved tok/s: 137.3 for inbox/20230728042506.txt
achieved tok/s: 153.3 for inbox/20230728042514.txt
achieved tok/s: 176.7 for inbox/20230728042520.txt
achieved tok/s: 207.6 for inbox/20230728042526.txt
achieved tok/s: 159.9 for inbox/20230728042531.txt
achieved tok/s: 160.3 for inbox/20230728042537.txt
achieved tok/s: 137.7 for inbox/20230728042544.txt
achieved tok/s: 176.8 for inbox/20230728042551.txt
achieved tok/s: 153.2 for inbox/20230728042600.txt
achieved tok/s: 152.9 for inbox/20230728042607.txt
achieved tok/s: 179.4 for inbox/20230728042614.txt
Once upon a time, there was a long cat. The cat had a smooth home. In the home, there was a long string.
One day, the cat went outside to play. The cat chased the ball and had fun. When the cat came back home, it was sad.
The cat looked and looked, and then it found the string. The cat played with the string all day. The cat was very happy. And the long string was its best toy.
Best output: 299.8 tok/s using the 44M model, around a 50x speedup over the vanilla build.
The complete results run is listed below for transparency.
run_factored ../out/model44m.bin 1 23457
achieved tok/s: 174.8 for inbox/20230728035328.txt
achieved tok/s: 103.0 for inbox/20230728035334.txt
achieved tok/s: 205.8 for inbox/20230728035344.txt
achieved tok/s: 165.0 for inbox/20230728035349.txt
achieved tok/s: 114.6 for inbox/20230728035355.txt
achieved tok/s: 111.3 for inbox/20230728035404.txt
achieved tok/s: 180.1 for inbox/20230728035413.txt
achieved tok/s: 154.6 for inbox/20230728035419.txt
achieved tok/s: 152.0 for inbox/20230728035425.txt
achieved tok/s: 187.7 for inbox/20230728035432.txt
achieved tok/s: 135.8 for inbox/20230728035437.txt
achieved tok/s: 156.2 for inbox/20230728035445.txt
achieved tok/s: 167.8 for inbox/20230728035452.txt
achieved tok/s: 165.2 for inbox/20230728035458.txt
achieved tok/s: 124.8 for inbox/20230728035504.txt
achieved tok/s: 151.6 for inbox/20230728035512.txt
achieved tok/s: 126.9 for inbox/20230728035519.txt
achieved tok/s: 164.3 for inbox/20230728035527.txt
achieved tok/s: 127.1 for inbox/20230728035533.txt
achieved tok/s: 195.1 for inbox/20230728035541.txt
achieved tok/s: 164.5 for inbox/20230728035547.txt
achieved tok/s: 158.6 for inbox/20230728035553.txt
achieved tok/s: 169.2 for inbox/20230728035559.txt
achieved tok/s: 157.7 for inbox/20230728035605.txt
achieved tok/s: 174.6 for inbox/20230728035612.txt
achieved tok/s: 134.0 for inbox/20230728035618.txt
achieved tok/s: 137.9 for inbox/20230728035625.txt
achieved tok/s: 246.1 for inbox/20230728035633.txt
achieved tok/s: 123.3 for inbox/20230728035637.txt
achieved tok/s: 146.2 for inbox/20230728035645.txt
achieved tok/s: 116.7 for inbox/20230728035652.txt
achieved tok/s: 168.7 for inbox/20230728035701.txt
achieved tok/s: 299.8 for inbox/20230728035707.txt
achieved tok/s: 153.9 for inbox/20230728035711.txt
achieved tok/s: 192.6 for inbox/20230728035717.txt
achieved tok/s: 150.2 for inbox/20230728035723.txt
achieved tok/s: 196.2 for inbox/20230728035729.txt
achieved tok/s: 125.7 for inbox/20230728035735.txt
achieved tok/s: 173.0 for inbox/20230728035743.txt
achieved tok/s: 168.9 for inbox/20230728035749.txt
achieved tok/s: 128.3 for inbox/20230728035755.txt
achieved tok/s: 160.4 for inbox/20230728035803.txt
achieved tok/s: 131.6 for inbox/20230728035809.txt
achieved tok/s: 154.8 for inbox/20230728035817.txt
achieved tok/s: 163.5 for inbox/20230728035824.txt
achieved tok/s: 129.7 for inbox/20230728035830.txt
achieved tok/s: 147.4 for inbox/20230728035838.txt
achieved tok/s: 144.1 for inbox/20230728035845.txt
achieved tok/s: 180.5 for inbox/20230728035852.txt
achieved tok/s: 85.5 for inbox/20230728035857.txt
achieved tok/s: 174.0 for inbox/20230728035909.txt
achieved tok/s: 131.2 for inbox/20230728035915.txt
achieved tok/s: 229.4 for inbox/20230728035923.txt
achieved tok/s: 230.1 for inbox/20230728035928.txt
achieved tok/s: 184.7 for inbox/20230728035932.txt
achieved tok/s: 122.5 for inbox/20230728035938.txt
achieved tok/s: 154.7 for inbox/20230728035946.txt
achieved tok/s: 165.0 for inbox/20230728035953.txt
achieved tok/s: 132.4 for inbox/20230728035959.txt
achieved tok/s: 141.4 for inbox/20230728040007.txt
achieved tok/s: 117.3 for inbox/20230728040014.txt
achieved tok/s: 207.2 for inbox/20230728040023.txt
achieved tok/s: 151.8 for inbox/20230728040027.txt
achieved tok/s: 126.8 for inbox/20230728040034.txt
achieved tok/s: 180.3 for inbox/20230728040042.txt
achieved tok/s: 117.2 for inbox/20230728040048.txt
achieved tok/s: 98.2 for inbox/20230728040057.txt
achieved tok/s: 104.3 for inbox/20230728040107.txt
achieved tok/s: 135.4 for inbox/20230728040117.txt
achieved tok/s: 174.3 for inbox/20230728040125.txt
achieved tok/s: 129.3 for inbox/20230728040130.txt
achieved tok/s: 217.7 for inbox/20230728040138.txt
achieved tok/s: 184.7 for inbox/20230728040143.txt
achieved tok/s: 197.6 for inbox/20230728040149.txt
achieved tok/s: 157.3 for inbox/20230728040154.txt
achieved tok/s: 215.4 for inbox/20230728040200.txt
achieved tok/s: 160.6 for inbox/20230728040205.txt
achieved tok/s: 147.6 for inbox/20230728040211.txt
achieved tok/s: 169.9 for inbox/20230728040218.txt
achieved tok/s: 143.2 for inbox/20230728040224.txt
achieved tok/s: 143.1 for inbox/20230728040232.txt
achieved tok/s: 125.0 for inbox/20230728040239.txt
achieved tok/s: 128.1 for inbox/20230728040247.txt
achieved tok/s: 162.4 for inbox/20230728040255.txt
achieved tok/s: 146.3 for inbox/20230728040301.txt