@twobob
Last active July 28, 2023 03:31
This version writes story blocks as timestamped .txt files, as fast as possible, to a folder named inbox. WINDOWS VERSION, YMMV
/*
Inference for Llama-2 Transformer model in pure C.
This version spits out story blocks, as fast as possible, to a folder called inbox.
Metrics are shown per story; no doubt this could be faster.
Output using -O3 and no -fopenmp, with token-by-token reporting, gave 6-8 tok/s on the test machine.
Compiling as outlined below and foregoing constant screen output nets between 80 and 330 tok/s on the same machine,
i.e. between 10 and 55 times faster.
Example compile (see README for more details):
$ gcc -o run run_blocks.c -lm -fopenmp -Wall -Wpedantic -Wformat=2 -Wcast-align -Wnull-dereference -g3 -Ofast
Then run with:
$ ./run <checkpoint_file> [temperature] [seed]
*/
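/*
Each story lands in inbox/ as its own timestamped .txt file, so downstream tools
can simply poll that folder. A minimal consumer sketch (hypothetical, not part of
this gist) that lists the generated files via the Win32 API:

    #include <stdio.h>
    #include <windows.h>
    int main(void) {
        WIN32_FIND_DATAA fd;
        HANDLE h = FindFirstFileA("inbox\\*.txt", &fd); // match the output folder
        if (h == INVALID_HANDLE_VALUE) return 1;        // no stories generated yet
        do { printf("%s\n", fd.cFileName); } while (FindNextFileA(h, &fd));
        FindClose(h);
        return 0;
    }
*/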
#include <conio.h>
#include <direct.h> // for creating directory
#include <errno.h>  // for errno / EEXIST after _mkdir
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <windows.h>
#define MALLOC(ptr, size) \
    do { \
        ptr = malloc(size); \
        if (!ptr) { \
            printf("Failed to allocate memory.\n"); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)
#ifndef MULTI_THREADED
#define MULTI_THREADED 0
#endif
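// MULTI_THREADED defaults to 0 (single-threaded); build with -DMULTI_THREADED=1
// and -fopenmp to enable the OpenMP pragmas guarded below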
int fread_check(void *ptr, size_t size, size_t nmemb, FILE *stream) {
    size_t result = fread(ptr, size, nmemb, stream);
    if (result != nmemb) {
        printf("Error reading file.\n");
        return 1;
    }
    return 0;
}
#if MULTI_THREADED
#include <omp.h>
#endif
#define CHUNK_BY_STORY
// ----------------------------------------------------------------------------
// Transformer and RunState structs, and related memory management
typedef struct {
    int dim; // transformer dimension
    int hidden_dim; // for ffn layers
    int n_layers; // number of layers
    int n_heads; // number of query heads
    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
    int vocab_size; // vocabulary size (32000 for the Llama 2 SentencePiece tokenizer)
    int seq_len; // max sequence length
} Config;
typedef struct {
    // token embedding table
    float* token_embedding_table; // (vocab_size, dim)
    // weights for rmsnorms
    float* rms_att_weight; // (layer, dim) rmsnorm weights
    float* rms_ffn_weight; // (layer, dim)
    // weights for matmuls
    float* wq; // (layer, dim, dim)
    float* wk; // (layer, dim, dim)
    float* wv; // (layer, dim, dim)
    float* wo; // (layer, dim, dim)
    // weights for ffn
    float* w1; // (layer, hidden_dim, dim)
    float* w2; // (layer, dim, hidden_dim)
    float* w3; // (layer, hidden_dim, dim)
    // final rmsnorm
    float* rms_final_weight; // (dim,)
    // freq_cis for RoPE relative positional embeddings
    float* freq_cis_real; // (seq_len, head_size/2)
    float* freq_cis_imag; // (seq_len, head_size/2)
} TransformerWeights;
typedef struct {
    // current wave of activations
    float *x; // activation at current time stamp (dim,)
    float *xb; // same, but inside a residual branch (dim,)
    float *xb2; // an additional buffer just for convenience (dim,)
    float *hb; // buffer for hidden dimension in the ffn (hidden_dim,)
    float *hb2; // buffer for hidden dimension in the ffn (hidden_dim,)
    float *q; // query (dim,)
    float *k; // key (dim,)
    float *v; // value (dim,)
    float *att; // buffer for scores/attention values (n_heads, seq_len)
    float *logits; // output logits
    // kv cache
    float* key_cache; // (layer, seq_len, dim)
    float* value_cache; // (layer, seq_len, dim)
} RunState;
void malloc_run_state(RunState* s, Config* p) {
    // we calloc instead of malloc to keep valgrind happy
    s->x = calloc(p->dim, sizeof(float));
    s->xb = calloc(p->dim, sizeof(float));
    s->xb2 = calloc(p->dim, sizeof(float));
    s->hb = calloc(p->hidden_dim, sizeof(float));
    s->hb2 = calloc(p->hidden_dim, sizeof(float));
    s->q = calloc(p->dim, sizeof(float));
    s->k = calloc(p->dim, sizeof(float));
    s->v = calloc(p->dim, sizeof(float));
    s->att = calloc(p->n_heads * p->seq_len, sizeof(float));
    s->logits = calloc(p->vocab_size, sizeof(float));
    s->key_cache = calloc(p->n_layers * p->seq_len * p->dim, sizeof(float));
    s->value_cache = calloc(p->n_layers * p->seq_len * p->dim, sizeof(float));
    // ensure all mallocs went fine
    if (!s->x || !s->xb || !s->xb2 || !s->hb || !s->hb2 || !s->q
        || !s->k || !s->v || !s->att || !s->logits || !s->key_cache
        || !s->value_cache) {
        printf("malloc failed!\n");
        exit(1);
    }
}
void free_run_state(RunState* s) {
    free(s->x);
    free(s->xb);
    free(s->xb2);
    free(s->hb);
    free(s->hb2);
    free(s->q);
    free(s->k);
    free(s->v);
    free(s->att);
    free(s->logits);
    free(s->key_cache);
    free(s->value_cache);
}
void malloc_weights(TransformerWeights* w, Config* p) {
    // we calloc instead of malloc to keep valgrind happy
    w->token_embedding_table = calloc(p->vocab_size * p->dim, sizeof(float));
    w->rms_att_weight = calloc(p->n_layers * p->dim, sizeof(float));
    w->rms_ffn_weight = calloc(p->n_layers * p->dim, sizeof(float));
    w->wq = calloc(p->n_layers * p->dim * p->dim, sizeof(float));
    w->wk = calloc(p->n_layers * p->dim * p->dim, sizeof(float));
    w->wv = calloc(p->n_layers * p->dim * p->dim, sizeof(float));
    w->wo = calloc(p->n_layers * p->dim * p->dim, sizeof(float));
    w->w1 = calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float));
    w->w2 = calloc(p->n_layers * p->dim * p->hidden_dim, sizeof(float));
    w->w3 = calloc(p->n_layers * p->hidden_dim * p->dim, sizeof(float));
    w->rms_final_weight = calloc(p->dim, sizeof(float));
    // note: dim/2 per position over-allocates here; only seq_len * head_size / 2
    // entries are actually read back in checkpoint_init_weights
    w->freq_cis_real = calloc(p->seq_len * p->dim / 2, sizeof(float));
    w->freq_cis_imag = calloc(p->seq_len * p->dim / 2, sizeof(float));
    // ensure all mallocs went fine
    if (!w->token_embedding_table || !w->rms_att_weight || !w->rms_ffn_weight
        || !w->wq || !w->wk || !w->wv || !w->wo || !w->w1 || !w->w2 || !w->w3 ||
        !w->rms_final_weight || !w->freq_cis_real || !w->freq_cis_imag) {
        printf("malloc failed!\n");
        exit(1);
    }
}
void free_weights(TransformerWeights* w) {
    free(w->token_embedding_table);
    free(w->rms_att_weight);
    free(w->rms_ffn_weight);
    free(w->wq);
    free(w->wk);
    free(w->wv);
    free(w->wo);
    free(w->w1);
    free(w->w2);
    free(w->w3);
    free(w->rms_final_weight);
    free(w->freq_cis_real);
    free(w->freq_cis_imag);
}
// ----------------------------------------------------------------------------
// initialization: read from checkpoint
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
    if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != p->vocab_size * p->dim) return 1;
    if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != p->n_layers * p->dim) return 1;
    if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != p->n_layers * p->dim * p->dim) return 1;
    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != p->n_layers * p->dim * p->dim) return 1;
    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != p->n_layers * p->dim * p->dim) return 1;
    if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != p->n_layers * p->dim * p->dim) return 1;
    if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != p->n_layers * p->dim) return 1;
    if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != p->n_layers * p->dim * p->hidden_dim) return 1;
    if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != p->n_layers * p->hidden_dim * p->dim) return 1;
    if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != p->n_layers * p->dim * p->hidden_dim) return 1;
    if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != p->dim) return 1;
    int head_size = p->dim / p->n_heads;
    if (fread(w->freq_cis_real, sizeof(float), p->seq_len * head_size / 2, f) != p->seq_len * head_size / 2) return 1;
    if (fread(w->freq_cis_imag, sizeof(float), p->seq_len * head_size / 2, f) != p->seq_len * head_size / 2) return 1;
    return 0;
}
// ----------------------------------------------------------------------------
// neural net blocks
void accum(float *a, float *b, int size) {
    for (int i = 0; i < size; i++) {
        a[i] += b[i];
    }
}
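// rmsnorm computes o = weight * (x / rms(x)) elementwise,
// where rms(x) = sqrt(mean(x^2) + 1e-5)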
void rmsnorm(float* o, float* x, float* weight, int size) {
    // calculate sum of squares
    float ss = 0.0f;
    for (int j = 0; j < size; j++) {
        ss += x[j] * x[j];
    }
    ss /= size;
    ss += 1e-5f;
    ss = 1.0f / sqrt(ss);
    // normalize and scale
    for (int j = 0; j < size; j++) {
        o[j] = weight[j] * (ss * x[j]);
    }
}
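// softmax in place: x[i] = exp(x[i] - max(x)) / sum_j exp(x[j] - max(x));
// subtracting max(x) first keeps exp() from overflowing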
void softmax(float* x, int size) {
    // find max value (for numerical stability)
    float max_val = x[0];
    for (int i = 1; i < size; i++) {
        if (x[i] > max_val) {
            max_val = x[i];
        }
    }
    // exp and sum
    float sum = 0.0f;
    for (int i = 0; i < size; i++) {
        x[i] = exp(x[i] - max_val);
        sum += x[i];
    }
    // normalize
    for (int i = 0; i < size; i++) {
        x[i] /= sum;
    }
}
void matmul(float* xout, float* x, float* w, int n, int d) {
    // W (d,n) @ x (n,) -> xout (d,)
    #if MULTI_THREADED
    #pragma omp parallel for
    #endif
    for (int i = 0; i < d; i++) {
        float val = 0.0f;
        for (int j = 0; j < n; j++) {
            val += w[i * n + j] * x[j];
        }
        xout[i] = val;
    }
}
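// matmul dominates inference time, so it is the one loop worth parallelizing.
// A sketch of an OpenMP build (assuming gcc; other flags as in the header):
//   $ gcc -o run run_blocks.c -lm -fopenmp -DMULTI_THREADED=1 -Ofast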
void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights* w) {
    // a few convenience variables
    float *x = s->x;
    int dim = p->dim;
    int hidden_dim = p->hidden_dim;
    int head_size = dim / p->n_heads;
    // copy the token embedding into x
    float* content_row = &(w->token_embedding_table[token * dim]);
    memcpy(x, content_row, dim*sizeof(*x));
    // pluck out the "pos" row of freq_cis_real and freq_cis_imag
    float* freq_cis_real_row = w->freq_cis_real + pos * head_size / 2;
    float* freq_cis_imag_row = w->freq_cis_imag + pos * head_size / 2;
    // forward all the layers
    for (int l = 0; l < p->n_layers; l++) {
        // attention rmsnorm
        rmsnorm(s->xb, x, w->rms_att_weight + l*dim, dim);
        // qkv matmuls for this position
        matmul(s->q, s->xb, w->wq + l*dim*dim, dim, dim);
        matmul(s->k, s->xb, w->wk + l*dim*dim, dim, dim);
        matmul(s->v, s->xb, w->wv + l*dim*dim, dim, dim);
        // apply RoPE rotation to the q and k vectors for each head
        for (int h = 0; h < p->n_heads; h++) {
            // get the q and k vectors for this head
            float* q = s->q + h * head_size;
            float* k = s->k + h * head_size;
            // rotate q and k by the freq_cis_real and freq_cis_imag
            for (int i = 0; i < head_size; i+=2) {
                float q0 = q[i];
                float q1 = q[i+1];
                float k0 = k[i];
                float k1 = k[i+1];
                float fcr = freq_cis_real_row[i/2];
                float fci = freq_cis_imag_row[i/2];
                q[i]   = q0 * fcr - q1 * fci;
                q[i+1] = q0 * fci + q1 * fcr;
                k[i]   = k0 * fcr - k1 * fci;
                k[i+1] = k0 * fci + k1 * fcr;
            }
        }
        // save key,value at this time step (pos) to our kv cache
        int loff = l * p->seq_len * dim; // kv cache layer offset for convenience
        float* key_cache_row = s->key_cache + loff + pos * dim;
        float* value_cache_row = s->value_cache + loff + pos * dim;
        memcpy(key_cache_row, s->k, dim*sizeof(*key_cache_row));
        memcpy(value_cache_row, s->v, dim*sizeof(*value_cache_row));
        // multihead attention. iterate over all heads
        #if MULTI_THREADED
        #pragma omp parallel for
        #endif
        for (int h = 0; h < p->n_heads; h++) {
            // get the query vector for this head
            float* q = s->q + h * head_size;
            // attention scores for this head
            float* att = s->att + h * p->seq_len;
            // iterate over all timesteps, including the current one
            for (int t = 0; t <= pos; t++) {
                // get the key vector for this head and at this timestep
                float* k = s->key_cache + loff + t * dim + h * head_size;
                // calculate the attention score as the dot product of q and k
                float score = 0.0f;
                for (int i = 0; i < head_size; i++) {
                    score += q[i] * k[i];
                }
                score /= sqrtf(head_size);
                // save the score to the attention buffer
                att[t] = score;
            }
            // softmax the scores to get attention weights, from 0..pos inclusively
            softmax(att, pos + 1);
            // weighted sum of the values, store back into xb
            for (int i = 0; i < head_size; i++) {
                float val = 0.0f;
                for (int t = 0; t <= pos; t++) {
                    val += att[t] * s->value_cache[loff + t * dim + h * head_size + i]; // note bad locality
                }
                s->xb[h * head_size + i] = val;
            }
        }
        // final matmul to get the output of the attention
        matmul(s->xb2, s->xb, w->wo + l*dim*dim, dim, dim);
        // residual connection back into x
        accum(x, s->xb2, dim);
        // ffn rmsnorm
        rmsnorm(s->xb, x, w->rms_ffn_weight + l*dim, dim);
        // Now for FFN in PyTorch we have: self.w2(F.silu(self.w1(x)) * self.w3(x))
        // first calculate self.w1(x) and self.w3(x)
        matmul(s->hb, s->xb, w->w1 + l*dim*hidden_dim, dim, hidden_dim);
        matmul(s->hb2, s->xb, w->w3 + l*dim*hidden_dim, dim, hidden_dim);
        // F.silu; silu(x) = x*σ(x), where σ(x) is the logistic sigmoid
        for (int i = 0; i < hidden_dim; i++) {
            s->hb[i] = s->hb[i] * (1.0f / (1.0f + expf(-s->hb[i])));
        }
        // elementwise multiply with w3(x)
        for (int i = 0; i < hidden_dim; i++) {
            s->hb[i] = s->hb[i] * s->hb2[i];
        }
        // final matmul to get the output of the ffn
        matmul(s->xb, s->hb, w->w2 + l*dim*hidden_dim, hidden_dim, dim);
        // residual connection
        accum(x, s->xb, dim);
    }
    // final rmsnorm
    rmsnorm(x, x, w->rms_final_weight, dim);
    // classifier into logits
    matmul(s->logits, x, w->token_embedding_table, p->dim, p->vocab_size);
}
int sample(float* probabilities, int n) {
    // sample index from probabilities, they must sum to 1
    float r = (float)rand() / (float)RAND_MAX;
    float cdf = 0.0f;
    for (int i = 0; i < n; i++) {
        cdf += probabilities[i];
        if (r < cdf) {
            return i;
        }
    }
    return n - 1; // in case of rounding errors
}
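// worked example: with probabilities {0.1, 0.7, 0.2} and r = 0.5 the cdf runs
// 0.1, 0.8, 1.0; the first bucket with r < cdf is index 1, so 1 is returned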
int argmax(float* v, int n) {
    // return the index of the max value in v[0..n-1]
    int max_i = 0;
    float max_p = v[0];
    for (int i = 1; i < n; i++) {
        if (v[i] > max_p) {
            max_i = i;
            max_p = v[i];
        }
    }
    return max_i;
}
// ----------------------------------------------------------------------------
long long time_in_ms() {
    FILETIME filetime;
    GetSystemTimeAsFileTime(&filetime);
    // A FILETIME contains a 64-bit value representing the number of 100-nanosecond intervals since 1601-01-01T00:00:00Z.
    ULARGE_INTEGER large_integer;
    large_integer.LowPart = filetime.dwLowDateTime;
    large_integer.HighPart = filetime.dwHighDateTime;
    ULONGLONG time = large_integer.QuadPart;
    // Convert from 100-nanosecond intervals to milliseconds
    time /= 10000;
    // Subtract the epoch (1601-01-01T00:00:00Z in milliseconds)
    // The epoch in FILETIME format is 11644473600000 milliseconds
    time -= 11644473600000;
    return (long long)time;
}
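// sanity check on the constant: 11644473600000 ms = 11644473600 s, exactly the
// 369 years between 1601-01-01 and the Unix epoch 1970-01-01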
int main(int argc, char *argv[]) {
    char *checkpoint = NULL;
    float temperature = 0.0f;
    time_t current_time;
    Config config;
    TransformerWeights weights;
    char** vocab;
    if (argc < 2) {
        printf("Usage: %s <checkpoint_file> [temperature] [seed]\n", argv[0]);
        return 1;
    }
    checkpoint = argv[1];
    if (argc >= 3) {
        temperature = atof(argv[2]);
    }
    if (argc >= 4) {
        unsigned int seed = atoi(argv[3]);
        srand(seed);
    } else {
        time(&current_time);
        srand((unsigned int)current_time);
    }
    if (_mkdir("inbox") == -1 && errno != EEXIST) {
        printf("Error creating directory 'inbox'!\n");
        return 1;
    }
    // Setup config, weights, and vocab outside of loop
    FILE *file = fopen(checkpoint, "rb");
    if (!file) {
        printf("Unable to open the checkpoint file %s!\n", checkpoint);
        return 1;
    }
    if (fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
    malloc_weights(&weights, &config);
    if (checkpoint_init_weights(&weights, &config, file)) { return 1; }
    fclose(file);
    MALLOC(vocab, config.vocab_size * sizeof(char*));
    file = fopen("tokenizer.bin", "rb");
    if (!file) {
        printf("Unable to open the tokenizer file tokenizer.bin! Run python tokenizer.py to convert tokenizer.model -> tokenizer.bin\n");
        return 1;
    }
    int len;
    for (int i = 0; i < config.vocab_size; i++) {
        if (fread(&len, sizeof(int), 1, file) != 1) { return 1; }
        MALLOC(vocab[i], len + 1);
        if (fread(vocab[i], len, 1, file) != 1) { return 1; }
        vocab[i][len] = '\0';
    }
    fclose(file);
    RunState state;
    malloc_run_state(&state, &config);
    // generate stories back-to-back until any key is pressed
    while (!_kbhit()) {
        time(&current_time);
        srand((unsigned int)current_time);
        long long start = time_in_ms();
        int next;
        int token = 1; // token 1 = BOS, which kicks off a fresh story
        int pos = 0;
        // name the output file with a second-resolution timestamp
        time_t t = time(NULL);
        struct tm *tm_info = localtime(&t);
        char timestamp[26];
        strftime(timestamp, 26, "%Y%m%d%H%M%S", tm_info);
        char filename[50];
        snprintf(filename, 50, "inbox/%s.txt", timestamp);
        FILE *output_file = fopen(filename, "w");
        if (!output_file) {
            printf("Unable to open the inbox file %s!\n", filename);
            return 1;
        }
        #define MAX_STRING_SIZE 2000
        char tokens_so_far[MAX_STRING_SIZE] = ""; // zero-initialized story buffer
        while (pos < config.seq_len) {
            transformer(token, pos, &config, &state, &weights);
            if (temperature == 0.0f) {
                next = argmax(state.logits, config.vocab_size);
            } else {
                for (int q = 0; q < config.vocab_size; q++) { state.logits[q] /= temperature; }
                softmax(state.logits, config.vocab_size);
                next = sample(state.logits, config.vocab_size);
            }
            // buffer the story text, skipping special tokens 0 and 1
            if (strlen(tokens_so_far) + strlen(vocab[next]) < MAX_STRING_SIZE) {
                if (next > 1) {
                    strcat(tokens_so_far, vocab[next]);
                }
            }
            // token 0 or 1 marks a story boundary: flush the buffer and start a new file
            if (next < 2) {
                fprintf(output_file, "%s", tokens_so_far);
                memset(tokens_so_far, 0, sizeof(tokens_so_far));
                time(&current_time);
                srand((unsigned int)current_time);
                pos = 0;
                fclose(output_file);
                break;
            }
            token = next;
            pos++;
        }
        long long end = time_in_ms();
        // note: the metric assumes a full seq_len generation; stories that end early overstate tok/s
        printf("achieved tok/s: %04.1f for %s \n", config.seq_len / (double)(end - start) * 1000, filename);
        fflush(stdout);
    }
    free_run_state(&state);
    free_weights(&weights);
    for (int i = 0; i < config.vocab_size; i++) { free(vocab[i]); }
    free(vocab);
    return 0;
}
@twobob commented Jul 28, 2023:

Once upon a time, there was a long cat. The cat had a smooth home. In the home, there was a long string.
One day, the cat went outside to play. The cat chased the ball and had fun. When the cat came back home, it was sad.
The cat looked and looked, and then it found the string. The cat played with the string all day. The cat was very happy. And the long string was its best toy.

Best output: 299.8 tok/s using the 44M-parameter model.

Around a 50x speedup over the vanilla build.

The complete results of the run are listed below for transparency.
run_factored ../out/model44m.bin 1 23457
achieved tok/s: 174.8 for inbox/20230728035328.txt
achieved tok/s: 103.0 for inbox/20230728035334.txt
achieved tok/s: 205.8 for inbox/20230728035344.txt
achieved tok/s: 165.0 for inbox/20230728035349.txt
achieved tok/s: 114.6 for inbox/20230728035355.txt
achieved tok/s: 111.3 for inbox/20230728035404.txt
achieved tok/s: 180.1 for inbox/20230728035413.txt
achieved tok/s: 154.6 for inbox/20230728035419.txt
achieved tok/s: 152.0 for inbox/20230728035425.txt
achieved tok/s: 187.7 for inbox/20230728035432.txt
achieved tok/s: 135.8 for inbox/20230728035437.txt
achieved tok/s: 156.2 for inbox/20230728035445.txt
achieved tok/s: 167.8 for inbox/20230728035452.txt
achieved tok/s: 165.2 for inbox/20230728035458.txt
achieved tok/s: 124.8 for inbox/20230728035504.txt
achieved tok/s: 151.6 for inbox/20230728035512.txt
achieved tok/s: 126.9 for inbox/20230728035519.txt
achieved tok/s: 164.3 for inbox/20230728035527.txt
achieved tok/s: 127.1 for inbox/20230728035533.txt
achieved tok/s: 195.1 for inbox/20230728035541.txt
achieved tok/s: 164.5 for inbox/20230728035547.txt
achieved tok/s: 158.6 for inbox/20230728035553.txt
achieved tok/s: 169.2 for inbox/20230728035559.txt
achieved tok/s: 157.7 for inbox/20230728035605.txt
achieved tok/s: 174.6 for inbox/20230728035612.txt
achieved tok/s: 134.0 for inbox/20230728035618.txt
achieved tok/s: 137.9 for inbox/20230728035625.txt
achieved tok/s: 246.1 for inbox/20230728035633.txt
achieved tok/s: 123.3 for inbox/20230728035637.txt
achieved tok/s: 146.2 for inbox/20230728035645.txt
achieved tok/s: 116.7 for inbox/20230728035652.txt
achieved tok/s: 168.7 for inbox/20230728035701.txt
achieved tok/s: 299.8 for inbox/20230728035707.txt
achieved tok/s: 153.9 for inbox/20230728035711.txt
achieved tok/s: 192.6 for inbox/20230728035717.txt
achieved tok/s: 150.2 for inbox/20230728035723.txt
achieved tok/s: 196.2 for inbox/20230728035729.txt
achieved tok/s: 125.7 for inbox/20230728035735.txt
achieved tok/s: 173.0 for inbox/20230728035743.txt
achieved tok/s: 168.9 for inbox/20230728035749.txt
achieved tok/s: 128.3 for inbox/20230728035755.txt
achieved tok/s: 160.4 for inbox/20230728035803.txt
achieved tok/s: 131.6 for inbox/20230728035809.txt
achieved tok/s: 154.8 for inbox/20230728035817.txt
achieved tok/s: 163.5 for inbox/20230728035824.txt
achieved tok/s: 129.7 for inbox/20230728035830.txt
achieved tok/s: 147.4 for inbox/20230728035838.txt
achieved tok/s: 144.1 for inbox/20230728035845.txt
achieved tok/s: 180.5 for inbox/20230728035852.txt
achieved tok/s: 85.5 for inbox/20230728035857.txt
achieved tok/s: 174.0 for inbox/20230728035909.txt
achieved tok/s: 131.2 for inbox/20230728035915.txt
achieved tok/s: 229.4 for inbox/20230728035923.txt
achieved tok/s: 230.1 for inbox/20230728035928.txt
achieved tok/s: 184.7 for inbox/20230728035932.txt
achieved tok/s: 122.5 for inbox/20230728035938.txt
achieved tok/s: 154.7 for inbox/20230728035946.txt
achieved tok/s: 165.0 for inbox/20230728035953.txt
achieved tok/s: 132.4 for inbox/20230728035959.txt
achieved tok/s: 141.4 for inbox/20230728040007.txt
achieved tok/s: 117.3 for inbox/20230728040014.txt
achieved tok/s: 207.2 for inbox/20230728040023.txt
achieved tok/s: 151.8 for inbox/20230728040027.txt
achieved tok/s: 126.8 for inbox/20230728040034.txt
achieved tok/s: 180.3 for inbox/20230728040042.txt
achieved tok/s: 117.2 for inbox/20230728040048.txt
achieved tok/s: 98.2 for inbox/20230728040057.txt
achieved tok/s: 104.3 for inbox/20230728040107.txt
achieved tok/s: 135.4 for inbox/20230728040117.txt
achieved tok/s: 174.3 for inbox/20230728040125.txt
achieved tok/s: 129.3 for inbox/20230728040130.txt
achieved tok/s: 217.7 for inbox/20230728040138.txt
achieved tok/s: 184.7 for inbox/20230728040143.txt
achieved tok/s: 197.6 for inbox/20230728040149.txt
achieved tok/s: 157.3 for inbox/20230728040154.txt
achieved tok/s: 215.4 for inbox/20230728040200.txt
achieved tok/s: 160.6 for inbox/20230728040205.txt
achieved tok/s: 147.6 for inbox/20230728040211.txt
achieved tok/s: 169.9 for inbox/20230728040218.txt
achieved tok/s: 143.2 for inbox/20230728040224.txt
achieved tok/s: 143.1 for inbox/20230728040232.txt
achieved tok/s: 125.0 for inbox/20230728040239.txt
achieved tok/s: 128.1 for inbox/20230728040247.txt
achieved tok/s: 162.4 for inbox/20230728040255.txt
achieved tok/s: 146.3 for inbox/20230728040301.txt

@twobob commented Jul 28, 2023:

Once upon a time, there was a jolly clown named Bobo. Bobo loved to make people laugh and smile. He wore a big red nose and had colorful feathers.
One day, Bobo went to a party where a group of children were playing. They were having fun and making a lot of noise. But then, Bobo got a call on his phone. It was time to go

328.9 tok/s is around a 55x speedup over the vanilla build on the test box. The average is circa a 20x speedup.

run_factored ../out/model44m.bin 1 23458
achieved tok/s: 197.2 for inbox/20230728042127.txt
achieved tok/s: 171.5 for inbox/20230728042132.txt
achieved tok/s: 168.0 for inbox/20230728042138.txt
achieved tok/s: 120.0 for inbox/20230728042144.txt
achieved tok/s: 138.0 for inbox/20230728042153.txt
achieved tok/s: 117.9 for inbox/20230728042200.txt
achieved tok/s: 182.3 for inbox/20230728042209.txt
achieved tok/s: 279.1 for inbox/20230728042215.txt
achieved tok/s: 159.9 for inbox/20230728042218.txt
achieved tok/s: 95.0 for inbox/20230728042225.txt
achieved tok/s: 180.4 for inbox/20230728042235.txt
achieved tok/s: 328.9 for inbox/20230728042241.txt
achieved tok/s: 168.9 for inbox/20230728042244.txt
achieved tok/s: 150.6 for inbox/20230728042250.txt
achieved tok/s: 155.7 for inbox/20230728042257.txt
achieved tok/s: 158.4 for inbox/20230728042304.txt
achieved tok/s: 164.8 for inbox/20230728042310.txt
achieved tok/s: 150.4 for inbox/20230728042316.txt
achieved tok/s: 169.4 for inbox/20230728042323.txt
achieved tok/s: 125.4 for inbox/20230728042329.txt
achieved tok/s: 108.1 for inbox/20230728042337.txt
achieved tok/s: 158.4 for inbox/20230728042347.txt
achieved tok/s: 88.9 for inbox/20230728042353.txt
achieved tok/s: 105.8 for inbox/20230728042405.txt
achieved tok/s: 149.4 for inbox/20230728042415.txt
achieved tok/s: 155.3 for inbox/20230728042421.txt
achieved tok/s: 104.3 for inbox/20230728042428.txt
achieved tok/s: 187.8 for inbox/20230728042438.txt
achieved tok/s: 86.8 for inbox/20230728042443.txt
achieved tok/s: 226.7 for inbox/20230728042455.txt
achieved tok/s: 160.0 for inbox/20230728042500.txt
achieved tok/s: 137.3 for inbox/20230728042506.txt
achieved tok/s: 153.3 for inbox/20230728042514.txt
achieved tok/s: 176.7 for inbox/20230728042520.txt
achieved tok/s: 207.6 for inbox/20230728042526.txt
achieved tok/s: 159.9 for inbox/20230728042531.txt
achieved tok/s: 160.3 for inbox/20230728042537.txt
achieved tok/s: 137.7 for inbox/20230728042544.txt
achieved tok/s: 176.8 for inbox/20230728042551.txt
achieved tok/s: 153.2 for inbox/20230728042600.txt
achieved tok/s: 152.9 for inbox/20230728042607.txt
achieved tok/s: 179.4 for inbox/20230728042614.txt
