Skip to content

Instantly share code, notes, and snippets.

@MurageKibicho
Created March 27, 2025 19:29
Show Gist options
  • Save MurageKibicho/540c3dd87541b80cfb6f00c266ec9655 to your computer and use it in GitHub Desktop.
Save MurageKibicho/540c3dd87541b80cfb6f00c266ec9655 to your computer and use it in GitHub Desktop.
Got to attention, just before creating function at 6.14.13
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include <sys/mman.h>
#include <stdbool.h>
#include "cJSON.h"
/* Number of records in the token table read by LoadTokenDecoder; token ids are asserted to be below this. */
#define VOCABULARY_SIZE 50257
/* Row width of the unembedding activation buffer (same value as VOCABULARY_SIZE). */
#define tf_d_vocab 50257
/* Number of rows (positions) reserved in every activation buffer. */
#define tf_d_seq 1024
/* Floats per position in the embedding, layer-norm and residual buffers. */
#define tf_d_model 768
/* Floats per attention-head slice of a q/k/v row; tf_n_heads * tf_d_k equals tf_d_model. */
#define tf_d_k 64
/* Number of attention heads (leading dimension of the softmax buffer). */
#define tf_n_heads 12
/* Not referenced by the code in this file: the per-layer arrays are declared with a literal 12. */
#define tf_n_layers 12
/* Scale applied to every query-key dot product; 0.125 is 1/sqrt(64), i.e. 1/sqrt(tf_d_k). */
#define tf_rsqrt_d_k 0.125f
/* Build-and-run command used by the author: */
//clear && gcc 25_GPT2.c cJSON.c -lm -o m.o && ./m.o
/*
 * Element types recognised for safetensors data. The numeric values index
 * dataTypes_String_Safetensors; SAFETENSORS_NUM_DTYPES is the entry count,
 * not a real dtype.
 */
enum SAFETENSORS_DTYPES
{
    SAFETENSORS_F64 = 0,
    SAFETENSORS_F32 = 1,
    SAFETENSORS_F16 = 2,
    SAFETENSORS_BF16 = 3,
    SAFETENSORS_I64 = 4,
    SAFETENSORS_I32 = 5,
    SAFETENSORS_I16 = 6,
    SAFETENSORS_I8 = 7,
    SAFETENSORS_U8 = 8,
    SAFETENSORS_BOOL = 9,
    SAFETENSORS_NUM_DTYPES = 10
};

/* Textual name of each dtype, indexed by its enum SAFETENSORS_DTYPES value. */
char dataTypes_String_Safetensors[SAFETENSORS_NUM_DTYPES][20] =
{
    [SAFETENSORS_F64] = "F64",
    [SAFETENSORS_F32] = "F32",
    [SAFETENSORS_F16] = "F16",
    [SAFETENSORS_BF16] = "BF16",
    [SAFETENSORS_I64] = "I64",
    [SAFETENSORS_I32] = "I32",
    [SAFETENSORS_I16] = "I16",
    [SAFETENSORS_I8] = "I8",
    [SAFETENSORS_U8] = "U8",
    [SAFETENSORS_BOOL] = "BOOL"
};

/*
 * Returns the size in bytes of one element of the given dtype.
 *
 * dtype: one of the enum SAFETENSORS_DTYPES values.
 * Returns 0 for any value that is not a real dtype (including
 * SAFETENSORS_NUM_DTYPES and negative numbers).
 */
int GetSafetensorSize(int dtype)
{
    static const int bytesPerElement[SAFETENSORS_NUM_DTYPES] =
    {
        [SAFETENSORS_F64] = 8,
        [SAFETENSORS_F32] = 4,
        [SAFETENSORS_F16] = 2,
        [SAFETENSORS_BF16] = 2,
        [SAFETENSORS_I64] = 8,
        [SAFETENSORS_I32] = 4,
        [SAFETENSORS_I16] = 2,
        [SAFETENSORS_I8] = 1,
        [SAFETENSORS_U8] = 1,
        [SAFETENSORS_BOOL] = 1
    };

    if(dtype < 0 || dtype >= SAFETENSORS_NUM_DTYPES)
    {
        return 0;
    }
    return bytesPerElement[dtype];
}
/*
 * Handle types used throughout the file. Each one is a POINTER to the named
 * struct, so a variable of type Token/Decoder/... is dereferenced with `->`
 * and must be allocated and freed explicitly.
 */
typedef struct token_struct *Token;
typedef struct decoder_struct *Decoder;
typedef struct model_parameter_struct *ModelParameters;
typedef struct model_activation_struct *ModelActivations;
/*
 * One vocabulary entry, filled by LoadTokenDecoder from an 8-byte record of
 * the vocabulary file.
 */
struct token_struct
{
/* Decoded from the first half of the record; the debug print in LoadTokenDecoder uses it as a byte offset into decoder_struct.rawData. */
uint32_t offset;
/* Decoded from the second half of the record; the same debug print uses it as the token's length in bytes. */
uint32_t size;
};
/*
 * Token decoder built by LoadTokenDecoder. main() frees rawData, every
 * tokens[i], the tokens array and finally the struct itself.
 */
struct decoder_struct
{
/* Array of VOCABULARY_SIZE separately allocated token entries, indexed by token id. */
Token *tokens;
/* Copy of everything in the vocabulary file after the VOCABULARY_SIZE * 8 byte record table. */
char *rawData;
/* Number of bytes stored in rawData. */
size_t rawDataLength;
};
/*
 * Views onto the tensors of a loaded safetensors file. GetModelParameters
 * points every float* into the caller's file buffer (nothing is copied), so
 * the buffer must outlive this struct. Every *Length field is a count of
 * 4-byte elements, not a byte count.
 */
struct model_parameter_struct
{
/* Scratch buffers for the forward pass; allocated by GetModelParameters and freed in main(). */
ModelActivations activations;
/* Tensor "wte.weight"; main() reads it as rows of tf_d_model floats selected by token id. */
struct{float *weight;size_t length;}wte;
/* Tensor "wpe.weight"; main() reads it as rows of tf_d_model floats selected by position. */
struct{float *weight;size_t length;}wpe;
/* Per-layer tensors, looked up as "h.<layer>.<member path>.weight" / ".bias". */
struct
{
struct{float *bias; float *weight;size_t biasLength;size_t weightLength;}ln_1;
struct
{
/* main() treats c_attn.weight as tf_d_model rows of 3 * tf_d_model floats. */
struct{float *bias; float *weight;size_t biasLength;size_t weightLength;}c_attn;
struct{float *bias; float *weight;size_t biasLength;size_t weightLength;}c_proj;
}attn;
struct{float *bias; float *weight;size_t biasLength;size_t weightLength;}ln_2;
struct
{
struct{float *bias; float *weight;size_t biasLength;size_t weightLength;}c_fc;
struct{float *bias; float *weight;size_t biasLength;size_t weightLength;}c_proj;
}mlp;
}h[12];
/* Tensors "ln_f.weight" and "ln_f.bias". */
struct{float *bias; float *weight;size_t biasLength;size_t weightLength;}ln_f;
};
/*
 * Output buffers for every stage of the forward pass, sized for tf_d_seq
 * positions. The struct is very large (the per-head tf_d_seq x tf_d_seq
 * buffers alone are 12 * 1024 * 1024 floats each, per layer), so it is heap
 * allocated. Only the buffers noted below are written by the code in this
 * file; the rest are declared but not yet used here.
 */
struct model_activation_struct
{
/* Written by main(): token-embedding row plus position-embedding row, per position. */
struct{float out[tf_d_seq][tf_d_model];}embedding;
struct
{
/* main() writes ln_1.out (layer 0 only); r_std and mean are not written in this file. */
struct{float r_std[tf_d_seq];float mean[tf_d_seq];float out[tf_d_seq][tf_d_model];}ln_1;
struct
{
/* Written by main(): each row holds 3 * tf_d_model floats, read back as q, k and v thirds. */
struct{float out[tf_d_seq][3 * tf_d_model];}c_attn;
/* Written by main(): exp(score - max), then normalised, for key index <= query index. */
struct{float out[tf_n_heads][tf_d_seq][tf_d_seq];}softmax;
/* Written by main(): softmax-weighted sum of v slices, one tf_d_k slice per head. */
struct{float out[tf_d_seq][tf_d_model];}z;
/* Written by main(): scaled query-key dot products, indexed [head][query][key]. */
struct{float out[12][tf_d_seq][tf_d_seq];}attn;
struct{float out[tf_d_seq][tf_d_model];}c_proj;
}attn;
struct{float out[tf_d_seq][tf_d_model];}res_1;
struct{float r_std[tf_d_seq];float mean[tf_d_seq];float out[tf_d_seq][tf_d_model];}ln_2;
struct
{
struct{float out[tf_d_seq][4 * tf_d_model];} c_fc;
struct{float out[tf_d_seq][4 * tf_d_model];}gelu;
struct{float out[tf_d_seq][tf_d_model];}c_proj;
}mlp;
struct{float out[tf_d_seq][tf_d_model];}res_2;
}h[12];
struct{float r_std[tf_d_seq];float mean[tf_d_seq];float out[tf_d_seq][tf_d_model];}ln_f;
struct{float out[tf_d_seq][tf_d_vocab];} unembedding;
};
/*
 * Returns the size in bytes of the file at fileName.
 *
 * The size is found by seeking to the end of the file and reading the stream
 * position. Failure to open, seek or query the position trips an assert
 * (like the rest of this file, the checks are compiled out under NDEBUG).
 */
size_t GetFileSize(char *fileName)
{
    FILE *fp = fopen(fileName, "rb");
    assert(fp != NULL);

    int seekResult = fseek(fp, 0L, SEEK_END);
    assert(seekResult == 0);
    (void)seekResult;

    /* ftell reports failure as -1L; converting that to size_t unchecked would yield a huge bogus size. */
    long endPosition = ftell(fp);
    assert(endPosition >= 0);

    fclose(fp);
    return (size_t)endPosition;
}
Decoder LoadTokenDecoder(char *vocabularyFileName)
{
/*Open the vocabulary file*/
size_t inputLength = GetFileSize(vocabularyFileName);
FILE *fp = fopen(vocabularyFileName, "rb");assert(fp != NULL);
int fileNumber = fileno(fp);
unsigned char *input = mmap(NULL,inputLength, PROT_READ|PROT_WRITE, MAP_PRIVATE, fileNumber, 0);assert(input != NULL);
Decoder tokenDecoder = malloc(sizeof(struct decoder_struct));
tokenDecoder->tokens = malloc(VOCABULARY_SIZE * sizeof(Token));
tokenDecoder->rawDataLength = inputLength - VOCABULARY_SIZE*8;
tokenDecoder->rawData= malloc(tokenDecoder->rawDataLength * sizeof(unsigned char));
memcpy(tokenDecoder->rawData, input + (VOCABULARY_SIZE * 8), tokenDecoder->rawDataLength * sizeof(unsigned char));
uint32_t tokenValue = 0;
uint32_t tokenOffset = 0;
uint32_t tokenSize = 0;
for(size_t i = 0, k = 0; i < VOCABULARY_SIZE*8; i += 8, k+=1)
{
tokenDecoder->tokens[k] = malloc(sizeof(struct token_struct));
tokenDecoder->tokens[k]->offset = 0;
tokenDecoder->tokens[k]->size = 0;
for(size_t j = i + 3; j-- > i;)
{
tokenDecoder->tokens[k]->offset <<= 8;
tokenDecoder->tokens[k]->offset += input[j];
//printf("%u\n", input[j]);
}
for(int j = i+7; j >= i + 4; j--)
{
tokenSize <<= 8;
tokenDecoder->tokens[k]->size += input[j];
//printf("%u\n", input[j]);
}
//printf("(%ld %u %u : %.*s)\n", k, tokenDecoder->tokens[k]->offset, tokenDecoder->tokens[k]->size,tokenDecoder->tokens[k]->size, tokenDecoder->rawData + tokenDecoder->tokens[k]->offset);
}
assert(munmap(input, inputLength) != -1);
fclose(fp);
return tokenDecoder;
}
/*
 * Reads a file of little-endian 16-bit token ids into a heap array.
 *
 * tokenCount:   if not NULL, receives the number of tokens returned.
 * fileName:     path of the token file; a trailing odd byte is ignored.
 * tokenDecoder: not needed to read the ids; kept in the signature for callers.
 * Returns a calloc'ed array that the caller frees. Every id is asserted to
 * be below VOCABULARY_SIZE. An empty file yields a valid empty array.
 */
uint16_t *GetTokenizedData(size_t *tokenCount, char *fileName, Decoder tokenDecoder)
{
    (void)tokenDecoder;

    size_t inputLength = GetFileSize(fileName);
    /* Only whole 2-byte tokens are read, so an odd-length file cannot cause an out-of-bounds access. */
    size_t count = inputLength / 2;

    uint16_t *tokenizedData = calloc(count > 0 ? count : 1, sizeof(uint16_t));
    assert(tokenizedData != NULL);

    if(count > 0)
    {
        FILE *fp = fopen(fileName, "rb");
        assert(fp != NULL);
        unsigned char *input = mmap(NULL, inputLength, PROT_READ|PROT_WRITE, MAP_PRIVATE, fileno(fp), 0);
        /* mmap signals failure with MAP_FAILED, never with NULL. */
        assert(input != MAP_FAILED);

        for(size_t k = 0; k < count; k++)
        {
            uint16_t token = (uint16_t)(input[2 * k] | (input[2 * k + 1] << 8));
            assert(token < VOCABULARY_SIZE);
            tokenizedData[k] = token;
        }

        /* Keep the call outside assert() so the mapping is still released when NDEBUG is defined. */
        int unmapResult = munmap(input, inputLength);
        assert(unmapResult != -1);
        (void)unmapResult;
        fclose(fp);
    }

    if(tokenCount != NULL)
    {
        *tokenCount = count;
    }
    return tokenizedData;
}
/*
 * Finds the entry named tensorName among the children of tensorData and
 * reads its "data_offsets" array.
 *
 * Only children that carry "dtype", "data_offsets" and "shape" members are
 * considered. For the first such child whose key equals tensorName, the
 * first offset is stored in *tensorStart and the second in *tensorEnd.
 *
 * Returns -1 when no matching entry exists (the outputs are left untouched);
 * otherwise the index of the last offset visited, i.e. 1 for the expected
 * two-element array.
 */
int GetTensorOffset(cJSON *tensorData, char *tensorName, size_t *tensorStart, size_t *tensorEnd)
{
    int lastOffsetIndex = -1;
    cJSON *entry = NULL;

    cJSON_ArrayForEach(entry, tensorData)
    {
        cJSON *dtype = cJSON_GetObjectItem(entry, "dtype");
        cJSON *offsets = cJSON_GetObjectItem(entry, "data_offsets");
        cJSON *shape = cJSON_GetObjectItem(entry, "shape");

        /* Skip children that are not tensor descriptions. */
        if(!dtype || !offsets || !shape)
        {
            continue;
        }
        if(strcmp(tensorName, entry->string) != 0)
        {
            continue;
        }

        cJSON *value = NULL;
        cJSON_ArrayForEach(value, offsets)
        {
            lastOffsetIndex++;
            if(lastOffsetIndex == 0)
            {
                *tensorStart = (size_t) value->valuedouble;
            }
            else if(lastOffsetIndex == 1)
            {
                *tensorEnd = (size_t) value->valuedouble;
            }
        }
        /* Only the first matching entry is used. */
        break;
    }
    return lastOffsetIndex;
}
unsigned char *LoadSafeTensorData(char *fileName, size_t *fileSizeHolder)
{
size_t fileSize = GetFileSize(fileName);
FILE *fp = fopen(fileName, "rb");assert(fp != NULL);
int fileNumber = fileno(fp);
unsigned char *fileData = mmap(NULL,fileSize, PROT_READ|PROT_WRITE, MAP_PRIVATE, fileNumber, 0);assert(fileData != NULL);
fclose(fp);
*fileSizeHolder = fileSize;
return fileData;
}
/*
 * Looks up tensorName in the parsed header and points *data at its first
 * element inside weightData, storing the element count in *length.
 * Asserts that the tensor exists, is non-empty, is a whole number of 4-byte
 * elements and lies inside the weightDataLength bytes that follow the header.
 */
static void BindTensor(cJSON *tensorData, unsigned char *weightData, size_t weightDataLength, char *tensorName, float **data, size_t *length)
{
    size_t tensorOffsetStart = 0;
    size_t tensorOffsetEnd = 0;
    int foundTensor = GetTensorOffset(tensorData, tensorName, &tensorOffsetStart, &tensorOffsetEnd);
    assert(foundTensor == 1);
    assert(tensorOffsetEnd > tensorOffsetStart);
    assert((tensorOffsetEnd - tensorOffsetStart) % 4 == 0);
    assert(tensorOffsetEnd <= weightDataLength);
    (void)foundTensor;
    (void)weightDataLength;
    *length = (tensorOffsetEnd - tensorOffsetStart) / 4;
    *data = (float *) (weightData + tensorOffsetStart);
}

/* Same as BindTensor, for the per-layer tensor named "h.<layer>.<suffix>". */
static void BindLayerTensor(cJSON *tensorData, unsigned char *weightData, size_t weightDataLength, int layer, const char *suffix, float **data, size_t *length)
{
    char tensorName[64] = {0};
    int written = snprintf(tensorName, sizeof(tensorName), "h.%d.%s", layer, suffix);
    assert(written > 0 && (size_t)written < sizeof(tensorName));
    (void)written;
    BindTensor(tensorData, weightData, weightDataLength, tensorName, data, length);
}

/*
 * Parses the header of an in-memory safetensors file and returns a
 * parameter struct whose pointers refer to the tensors inside fileData.
 *
 * fileSize: number of valid bytes at fileData.
 * fileData: the file contents: an 8-byte little-endian header length, a JSON
 *           header of that length, then the tensor bytes. No tensor data is
 *           copied, so fileData must stay valid while the result is in use.
 * Returns a heap-allocated struct with a freshly allocated, uninitialised
 * activations buffer; the caller frees both.
 * A malformed header, a missing or out-of-range tensor, or an allocation
 * failure trips an assert.
 */
ModelParameters GetModelParameters(size_t fileSize, unsigned char *fileData)
{
    assert(fileData != NULL);
    /* Need the 8-byte length prefix plus at least the opening brace of the header. */
    assert(fileSize > 8);
    /*Read HeaderLength(1st 8 bytes in reverse)*/
    size_t headerLength = 0;
    for(int i = 7; i >= 0; i--)
    {
        headerLength <<= 8;
        headerLength += fileData[i];
    }
    assert(headerLength <= fileSize - 8);
    assert(fileData[8] == '{');
    cJSON *tensorData = cJSON_ParseWithLength((const char *)(fileData + 8), headerLength);
    assert(tensorData != NULL);
    //Subtract 1 to remove metadata key
    int tensorParameterSize = cJSON_GetArraySize(tensorData)-1;
    assert(tensorParameterSize > 0);
    (void)tensorParameterSize;

    unsigned char *weightData = (fileData+8+headerLength);
    size_t weightDataLength = fileSize - 8 - headerLength;

    ModelParameters parameters = malloc(sizeof(struct model_parameter_struct));
    assert(parameters != NULL);
    /* The activation struct is very large, so this allocation can realistically fail. */
    parameters->activations = malloc(sizeof(struct model_activation_struct));
    assert(parameters->activations != NULL);

    BindTensor(tensorData, weightData, weightDataLength, "wte.weight", &parameters->wte.weight, &parameters->wte.length);
    BindTensor(tensorData, weightData, weightDataLength, "wpe.weight", &parameters->wpe.weight, &parameters->wpe.length);
    BindTensor(tensorData, weightData, weightDataLength, "ln_f.bias", &parameters->ln_f.bias, &parameters->ln_f.biasLength);
    BindTensor(tensorData, weightData, weightDataLength, "ln_f.weight", &parameters->ln_f.weight, &parameters->ln_f.weightLength);

    const int layerCount = (int)(sizeof(parameters->h) / sizeof(parameters->h[0]));
    for(int i = 0; i < layerCount; i++)
    {
        BindLayerTensor(tensorData, weightData, weightDataLength, i, "ln_1.weight", &parameters->h[i].ln_1.weight, &parameters->h[i].ln_1.weightLength);
        BindLayerTensor(tensorData, weightData, weightDataLength, i, "ln_1.bias", &parameters->h[i].ln_1.bias, &parameters->h[i].ln_1.biasLength);
        BindLayerTensor(tensorData, weightData, weightDataLength, i, "ln_2.weight", &parameters->h[i].ln_2.weight, &parameters->h[i].ln_2.weightLength);
        BindLayerTensor(tensorData, weightData, weightDataLength, i, "ln_2.bias", &parameters->h[i].ln_2.bias, &parameters->h[i].ln_2.biasLength);
        BindLayerTensor(tensorData, weightData, weightDataLength, i, "attn.c_attn.weight", &parameters->h[i].attn.c_attn.weight, &parameters->h[i].attn.c_attn.weightLength);
        BindLayerTensor(tensorData, weightData, weightDataLength, i, "attn.c_attn.bias", &parameters->h[i].attn.c_attn.bias, &parameters->h[i].attn.c_attn.biasLength);
        BindLayerTensor(tensorData, weightData, weightDataLength, i, "attn.c_proj.weight", &parameters->h[i].attn.c_proj.weight, &parameters->h[i].attn.c_proj.weightLength);
        BindLayerTensor(tensorData, weightData, weightDataLength, i, "attn.c_proj.bias", &parameters->h[i].attn.c_proj.bias, &parameters->h[i].attn.c_proj.biasLength);
        BindLayerTensor(tensorData, weightData, weightDataLength, i, "mlp.c_fc.weight", &parameters->h[i].mlp.c_fc.weight, &parameters->h[i].mlp.c_fc.weightLength);
        BindLayerTensor(tensorData, weightData, weightDataLength, i, "mlp.c_fc.bias", &parameters->h[i].mlp.c_fc.bias, &parameters->h[i].mlp.c_fc.biasLength);
        BindLayerTensor(tensorData, weightData, weightDataLength, i, "mlp.c_proj.weight", &parameters->h[i].mlp.c_proj.weight, &parameters->h[i].mlp.c_proj.weightLength);
        BindLayerTensor(tensorData, weightData, weightDataLength, i, "mlp.c_proj.bias", &parameters->h[i].mlp.c_proj.bias, &parameters->h[i].mlp.c_proj.biasLength);
    }

    /* The parsed header is no longer needed: every pointer refers to fileData, not to the JSON tree. */
    cJSON_Delete(tensorData);
    return parameters;
}
int main()
{
char *vocabularyFileName = "../Datasets/GPT2Tensors/enc";
char *textFileName = "../Datasets/GPT2Tensors/data";
char *safeTensorFileName = "../Datasets/GPT2Tensors/model.safetensors";
size_t tokenCount = 0;
Decoder tokenDecoder = LoadTokenDecoder(vocabularyFileName);
uint16_t *tokenizedData = GetTokenizedData(&tokenCount, textFileName, tokenDecoder);
size_t safeTensorDataSize = 0;
unsigned char *safeTensorData = LoadSafeTensorData(safeTensorFileName, &safeTensorDataSize);
ModelParameters parameters = GetModelParameters(safeTensorDataSize, safeTensorData);
/*Forward Pass*/
/*Embedding*/
size_t inputSize = 64;
for(size_t i = 0; i < inputSize; i++)
{
uint16_t token = tokenizedData[i];
float *wteRow = parameters->wte.weight + token * tf_d_model;
float *wpeRow = parameters->wpe.weight + i * tf_d_model;
float *output = (float *)parameters->activations->embedding.out + i * tf_d_model;
float *outputEnd = output + tf_d_model;
for(; output!=outputEnd; wteRow++,wpeRow++,output++)
{
*output = *wteRow + *wpeRow;
}
}
double total = 0.0f;
for(size_t i = 0; i < inputSize * tf_d_model; i++)
{
total += (double) ((float *) parameters->activations->embedding.out)[i];
}
printf("Test embedding : %f\n",total);
/*Layer norm*/
int layer_i = 0;
for(size_t i = 0; i < inputSize; i++)
{
float *input = (float *) parameters->activations->embedding.out + i * tf_d_model;
float *inputEnd = input + tf_d_model;
float *inputReset = input;
float mean = 0.0f;
for(; input != inputEnd; input++)
{
mean += *input;
}
mean /= tf_d_model;
float total_diff_sq = 0.0f;
for(input = inputReset; input != inputEnd; input++)
{
float diff = *input - mean;
total_diff_sq += diff * diff;
}
float r_stddev = 1.0f / sqrtf(total_diff_sq / tf_d_model);
float *output = (float *) parameters->activations->h[layer_i].ln_1.out + i * tf_d_model;
float *weight = (float *) parameters->h[layer_i].ln_1.weight;
float *bias = (float *) parameters->h[layer_i].ln_1.bias;
for(input = inputReset; input != inputEnd; input++, weight++, bias++, output++)
{
float in_norm = (*input - mean) * r_stddev;
*output = in_norm * *weight + *bias;
}
}
total = 0.0f;
for(size_t i = 0; i < inputSize * tf_d_model; i++)
{
total += (double) ((float *) parameters->activations->h[layer_i].ln_1.out)[i];
}
printf("Test layer norm : %f\n",total);
for(size_t i = 0; i < inputSize; i++)
{
float *in = (float *) parameters->activations->h[layer_i].ln_1.out + i * tf_d_model;
float *weight = parameters->h[layer_i].attn.c_attn.weight;
float *weightEnd = weight + tf_d_model * 3 * tf_d_model;
float *bias = parameters->h[layer_i].attn.c_attn.bias;
float *out = (float *) parameters->activations->h[layer_i].attn.c_attn.out + i * 3 * tf_d_model;
float *outEnd = out + 3 * tf_d_model;
float *outReset = out;
memcpy(out, bias, 3 * tf_d_model * sizeof(float));
while(true)
{
*out += *weight * *in;
weight++;
out++;
if(out == outEnd)
{
out = outReset;
in++;
if(weight == weightEnd)
{
break;
}
}
}
}
total = 0.0f;
for(size_t i = 0; i < inputSize * 3 * tf_d_model; i++)
{
total += (double) ((float *) parameters->activations->h[layer_i].attn.c_attn.out)[i];
}
printf("Test Attention : %f %ld\n",total, parameters->h[layer_i].attn.c_attn.weightLength);
/*Heads*/
memset(parameters->activations->h[layer_i].attn.z.out, 0, sizeof(parameters->activations->h[layer_i].attn.z.out));
for(size_t head_i = 0; head_i < 12; head_i++)
{
for(size_t q_i = 0; q_i < inputSize; q_i++)
{
float softmax_max = -INFINITY;
for(size_t k_i = 0; k_i <= q_i; k_i++)
{
float *q = (float *)parameters->activations->h[layer_i].attn.c_attn.out + q_i * 3 * tf_d_model + head_i * tf_d_k;
float *q_end = q + tf_d_k;
float *k = (float *)parameters->activations->h[layer_i].attn.c_attn.out + k_i * 3 * tf_d_model + tf_d_model + head_i * tf_d_k;
float dot = 0.0f;
for(; q!= q_end; q++, k++)
{
dot += *q * *k;
}
dot *= tf_rsqrt_d_k;
parameters->activations->h[layer_i].attn.attn.out[head_i][q_i][k_i] = dot;
if(dot > softmax_max)
{
softmax_max = dot;
}
}
float softmax_sum = 0.0f;
for(size_t k_i = 0; k_i <= q_i; k_i++)
{
float e = parameters->activations->h[layer_i].attn.attn.out[head_i][q_i][k_i];
float softmax_exp_i = expf(e - softmax_max);
parameters->activations->h[layer_i].attn.softmax.out[head_i][q_i][k_i] = softmax_exp_i;
softmax_sum += softmax_exp_i;
}
float r_softmax_sum = 1.0f / softmax_sum;
for(size_t k_i = 0; k_i <= q_i; k_i++)
{
parameters->activations->h[layer_i].attn.softmax.out[head_i][q_i][k_i] *= r_softmax_sum;
}
for(size_t v_i = 0; v_i <= q_i; v_i++)
{
float *v = (float *)parameters->activations->h[layer_i].attn.c_attn.out + v_i * 3 * tf_d_model + 2 * tf_d_model + head_i * tf_d_k;
float *v_end = v + tf_d_k;
float *z = (float *)parameters->activations->h[layer_i].attn.z.out + q_i * tf_d_model + head_i * tf_d_k;
float factor = parameters->activations->h[layer_i].attn.softmax.out[head_i][q_i][v_i];
for(; v != v_end; v++, z++)
{
*z += *v * factor;
}
}
}
}
total = 0.0f;
for(size_t i = 0; i < inputSize * tf_d_model; i++)
{
total += (double) ((float *) parameters->activations->h[layer_i].attn.z.out)[i];
}
printf("Test Head : %f \n",total);
free(parameters->activations);
free(parameters);
assert(munmap(safeTensorData, safeTensorDataSize) != -1);
free(tokenizedData);
free(tokenDecoder->rawData);
for(size_t i = 0; i < VOCABULARY_SIZE; i++){free(tokenDecoder->tokens[i]);}
free(tokenDecoder->tokens);
free(tokenDecoder);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment