Skip to content

Instantly share code, notes, and snippets.

@MurageKibicho
Created March 27, 2025 13:08
Show Gist options
  • Save MurageKibicho/cd1463fbca2adc833bcdc300ce3698a3 to your computer and use it in GitHub Desktop.
Save MurageKibicho/cd1463fbca2adc833bcdc300ce3698a3 to your computer and use it in GitHub Desktop.
GPT in C before caching safetensors loading
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include "cJSON.h"
#define VOCABULARY_SIZE 50257
#define tf_d_vocab 50257
#define tf_d_seq 1024
#define tf_d_model 768
#define tf_d_k 64
#define tf_n_heads 12
#define tf_n_layers 12
#define tf_rsqrt_d_k 0.125f
//clear && gcc 25_GPT2.c cJSON.c -lm -o m.o && ./m.o
/* Element data types that can appear in a safetensors header.
 * Keep this order in sync with dataTypes_String_Safetensors and the
 * width table in GetSafetensorSize. */
enum SAFETENSORS_DTYPES
{
SAFETENSORS_F64 = 0,
SAFETENSORS_F32,
SAFETENSORS_F16,
SAFETENSORS_BF16,
SAFETENSORS_I64,
SAFETENSORS_I32,
SAFETENSORS_I16,
SAFETENSORS_I8,
SAFETENSORS_U8,
SAFETENSORS_BOOL,
SAFETENSORS_NUM_DTYPES
};
/* Header spelling of each dtype, indexed by the enum above. */
char dataTypes_String_Safetensors[SAFETENSORS_NUM_DTYPES][20] = {"F64","F32","F16","BF16","I64","I32","I16","I8","U8","BOOL"};
/* Return the width in bytes of one element of the given dtype,
 * or 0 for any value outside the enum range. */
int GetSafetensorSize(int dtype)
{
	static const int elementWidths[SAFETENSORS_NUM_DTYPES] =
	{
		8, /* F64  */
		4, /* F32  */
		2, /* F16  */
		2, /* BF16 */
		8, /* I64  */
		4, /* I32  */
		2, /* I16  */
		1, /* I8   */
		1, /* U8   */
		1  /* BOOL */
	};
	if(dtype < 0 || dtype >= SAFETENSORS_NUM_DTYPES)
	{
		return 0;
	}
	return elementWidths[dtype];
}
/* Handle typedefs: each name is a pointer to the matching struct.
 * (Note these typedefs hide the indirection — callers own and free them.) */
typedef struct token_struct *Token;
typedef struct decoder_struct *Decoder;
typedef struct model_parameter_struct *ModelParameters;
typedef struct model_activation_struct *ModelActivations;
/* One vocabulary entry: a byte range into Decoder.rawData. */
struct token_struct
{
uint32_t offset; /* start of this token's text within rawData */
uint32_t size;   /* length of this token's text in bytes */
};
/* Token-id -> text lookup table, built by LoadTokenDecoder from the
 * binary vocabulary ("enc") file. */
struct decoder_struct
{
Token *tokens;        /* VOCABULARY_SIZE entries, each individually malloc'd */
char *rawData;        /* concatenated token strings that tokens[] indexes into */
size_t rawDataLength; /* byte length of rawData */
};
/* GPT-2 weight pointers, nested to mirror the safetensors tensor names
 * (wte.weight, h.<i>.attn.c_attn.bias, ...). All members are float buffers;
 * nothing in this version of the code allocates or fills them yet. */
struct model_parameter_struct
{
struct{float *weight;}wte; /* token embedding table — presumably [tf_d_vocab][tf_d_model]; confirm against checkpoint shapes */
struct{float *weight;}wpe; /* positional embedding table */
struct
{
struct{float *bias; float *weight;}ln_1; /* layer norm before attention */
struct
{
struct{float *bias; float *weight;}c_attn; /* fused Q/K/V projection (hence 3*tf_d_model in activations) */
struct{float *bias; float *weight;}c_proj; /* attention output projection */
}attn;
struct{float *bias; float *weight;}ln_2; /* layer norm before the MLP */
struct
{
struct{float *bias; float *weight;}c_fc;   /* MLP up-projection (4*tf_d_model wide) */
struct{float *bias; float *weight;}c_proj; /* MLP down-projection */
}mlp;
}h[12]; /* one entry per transformer block; 12 matches tf_n_layers */
struct{float *bias; float *weight;}ln_f; /* final layer norm before unembedding */
};
/* Forward-pass activation buffers, statically sized for a full tf_d_seq
 * context. This struct is enormous (softmax alone is
 * tf_n_heads*tf_d_seq*tf_d_seq floats per layer; unembedding is
 * tf_d_seq*tf_d_vocab floats — on the order of 1 GB total), so it is
 * heap-allocated in main, never placed on the stack. No code in this
 * version fills it in yet. */
struct model_activation_struct
{
struct{float out[tf_d_seq][tf_d_model];}embedding; /* presumably wte+wpe sum — TODO confirm once the forward pass exists */
struct
{
/* r_std/mean are per-position layer-norm statistics (reciprocal std, mean) */
struct{float r_std[tf_d_seq];float mean[tf_d_seq];float out[tf_d_seq][tf_d_model];}ln_1;
struct
{
struct{float out[tf_d_seq][3 * tf_d_model];}c_attn;           /* packed Q,K,V */
struct{float out[tf_n_heads][tf_d_seq][tf_d_seq];}softmax;    /* attention weights per head */
struct{float out[tf_d_seq][tf_d_model];}z;                    /* attention-weighted values */
struct{float out[tf_d_seq][tf_d_model];}c_proj;
}attn;
struct{float out[tf_d_seq][tf_d_model];}res_1; /* residual after attention */
struct{float r_std[tf_d_seq];float mean[tf_d_seq];float out[tf_d_seq][tf_d_model];}ln_2;
struct
{
struct{float out[tf_d_seq][4 * tf_d_model];} c_fc;
struct{float out[tf_d_seq][4 * tf_d_model];}gelu;
struct{float out[tf_d_seq][tf_d_model];}c_proj;
}mlp;
struct{float out[tf_d_seq][tf_d_model];}res_2; /* residual after MLP */
}h[12]; /* one entry per transformer block; 12 matches tf_n_layers */
struct{float r_std[tf_d_seq];float mean[tf_d_seq];float out[tf_d_seq][tf_d_model];}ln_f;
struct{float out[tf_d_seq][tf_d_vocab];} unembedding; /* logits over the vocabulary */
};
/* Return the size in bytes of the file at fileName.
 * Aborts via assert if the file cannot be opened or its size cannot be
 * determined (the original ignored fseek/ftell failures; ftell returns -1
 * on error, which would have wrapped to a huge size_t). */
size_t GetFileSize(char *fileName)
{
	FILE *fp = fopen(fileName, "rb");assert(fp != NULL);
	assert(fseek(fp, 0L, SEEK_END) == 0);
	long currentFileSize = ftell(fp);
	assert(currentFileSize >= 0); /* ftell signals failure with -1 */
	fclose(fp);
	return (size_t)currentFileSize;
}
/* Build a token-id -> text decoder from the binary vocabulary file.
 *
 * Assumed file layout (inferred from the parsing — confirm against the
 * encoder that produced "enc"): VOCABULARY_SIZE records of 8 bytes each,
 * a 4-byte little-endian offset followed by a 4-byte little-endian size,
 * then the concatenated token strings those records index into.
 *
 * Caller owns the result: free each tokens[i], then tokens, rawData,
 * and the struct itself.
 *
 * Fixes vs. the original:
 *  - mmap failure is MAP_FAILED, not NULL, so the old assert could never fire;
 *  - the offset decode read only 3 of the 4 bytes;
 *  - the size decode shifted a scratch variable (tokenSize) while summing raw
 *    bytes into ->size, which only gave correct results for sizes < 256;
 *  - allocation results are now checked. */
Decoder LoadTokenDecoder(char *vocabularyFileName)
{
	size_t inputLength = GetFileSize(vocabularyFileName);
	assert(inputLength >= (size_t)VOCABULARY_SIZE * 8); /* must at least hold the record table */
	FILE *fp = fopen(vocabularyFileName, "rb");assert(fp != NULL);
	int fileNumber = fileno(fp);
	unsigned char *input = mmap(NULL, inputLength, PROT_READ, MAP_PRIVATE, fileNumber, 0);
	assert(input != MAP_FAILED);
	Decoder tokenDecoder = malloc(sizeof(struct decoder_struct));assert(tokenDecoder != NULL);
	tokenDecoder->tokens = malloc(VOCABULARY_SIZE * sizeof(Token));assert(tokenDecoder->tokens != NULL);
	tokenDecoder->rawDataLength = inputLength - VOCABULARY_SIZE*8;
	tokenDecoder->rawData = malloc(tokenDecoder->rawDataLength);assert(tokenDecoder->rawData != NULL);
	memcpy(tokenDecoder->rawData, input + (VOCABULARY_SIZE * 8), tokenDecoder->rawDataLength);
	for(size_t i = 0, k = 0; k < VOCABULARY_SIZE; i += 8, k += 1)
	{
		tokenDecoder->tokens[k] = malloc(sizeof(struct token_struct));assert(tokenDecoder->tokens[k] != NULL);
		uint32_t offset = 0;
		uint32_t size = 0;
		/* Decode both little-endian uint32 fields, high byte first into the accumulator. */
		for(int b = 3; b >= 0; b--){offset = (offset << 8) | input[i + b];}
		for(int b = 7; b >= 4; b--){size = (size << 8) | input[i + b];}
		tokenDecoder->tokens[k]->offset = offset;
		tokenDecoder->tokens[k]->size = size;
		//printf("(%ld %u %u : %.*s)\n", k, offset, size, size, tokenDecoder->rawData + offset);
	}
	assert(munmap(input, inputLength) != -1);
	fclose(fp);
	return tokenDecoder;
}
/* Read a file of 16-bit little-endian token ids into a freshly allocated
 * array. On return, *tokenCount holds the number of tokens read (the
 * original never wrote it, leaving the caller's count at 0 — fixed).
 * tokenDecoder is only needed for the commented-out debug dump.
 * Caller frees the returned buffer. */
uint16_t *GetTokenizedData(size_t *tokenCount, char *fileName, Decoder tokenDecoder)
{
	(void)tokenDecoder; /* used only by the debug printf below */
	size_t inputLength = GetFileSize(fileName);
	FILE *fp = fopen(fileName, "rb");assert(fp != NULL);
	int fileNumber = fileno(fp);
	unsigned char *input = mmap(NULL, inputLength, PROT_READ, MAP_PRIVATE, fileNumber, 0);
	assert(input != MAP_FAILED); /* mmap signals failure with MAP_FAILED, not NULL */
	uint16_t *tokenizedData = calloc(inputLength / 2, sizeof(uint16_t));assert(tokenizedData != NULL);
	/* i+1 < inputLength (not i < inputLength) so an odd-length file cannot
	 * read one byte past the mapping. */
	for(size_t i = 0, k = 0; i + 1 < inputLength; i += 2, k++)
	{
		/* Assemble one little-endian uint16 token id. */
		uint16_t token = (uint16_t)((input[i+1] << 8) | input[i]);
		assert(token < VOCABULARY_SIZE);
		tokenizedData[k] = token;
		//printf("(%ld %u %u : %.*s)", k, tokenDecoder->tokens[token]->offset, tokenDecoder->tokens[token]->size, tokenDecoder->tokens[token]->size, tokenDecoder->rawData + tokenDecoder->tokens[token]->offset);
	}
	*tokenCount = inputLength / 2;
	assert(munmap(input, inputLength) != -1);
	fclose(fp);
	return tokenizedData;
}
/* Parse the JSON header of a safetensors file.
 * File layout: 8-byte little-endian header length, then that many bytes of
 * JSON, then the raw tensor data. Returns the parsed header object; caller
 * must cJSON_Delete it.
 *
 * Fixes vs. the original: mmap failure is MAP_FAILED (not NULL), and the
 * old assert(headerLength >= 0) was always true for a size_t — we now check
 * the header actually fits inside the file instead. */
cJSON *LoadSafeTensorData(char *fileName)
{
	size_t fileSize = GetFileSize(fileName);
	assert(fileSize > 8); /* must hold the length prefix plus some JSON */
	FILE *fp = fopen(fileName, "rb");assert(fp != NULL);
	int fileNumber = fileno(fp);
	unsigned char *fileData = mmap(NULL, fileSize, PROT_READ, MAP_PRIVATE, fileNumber, 0);
	assert(fileData != MAP_FAILED);
	/* Header length: first 8 bytes, little-endian. */
	size_t headerLength = 0;
	for(int i = 7; i >= 0; i--){headerLength = (headerLength << 8) | fileData[i];}
	assert(headerLength > 0 && headerLength <= fileSize - 8);
	assert(fileData[8] == '{'); /* JSON header starts right after the prefix */
	cJSON *tensorData = cJSON_ParseWithLength((const char *)(fileData + 8), headerLength);
	assert(tensorData != NULL);
	/* Minus one for the "__metadata__" entry — presumably always present; confirm for other checkpoints. */
	int tensorParameterSize = cJSON_GetArraySize(tensorData) - 1;
	assert(tensorParameterSize > 0);
	assert(munmap(fileData, fileSize) != -1);
	fclose(fp);
	return tensorData;
}
/* Look up tensorName in the parsed safetensors header and return the start
 * of its data (the first element of its "data_offsets" pair), relative to
 * the data section. Returns 0 when the tensor is not found. */
size_t GetTensorOffset(cJSON *tensorData, char *tensorName)
{
	cJSON *entry = NULL;
	cJSON_ArrayForEach(entry, tensorData)
	{
		cJSON *dtype = cJSON_GetObjectItem(entry, "dtype");
		cJSON *dataOffsets = cJSON_GetObjectItem(entry, "data_offsets");
		cJSON *shape = cJSON_GetObjectItem(entry, "shape");
		/* Entries missing any of the three fields (e.g. "__metadata__") are not tensors. */
		if(dtype == NULL || dataOffsets == NULL || shape == NULL)
		{
			continue;
		}
		if(strcmp(tensorName, entry->string) != 0)
		{
			continue;
		}
		/* Found it: the first element of data_offsets is the begin offset. */
		cJSON *firstOffset = NULL;
		cJSON_ArrayForEach(firstOffset, dataOffsets)
		{
			return (size_t)firstOffset->valuedouble;
		}
		return 0; /* matching entry but empty data_offsets */
	}
	return 0; /* tensor not present in the header */
}
/* Driver: build the token decoder from the vocabulary file, tokenize the
 * data file, allocate (but do not yet fill) the model parameter/activation
 * blobs, parse the safetensors header, and print one tensor's offset as a
 * smoke test. Paths are hard-coded relative to the build directory. */
int main()
{
char *vocabularyFileName = "../Datasets/GPT2Tensors/enc";
char *textFileName = "../Datasets/GPT2Tensors/data";
char *safeTensorFileName = "../Datasets/GPT2Tensors/model.safetensors";
size_t tokenCount = 0; /* out-parameter intended to receive the token count */
Decoder tokenDecoder = LoadTokenDecoder(vocabularyFileName);
uint16_t *tokenizedData = GetTokenizedData(&tokenCount, textFileName, tokenDecoder);
/* NOTE(review): neither malloc result is checked; activations is on the
 * order of 1 GB (softmax + unembedding buffers), so failure is plausible. */
ModelParameters parameters = malloc(sizeof(struct model_parameter_struct));
ModelActivations activations = malloc(sizeof(struct model_activation_struct));
cJSON *tensorData = LoadSafeTensorData(safeTensorFileName);
size_t tensorOffset = GetTensorOffset(tensorData, "h.3.ln_2.bias");
printf("%ld\n", tensorOffset);
/* Teardown: release everything allocated above. */
free(activations);
free(parameters);
free(tokenizedData);
free(tokenDecoder->rawData);
for(size_t i = 0; i < VOCABULARY_SIZE; i++){free(tokenDecoder->tokens[i]);}
free(tokenDecoder->tokens);
free(tokenDecoder);
cJSON_Delete(tensorData);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment