GPT-2 in C, checkpoint before caching safetensors loading. Loads the GPT-2 vocabulary decoder, a pre-tokenized dataset, and the model.safetensors JSON header (via cJSON), then looks up a tensor's byte offset by name.
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include "cJSON.h"
#define VOCABULARY_SIZE 50257
#define tf_d_vocab 50257
#define tf_d_seq 1024
#define tf_d_model 768
#define tf_d_k 64
#define tf_n_heads 12
#define tf_n_layers 12
#define tf_rsqrt_d_k 0.125f
//clear && gcc 25_GPT2.c cJSON.c -lm -o m.o && ./m.o
enum SAFETENSORS_DTYPES
{
	SAFETENSORS_F64 = 0,
	SAFETENSORS_F32,
	SAFETENSORS_F16,
	SAFETENSORS_BF16,
	SAFETENSORS_I64,
	SAFETENSORS_I32,
	SAFETENSORS_I16,
	SAFETENSORS_I8,
	SAFETENSORS_U8,
	SAFETENSORS_BOOL,
	SAFETENSORS_NUM_DTYPES
};
char dataTypes_String_Safetensors[SAFETENSORS_NUM_DTYPES][20] = {"F64","F32","F16","BF16","I64","I32","I16","I8","U8","BOOL"};
int GetSafetensorSize(int dtype)
{
	switch(dtype)
	{
		case SAFETENSORS_F64: return 8;
		case SAFETENSORS_F32: return 4;
		case SAFETENSORS_F16: return 2;
		case SAFETENSORS_BF16: return 2;
		case SAFETENSORS_I64: return 8;
		case SAFETENSORS_I32: return 4;
		case SAFETENSORS_I16: return 2;
		case SAFETENSORS_I8: return 1;
		case SAFETENSORS_U8: return 1;
		case SAFETENSORS_BOOL: return 1;
	}
	return 0;
}
typedef struct token_struct *Token;
typedef struct decoder_struct *Decoder;
typedef struct model_parameter_struct *ModelParameters;
typedef struct model_activation_struct *ModelActivations;
struct token_struct
{
	uint32_t offset;
	uint32_t size;
};
struct decoder_struct
{
	Token *tokens;
	char *rawData;
	size_t rawDataLength;
};
struct model_parameter_struct
{
	struct{float *weight;}wte;
	struct{float *weight;}wpe;
	struct
	{
		struct{float *bias; float *weight;}ln_1;
		struct
		{
			struct{float *bias; float *weight;}c_attn;
			struct{float *bias; float *weight;}c_proj;
		}attn;
		struct{float *bias; float *weight;}ln_2;
		struct
		{
			struct{float *bias; float *weight;}c_fc;
			struct{float *bias; float *weight;}c_proj;
		}mlp;
	}h[tf_n_layers];
	struct{float *bias; float *weight;}ln_f;
};
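/*
Note on the activation buffers below: they are statically sized for a full tf_d_seq = 1024
token context. Summing the fields gives roughly 1.5 GB of floats (the per-layer softmax and
MLP buffers dominate), which is why main() heap-allocates this struct instead of placing it
on the stack.
*/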
struct model_activation_struct
{
	struct{float out[tf_d_seq][tf_d_model];}embedding;
	struct
	{
		struct{float r_std[tf_d_seq];float mean[tf_d_seq];float out[tf_d_seq][tf_d_model];}ln_1;
		struct
		{
			struct{float out[tf_d_seq][3 * tf_d_model];}c_attn;
			struct{float out[tf_n_heads][tf_d_seq][tf_d_seq];}softmax;
			struct{float out[tf_d_seq][tf_d_model];}z;
			struct{float out[tf_d_seq][tf_d_model];}c_proj;
		}attn;
		struct{float out[tf_d_seq][tf_d_model];}res_1;
		struct{float r_std[tf_d_seq];float mean[tf_d_seq];float out[tf_d_seq][tf_d_model];}ln_2;
		struct
		{
			struct{float out[tf_d_seq][4 * tf_d_model];}c_fc;
			struct{float out[tf_d_seq][4 * tf_d_model];}gelu;
			struct{float out[tf_d_seq][tf_d_model];}c_proj;
		}mlp;
		struct{float out[tf_d_seq][tf_d_model];}res_2;
	}h[tf_n_layers];
	struct{float r_std[tf_d_seq];float mean[tf_d_seq];float out[tf_d_seq][tf_d_model];}ln_f;
	struct{float out[tf_d_seq][tf_d_vocab];}unembedding;
};
size_t GetFileSize(char *fileName)
{
	FILE *fp = fopen(fileName, "rb");assert(fp != NULL);
	fseek(fp, 0L, SEEK_END);
	size_t currentFileSize = ftell(fp);rewind(fp);
	fclose(fp);
	return currentFileSize;
}
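/*
Vocabulary decoder loader.
Assumed layout of the vocabulary file (inferred from the parsing below, not from a spec):
VOCABULARY_SIZE records of 8 bytes each, a little-endian uint32 offset followed by a
little-endian uint32 size, then the concatenated raw token bytes that those offsets index into.
*/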
Decoder LoadTokenDecoder(char *vocabularyFileName)
{
	/*Open the vocabulary file*/
	size_t inputLength = GetFileSize(vocabularyFileName);
	FILE *fp = fopen(vocabularyFileName, "rb");assert(fp != NULL);
	int fileNumber = fileno(fp);
	unsigned char *input = mmap(NULL, inputLength, PROT_READ|PROT_WRITE, MAP_PRIVATE, fileNumber, 0);assert(input != MAP_FAILED);
	Decoder tokenDecoder = malloc(sizeof(struct decoder_struct));
	tokenDecoder->tokens = malloc(VOCABULARY_SIZE * sizeof(Token));
	tokenDecoder->rawDataLength = inputLength - VOCABULARY_SIZE*8;
	tokenDecoder->rawData = malloc(tokenDecoder->rawDataLength * sizeof(unsigned char));
	memcpy(tokenDecoder->rawData, input + (VOCABULARY_SIZE * 8), tokenDecoder->rawDataLength * sizeof(unsigned char));
	for(size_t i = 0, k = 0; i < VOCABULARY_SIZE*8; i += 8, k += 1)
	{
		tokenDecoder->tokens[k] = malloc(sizeof(struct token_struct));
		tokenDecoder->tokens[k]->offset = 0;
		tokenDecoder->tokens[k]->size = 0;
		/*Little-endian uint32 offset in bytes i..i+3*/
		for(size_t j = i + 4; j-- > i;)
		{
			tokenDecoder->tokens[k]->offset <<= 8;
			tokenDecoder->tokens[k]->offset += input[j];
			//printf("%u\n", input[j]);
		}
		/*Little-endian uint32 size in bytes i+4..i+7*/
		for(size_t j = i + 8; j-- > i + 4;)
		{
			tokenDecoder->tokens[k]->size <<= 8;
			tokenDecoder->tokens[k]->size += input[j];
			//printf("%u\n", input[j]);
		}
		//printf("(%ld %u %u : %.*s)\n", k, tokenDecoder->tokens[k]->offset, tokenDecoder->tokens[k]->size,tokenDecoder->tokens[k]->size, tokenDecoder->rawData + tokenDecoder->tokens[k]->offset);
	}
	assert(munmap(input, inputLength) != -1);
	fclose(fp);
	return tokenDecoder;
}
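/*
Tokenized-corpus loader.
Assumed input format (inferred from the code below): a flat array of little-endian
uint16 GPT-2 token ids, two bytes per token, with no header. *tokenCount receives the
number of tokens read.
*/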
uint16_t *GetTokenizedData(size_t *tokenCount, char *fileName, Decoder tokenDecoder)
{
	size_t inputLength = GetFileSize(fileName);
	FILE *fp = fopen(fileName, "rb");assert(fp != NULL);
	int fileNumber = fileno(fp);
	unsigned char *input = mmap(NULL, inputLength, PROT_READ|PROT_WRITE, MAP_PRIVATE, fileNumber, 0);assert(input != MAP_FAILED);
	//printf("%ld\n", inputLength);
	*tokenCount = inputLength / 2;
	uint16_t *tokenizedData = calloc(inputLength / 2, sizeof(uint16_t));
	for(size_t i = 0, k = 0; i < inputLength; i += 2, k++)
	{
		/*Each token is a little-endian uint16*/
		tokenizedData[k] = 0;
		tokenizedData[k] += input[i+1];
		tokenizedData[k] <<= 8;
		tokenizedData[k] += input[i];
		uint16_t token = tokenizedData[k];
		assert(token < VOCABULARY_SIZE);
		uint32_t offset = tokenDecoder->tokens[token]->offset;
		uint32_t size = tokenDecoder->tokens[token]->size;
		//printf("(%ld %u %u : %.*s)", k, offset, size,size, tokenDecoder->rawData + offset);
	}
	assert(munmap(input, inputLength) != -1);
	fclose(fp);
	return tokenizedData;
}
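/*
Safetensors header loader.
A .safetensors file begins with an 8-byte little-endian unsigned integer N, followed by
N bytes of JSON header describing each tensor (dtype, shape, data_offsets), followed by
the raw tensor bytes. Only the JSON header is parsed here (with cJSON); the tensor data
itself is not read yet.
*/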
cJSON *LoadSafeTensorData(char *fileName)
{
	size_t fileSize = GetFileSize(fileName);
	FILE *fp = fopen(fileName, "rb");assert(fp != NULL);
	int fileNumber = fileno(fp);
	unsigned char *fileData = mmap(NULL, fileSize, PROT_READ|PROT_WRITE, MAP_PRIVATE, fileNumber, 0);assert(fileData != MAP_FAILED);
	/*Read headerLength (first 8 bytes, little-endian)*/
	size_t headerLength = 0;for(int i = 7; i >= 0; i--){headerLength <<= 8;headerLength += fileData[i];}assert(headerLength > 0);
	assert(fileData[8] == '{');
	cJSON *tensorData = cJSON_ParseWithLength((const char *)(fileData + 8), headerLength);
	assert(tensorData != NULL);
	int tensorParameterSize = cJSON_GetArraySize(tensorData) - 1;/*Minus 1, assuming the header also carries a "__metadata__" entry*/
	assert(tensorParameterSize > 0);
	//printf("%ld %d\n", headerLength, tensorParameterSize);
	//char *formatted_json = cJSON_Print(tensorData);if(formatted_json != NULL){printf("%s\n", formatted_json);free(formatted_json);}
	assert(munmap(fileData, fileSize) != -1);
	fclose(fp);
	return tensorData;
}
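/*
Looks up a tensor by name in the parsed header and returns the first value of its
"data_offsets" pair, i.e. the tensor's starting byte offset relative to the start of the
data section (which begins right after the JSON header). Returns 0 if the name is not found.
*/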
size_t GetTensorOffset(cJSON *tensorData, char *tensorName)
{
	size_t tensorOffset = 0;
	cJSON *item = NULL;
	cJSON *offset = NULL;
	cJSON *dtype = NULL;
	cJSON *data_offsets = NULL;
	cJSON *shape = NULL;
	cJSON_ArrayForEach(item, tensorData)
	{
		dtype = cJSON_GetObjectItem(item, "dtype");data_offsets = cJSON_GetObjectItem(item, "data_offsets");
		shape = cJSON_GetObjectItem(item, "shape");
		if(dtype && data_offsets && shape)
		{
			if(strcmp(tensorName, item->string) == 0)
			{
				//printf("Key: %s\n", item->string);printf(" dtype: %s\n", dtype->valuestring);printf(" data_offsets: ");
				cJSON_ArrayForEach(offset, data_offsets)
				{
					tensorOffset = (size_t)offset->valuedouble;break;
				}
				break;
			}
		}
	}
	return tensorOffset;
}
int main()
{
	char *vocabularyFileName = "../Datasets/GPT2Tensors/enc";
	char *textFileName = "../Datasets/GPT2Tensors/data";
	char *safeTensorFileName = "../Datasets/GPT2Tensors/model.safetensors";
	size_t tokenCount = 0;
	Decoder tokenDecoder = LoadTokenDecoder(vocabularyFileName);
	uint16_t *tokenizedData = GetTokenizedData(&tokenCount, textFileName, tokenDecoder);
	ModelParameters parameters = malloc(sizeof(struct model_parameter_struct));
	ModelActivations activations = malloc(sizeof(struct model_activation_struct));
	cJSON *tensorData = LoadSafeTensorData(safeTensorFileName);
	size_t tensorOffset = GetTensorOffset(tensorData, "h.3.ln_2.bias");
	printf("%zu\n", tensorOffset);
	free(activations);
	free(parameters);
	free(tokenizedData);
	free(tokenDecoder->rawData);
	for(size_t i = 0; i < VOCABULARY_SIZE; i++){free(tokenDecoder->tokens[i]);}
	free(tokenDecoder->tokens);
	free(tokenDecoder);
	cJSON_Delete(tensorData);
	return 0;
}