GPT-2 in C, checkpoint before caching safetensors loading. Loads the GPT-2 vocabulary decoder, a pre-tokenized dataset, and the model.safetensors JSON header (via cJSON), then looks up a tensor's byte offset by name.
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include "cJSON.h"
#define VOCABULARY_SIZE 50257
#define tf_d_vocab 50257
#define tf_d_seq 1024
#define tf_d_model 768
#define tf_d_k 64
#define tf_n_heads 12
#define tf_n_layers 12
#define tf_rsqrt_d_k 0.125f
//clear && gcc 25_GPT2.c cJSON.c -lm -o m.o && ./m.o
enum SAFETENSORS_DTYPES
{
	SAFETENSORS_F64 = 0,
	SAFETENSORS_F32,
	SAFETENSORS_F16,
	SAFETENSORS_BF16,
	SAFETENSORS_I64,
	SAFETENSORS_I32,
	SAFETENSORS_I16,
	SAFETENSORS_I8,
	SAFETENSORS_U8,
	SAFETENSORS_BOOL,
	SAFETENSORS_NUM_DTYPES
};
char dataTypes_String_Safetensors[SAFETENSORS_NUM_DTYPES][20] = {"F64","F32","F16","BF16","I64","I32","I16","I8","U8","BOOL"};
int GetSafetensorSize(int dtype)
{
	switch(dtype)
	{
		case SAFETENSORS_F64: return 8;
		case SAFETENSORS_F32: return 4;
		case SAFETENSORS_F16: return 2;
		case SAFETENSORS_BF16: return 2;
		case SAFETENSORS_I64: return 8;
		case SAFETENSORS_I32: return 4;
		case SAFETENSORS_I16: return 2;
		case SAFETENSORS_I8: return 1;
		case SAFETENSORS_U8: return 1;
		case SAFETENSORS_BOOL: return 1;
	}
	return 0;
}
typedef struct token_struct *Token;
typedef struct decoder_struct *Decoder;
typedef struct model_parameter_struct *ModelParameters;
typedef struct model_activation_struct *ModelActivations;
struct token_struct
{
	uint32_t offset;
	uint32_t size;
};
struct decoder_struct
{
	Token *tokens;
	char *rawData;
	size_t rawDataLength;
};
struct model_parameter_struct
{
	struct{float *weight;}wte;
	struct{float *weight;}wpe;
	struct
	{
		struct{float *bias; float *weight;}ln_1;
		struct
		{
			struct{float *bias; float *weight;}c_attn;
			struct{float *bias; float *weight;}c_proj;
		}attn;
		struct{float *bias; float *weight;}ln_2;
		struct
		{
			struct{float *bias; float *weight;}c_fc;
			struct{float *bias; float *weight;}c_proj;
		}mlp;
	}h[tf_n_layers];
	struct{float *bias; float *weight;}ln_f;
};
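/*
Note on the activation buffers below: they are statically sized for a full tf_d_seq = 1024
token context. Summing the fields gives roughly 1.5 GB of floats (the per-layer softmax and
MLP buffers dominate), which is why main() heap-allocates this struct instead of placing it
on the stack.
*/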
struct model_activation_struct
{
	struct{float out[tf_d_seq][tf_d_model];}embedding;
	struct
	{
		struct{float r_std[tf_d_seq];float mean[tf_d_seq];float out[tf_d_seq][tf_d_model];}ln_1;
		struct
		{
			struct{float out[tf_d_seq][3 * tf_d_model];}c_attn;
			struct{float out[tf_n_heads][tf_d_seq][tf_d_seq];}softmax;
			struct{float out[tf_d_seq][tf_d_model];}z;
			struct{float out[tf_d_seq][tf_d_model];}c_proj;
		}attn;
		struct{float out[tf_d_seq][tf_d_model];}res_1;
		struct{float r_std[tf_d_seq];float mean[tf_d_seq];float out[tf_d_seq][tf_d_model];}ln_2;
		struct
		{
			struct{float out[tf_d_seq][4 * tf_d_model];}c_fc;
			struct{float out[tf_d_seq][4 * tf_d_model];}gelu;
			struct{float out[tf_d_seq][tf_d_model];}c_proj;
		}mlp;
		struct{float out[tf_d_seq][tf_d_model];}res_2;
	}h[tf_n_layers];
	struct{float r_std[tf_d_seq];float mean[tf_d_seq];float out[tf_d_seq][tf_d_model];}ln_f;
	struct{float out[tf_d_seq][tf_d_vocab];}unembedding;
};
size_t GetFileSize(char *fileName)
{
	FILE *fp = fopen(fileName, "rb");assert(fp != NULL);
	fseek(fp, 0L, SEEK_END);
	size_t currentFileSize = ftell(fp);rewind(fp);
	fclose(fp);
	return currentFileSize;
}
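/*
Vocabulary decoder loader.
Assumed layout of the vocabulary file (inferred from the parsing below, not from a spec):
VOCABULARY_SIZE records of 8 bytes each, a little-endian uint32 offset followed by a
little-endian uint32 size, then the concatenated raw token bytes that those offsets index into.
*/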
Decoder LoadTokenDecoder(char *vocabularyFileName)
{
	/*Open the vocabulary file*/
	size_t inputLength = GetFileSize(vocabularyFileName);
	FILE *fp = fopen(vocabularyFileName, "rb");assert(fp != NULL);
	int fileNumber = fileno(fp);
	unsigned char *input = mmap(NULL, inputLength, PROT_READ|PROT_WRITE, MAP_PRIVATE, fileNumber, 0);assert(input != MAP_FAILED);
	Decoder tokenDecoder = malloc(sizeof(struct decoder_struct));
	tokenDecoder->tokens = malloc(VOCABULARY_SIZE * sizeof(Token));
	tokenDecoder->rawDataLength = inputLength - VOCABULARY_SIZE*8;
	tokenDecoder->rawData = malloc(tokenDecoder->rawDataLength * sizeof(unsigned char));
	memcpy(tokenDecoder->rawData, input + (VOCABULARY_SIZE * 8), tokenDecoder->rawDataLength * sizeof(unsigned char));
	for(size_t i = 0, k = 0; i < VOCABULARY_SIZE*8; i += 8, k += 1)
	{
		tokenDecoder->tokens[k] = malloc(sizeof(struct token_struct));
		tokenDecoder->tokens[k]->offset = 0;
		tokenDecoder->tokens[k]->size = 0;
		/*Little-endian uint32 offset in bytes i..i+3*/
		for(size_t j = i + 4; j-- > i;)
		{
			tokenDecoder->tokens[k]->offset <<= 8;
			tokenDecoder->tokens[k]->offset += input[j];
			//printf("%u\n", input[j]);
		}
		/*Little-endian uint32 size in bytes i+4..i+7*/
		for(size_t j = i + 8; j-- > i + 4;)
		{
			tokenDecoder->tokens[k]->size <<= 8;
			tokenDecoder->tokens[k]->size += input[j];
			//printf("%u\n", input[j]);
		}
		//printf("(%ld %u %u : %.*s)\n", k, tokenDecoder->tokens[k]->offset, tokenDecoder->tokens[k]->size,tokenDecoder->tokens[k]->size, tokenDecoder->rawData + tokenDecoder->tokens[k]->offset);
	}
	assert(munmap(input, inputLength) != -1);
	fclose(fp);
	return tokenDecoder;
}
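/*
Tokenized-corpus loader.
Assumed input format (inferred from the code below): a flat array of little-endian
uint16 GPT-2 token ids, two bytes per token, with no header. *tokenCount receives the
number of tokens read.
*/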
uint16_t *GetTokenizedData(size_t *tokenCount, char *fileName, Decoder tokenDecoder)
{
	size_t inputLength = GetFileSize(fileName);
	FILE *fp = fopen(fileName, "rb");assert(fp != NULL);
	int fileNumber = fileno(fp);
	unsigned char *input = mmap(NULL, inputLength, PROT_READ|PROT_WRITE, MAP_PRIVATE, fileNumber, 0);assert(input != MAP_FAILED);
	//printf("%ld\n", inputLength);
	*tokenCount = inputLength / 2;
	uint16_t *tokenizedData = calloc(inputLength / 2, sizeof(uint16_t));
	for(size_t i = 0, k = 0; i < inputLength; i += 2, k++)
	{
		/*Each token is a little-endian uint16*/
		tokenizedData[k] = 0;
		tokenizedData[k] += input[i+1];
		tokenizedData[k] <<= 8;
		tokenizedData[k] += input[i];
		uint16_t token = tokenizedData[k];
		assert(token < VOCABULARY_SIZE);
		uint32_t offset = tokenDecoder->tokens[token]->offset;
		uint32_t size = tokenDecoder->tokens[token]->size;
		//printf("(%ld %u %u : %.*s)", k, offset, size,size, tokenDecoder->rawData + offset);
	}
	assert(munmap(input, inputLength) != -1);
	fclose(fp);
	return tokenizedData;
}
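/*
Safetensors header loader.
A .safetensors file begins with an 8-byte little-endian unsigned integer N, followed by
N bytes of JSON header describing each tensor (dtype, shape, data_offsets), followed by
the raw tensor bytes. Only the JSON header is parsed here (with cJSON); the tensor data
itself is not read yet.
*/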
cJSON *LoadSafeTensorData(char *fileName)
{
	size_t fileSize = GetFileSize(fileName);
	FILE *fp = fopen(fileName, "rb");assert(fp != NULL);
	int fileNumber = fileno(fp);
	unsigned char *fileData = mmap(NULL, fileSize, PROT_READ|PROT_WRITE, MAP_PRIVATE, fileNumber, 0);assert(fileData != MAP_FAILED);
	/*Read headerLength (first 8 bytes, little-endian)*/
	size_t headerLength = 0;for(int i = 7; i >= 0; i--){headerLength <<= 8;headerLength += fileData[i];}assert(headerLength > 0);
	assert(fileData[8] == '{');
	cJSON *tensorData = cJSON_ParseWithLength((const char *)(fileData + 8), headerLength);
	assert(tensorData != NULL);
	int tensorParameterSize = cJSON_GetArraySize(tensorData) - 1;/*Minus 1, assuming the header also carries a "__metadata__" entry*/
	assert(tensorParameterSize > 0);
	//printf("%ld %d\n", headerLength, tensorParameterSize);
	//char *formatted_json = cJSON_Print(tensorData);if(formatted_json != NULL){printf("%s\n", formatted_json);free(formatted_json);}
	assert(munmap(fileData, fileSize) != -1);
	fclose(fp);
	return tensorData;
}
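/*
Looks up a tensor by name in the parsed header and returns the first value of its
"data_offsets" pair, i.e. the tensor's starting byte offset relative to the start of the
data section (which begins right after the JSON header). Returns 0 if the name is not found.
*/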
size_t GetTensorOffset(cJSON *tensorData, char *tensorName)
{
	size_t tensorOffset = 0;
	cJSON *item = NULL;
	cJSON *offset = NULL;
	cJSON *dtype = NULL;
	cJSON *data_offsets = NULL;
	cJSON *shape = NULL;
	cJSON_ArrayForEach(item, tensorData)
	{
		dtype = cJSON_GetObjectItem(item, "dtype");data_offsets = cJSON_GetObjectItem(item, "data_offsets");
		shape = cJSON_GetObjectItem(item, "shape");
		if(dtype && data_offsets && shape)
		{
			if(strcmp(tensorName, item->string) == 0)
			{
				//printf("Key: %s\n", item->string);printf(" dtype: %s\n", dtype->valuestring);printf(" data_offsets: ");
				cJSON_ArrayForEach(offset, data_offsets)
				{
					tensorOffset = (size_t)offset->valuedouble;break;
				}
				break;
			}
		}
	}
	return tensorOffset;
}
int main()
{
	char *vocabularyFileName = "../Datasets/GPT2Tensors/enc";
	char *textFileName = "../Datasets/GPT2Tensors/data";
	char *safeTensorFileName = "../Datasets/GPT2Tensors/model.safetensors";
	size_t tokenCount = 0;
	Decoder tokenDecoder = LoadTokenDecoder(vocabularyFileName);
	uint16_t *tokenizedData = GetTokenizedData(&tokenCount, textFileName, tokenDecoder);
	ModelParameters parameters = malloc(sizeof(struct model_parameter_struct));
	ModelActivations activations = malloc(sizeof(struct model_activation_struct));
	cJSON *tensorData = LoadSafeTensorData(safeTensorFileName);
	size_t tensorOffset = GetTensorOffset(tensorData, "h.3.ln_2.bias");
	printf("%zu\n", tensorOffset);
	free(activations);
	free(parameters);
	free(tokenizedData);
	free(tokenDecoder->rawData);
	for(size_t i = 0; i < VOCABULARY_SIZE; i++){free(tokenDecoder->tokens[i]);}
	free(tokenDecoder->tokens);
	free(tokenDecoder);
	cJSON_Delete(tensorData);
	return 0;
}