Skip to content

Instantly share code, notes, and snippets.

@ericfode
Created March 30, 2010 05:11
Show Gist options
  • Save ericfode/348781 to your computer and use it in GitHub Desktop.
Save ericfode/348781 to your computer and use it in GitHub Desktop.
#include "stdlib.h"
#include "stdio.h"
#include "malloc.h"
typedef struct genePart* GenePart;
typedef struct geneData* GeneData;
typedef struct geneNode* GeneNode;
typedef u_char byte;
typedef struct bruteTreeTrainer* BruteTreeTrainer;
typedef struct geneDataSlice* GeneDataSlice;
typedef struct trainingData* TrainingData;
typedef unsigned int uint;
#define NextNodeBound 2
#define MinLen 6
#define MaxDepth 20
//used in tree
struct genePart{
byte DataValue;
uint HitNumber;
GeneNode Next;
};
//used in list
struct geneData{
byte* DataValue;
uint len;
uint HitNumber;
};
struct geneNode{
byte DataValue;
byte Depth;
uint HitNumber;
GenePart* GeneParts;
};
struct geneDataSlice{
uint len;
GeneData* Data;
};
struct bruteTreeTrainer{
GeneNode root;
GeneDataSlice curHighest;
};
struct trainingData{
byte* Data;
uint len;
};
unsigned long long totalMem;
void* CountedMalloc(int size) {
totalMem += size;
// printf("mem allocted: %d total mem: %lld\n",size,totalMem);
void* newptr = malloc(size);
if (newptr == 0){
printf("out of memory\n");
exit(1);
}
return newptr;
}
void CountedFree(void* ptr) {
totalMem -= sizeof(ptr);
free(ptr);
}
int cmpGeneData(GeneData right, GeneData left){
int index= 0;
if (right->len != left->len){ return 0; }
for( index=0; index < right->len; index++){
if(right->DataValue[index] != left->DataValue[index]){ return 0; }
}
return 1;
}
GeneData newGD(int size){
GeneData GD = CountedMalloc(sizeof(GeneData*));
GD->DataValue = CountedMalloc(sizeof(byte)*size);
GD->len = size;
return GD;
}
//returns an array that can contain pointers YOU NEED TO INI THE GD YOUR SELF
GeneDataSlice newGDS(int size){
GeneDataSlice gds = CountedMalloc(sizeof(GeneDataSlice*));
gds->Data = CountedMalloc(sizeof(GeneData)*size);
int index = 0;
gds->len=size;
return gds;
}
GeneNode newGN(){
GeneNode GN = CountedMalloc(sizeof(GeneNode*));
GN->HitNumber = 0;
return GN;
}
TrainingData newTD(){
TrainingData TD = malloc(sizeof(TrainingData*));
TD->len = 0;
return TD;
}
int d =0;
GeneNode newTree() {
printf("creating tree\n");
GeneNode root = CountedMalloc(sizeof(GeneNode*));
printf("setting vars\n");
root->Depth = 0;
root->DataValue = 0;
root->HitNumber = 0;
uint index =0;
printf("zeroing\n");
root->GeneParts = 0;
printf("allocaing geneparts\n");
root->GeneParts = CountedMalloc(sizeof(GenePart)*256);
printf("allocated Genepart\n");
for(index =0; index < 256; index++){
root->GeneParts[index]= CountedMalloc(sizeof(GenePart*));
// printf("creating node\n");
root->GeneParts[index]->DataValue = index;
// printf("indexing node\n");
root->GeneParts[index]->HitNumber = 0;
// printf("hitNumbering node\n");
root->GeneParts[index]->Next =0;
// printf("treeIndex : %d, TreeHitnumber: %d\n",index,root->GeneParts[index]->HitNumber);
}
// if(d==1){
// for(index =0; index < 256; index++){
// printf("treeIndex : %d, TreeHitnumber: %d\n",index,root->GeneParts[index]->HitNumber);
// }
printf("tree ready\n");
//}
return root;
}
void testTree(GeneNode gn){
int index= 1;
for(index =0; index < 256; index++){
GenePart curGenePart = gn->GeneParts[index];
printf("treeIndex : %d, TreeHitnumber: %d\n",index,curGenePart->HitNumber);
}
return;
}
BruteTreeTrainer newBTT(){
printf("stage1 tree creation\n");
BruteTreeTrainer btt = CountedMalloc(sizeof(BruteTreeTrainer*));
printf("tree alllocated\n");
btt->root = newTree();
// testTree(btt->root);
btt->curHighest = newGDS(0);
return btt;
}
int numChildren(GeneNode gn) {
int num = 0;
int i =0;
//printf("in numChildren\n");
for(i = 0; i < 256; i++){
if( gn->GeneParts[i]->Next != 0) {
num++;
}
}
return num;
}
int numLeafs(GeneNode gn) {
int tally, index, total;
GenePart curGP;
for(index =0; index < 256; curGP = gn->GeneParts[index], index++ ){
if (curGP->Next != 0){
total += numLeafs(curGP->Next);
}
}
if (numChildren(gn) == 0) {
//because this is a leaf
total = 1;
}
return total;
}
//destructive to both the left and right pointers
GeneDataSlice joinGeneDataSlice(GeneDataSlice right, GeneDataSlice left) {
//printf("joining\n");
GeneDataSlice GDS = newGDS(right->len+left->len);
int index = 0;
for( index= 0 ; index < right->len; index++) {
GDS->Data[index] = right->Data[index];
}
for( index = 0; index < left ->len; index++) {
GDS->Data[index+right->len] = left->Data[index];
}
CountedFree(left);
CountedFree(right);
return GDS;
}
GeneDataSlice mergeGeneDataSlice(GeneDataSlice right, GeneDataSlice left) {
int dups = 0;
GeneData curGDl = 0;
GeneData curGDr = 0;
//first pass to determine the size of the new list
//for ever item on the right side
int indexl = 0;
int indexr = 0;
for(indexr = 0; indexr < right->len; indexr++ ){
curGDr = right->Data[indexr];
//for every item on the left side
for(indexl = 0; indexl < left->len; indexl++ ){
curGDl = left->Data[indexl];
//if the byte slices are equal
if(1 == cmpGeneData(curGDr,curGDl)){
//add anouther tally to the dups list
dups++;
break;
}
}
}
printf("%d Dups\n",dups);
GeneDataSlice GDS = newGDS((right->len+left->len) -dups);
//copy the right side over
for(indexr = 0; indexr < right->len; indexr++){
curGDr = right->Data[indexr];
GDS->Data[indexr] = curGDr;
}
//verifier
// for(indexr = 0; indexr < GDS->len; curGDr = GDS->Data[indexr]){
// indexr++;
// // GDS->Data[indexr] = right->Data[indexr];
// printf("coping r %d\n",GDS->Data[indexr]->DataValue);
// }
printf("copying leftside\n");
//current offset of left side
int offset = 0;
//for every item on the left side check to see if there is already an equal in the list
for(indexl = 0; indexl < left->len; indexl++){
curGDl = left->Data[indexl];
int matched = 0;
for(indexr = 0; indexr < right->len;indexr++ ){
curGDr = GDS->Data[indexr];
if(1 == cmpGeneData(curGDr,curGDl)){
GDS->Data[indexr]->HitNumber += curGDl->HitNumber;
CountedFree(curGDl->DataValue);
CountedFree(curGDl);
matched = 1;
break;
}
}
if (matched != 1) {
//tally starts at zero because the len of the right will be one index of the
//end of the list
GDS->Data[right->len+offset] = curGDl;
// printf("coping leftside len:%d data:%d index:%d\n",curGDl->len, curGDl->DataValue, right->len+offset);
offset++;
}
}
GDS->len = right->len+(offset-1);
printf("merged len %d\n", GDS->len);
CountedFree(right);
CountedFree(left);
return GDS;
}
GeneDataSlice collapseLeafs(GeneNode gn) {
//Create a geneDataSlice to hold the results
// printf("in collapse leafs\n");
//if this is a leaf
if(numChildren(gn) == 0) {
//create a GDS with room for one node
GeneDataSlice GDS = newGDS(1);
//printf("in leaf node \n");
//then make a new geneDataSlice
// printf("created GDS hitnumber:%d depth:%d dataValue:%d\n",gn->HitNumber,gn->Depth, gn->DataValue);
GDS->Data[0] = newGD(gn->Depth);
(GDS->Data[0])->HitNumber = gn->HitNumber;
// printf("got hitnumber\n");
// printf("created GeneData\n");
GDS->Data[0]->DataValue[gn->Depth-1] = gn->DataValue;
return GDS;
}
//if this is a brach
//join all of the leafs together
//add this data value to the depth place in all of the slices
int index =0;
//create a GDS just to have something to return to
GeneDataSlice GDS = newGDS(0);
//printf("in branch children:%d\n",numChildren(gn));
for(index =0; index < 256; index++){
GenePart child = gn->GeneParts[index];
if(child->Next != 0) {
// printf("on index:%d\n",index);
//GDS contains a running "total" of all of leafs
GDS = joinGeneDataSlice(GDS,collapseLeafs(child->Next));
}
}
//if this is not a leaf
//printf("checking depth\n");
if( gn->Depth > 0 ){
//printf("copyinging data GDSlen:%d depth:%d datavalue:%d\n",GDS->len,gn->Depth,gn->DataValue);
//for every item in the current GeneDataSlice, add this nodes data value to the datavalues that are stored
//???Somethign is wrong with last item of the GDS...????//
for (index = 0; index < GDS->len; index++){
// printf("on index %d len:%d\n",index,GDS->Data[index]->len);
GDS->Data[index]->DataValue[gn->Depth]= gn->DataValue;
}
}
// printf("returning from collapseLeafs\n");
return GDS;
}
void deleteLeafs(GeneNode node){
int index = 0;
for(index = 0; index < 256; index++) {
GenePart child = node->GeneParts[index];
if (child->Next != 0){
deleteLeafs(child->Next);
CountedFree(child->Next);
child->Next = 0;
}
// many need to add an extra free here
}
return;
}
//byte is named b because in go a byte is also a byte and this byte is being used as a byte
void addNode(GeneNode gn,byte b){
// printf("testing next\n");
gn->GeneParts[b]->Next = 0;
// printf("creating new tree for node\n");
gn->GeneParts[b]->Next = newTree();
//printf("doing depth calc\n");
gn->GeneParts[b]->Next->Depth = gn->Depth + 1;
//printf("doing datavalue clac\n");
gn->GeneParts[b]->Next->DataValue = b;
int index = 0;
//initilize the new gene GeneParts
//printf("doing initilzation of geneparts\n");
for(index = 0; index < 256; index++) {
gn->GeneParts[b]->Next->GeneParts[index]->DataValue = index;
gn->GeneParts[b]->Next->GeneParts[index]->HitNumber = 0;
}
return;
}
void addByteSlice(GeneNode gn, byte* b,byte* end) {
GeneNode curGN = gn;
// printf("adding byte:%d\n",b[0]);
while(1){
curGN->GeneParts[b[0]]->HitNumber++;
curGN->HitNumber++;
// printf("added byte.. hitnumber:%d\n",curGN->GeneParts[b[0]]->HitNumber);
if( curGN->GeneParts[b[0]]->HitNumber == NextNodeBound){
// printf("adding node\n");
addNode(curGN,b[0]);
// printf("added node\n");
}
if( curGN->GeneParts[b[0]]->HitNumber > NextNodeBound) {
if (b != end && curGN->Depth < MaxDepth) {
curGN = curGN->GeneParts[b[0]]->Next;
b++;
continue;
}
}
// printf("finished addByteSlice\n");
return;
}
}
void PrintStats(GeneDataSlice gd){
printf("working on stats\n");
float avarageLen = 0;
int maxLen = 0;
int minLen = 500000;
int maxO = 0;
int minO = 500000;
float avarageO = 0;
int index = 0;
for(index = 0; index < gd->len; index++){
// printf("on index:%d addresss:%d \n", index,gd->Data[index]);
GeneData cur= gd->Data[index];
if (cur->len > maxLen) {
maxLen = cur->len;
}
if (cur->len < minLen) {
minLen = cur->len;
}
avarageLen += cur->len;
if (cur->HitNumber > maxO) {
maxO = cur->HitNumber;
}
if (cur->HitNumber < minO) {
minO = cur->HitNumber;
}
avarageO += cur->HitNumber;
}
avarageLen /= gd->len;
avarageO /= gd->len;
printf("\n");
printf("min Length: %d max Length: %d a Length: %f min Hits: %d max Hits: %d a Hits: %f total: %d\n"
, minLen, maxLen,avarageLen,minO,maxO,avarageO,gd->len);
d= 1;
return;
}
void BTTrain(BruteTreeTrainer bt, TrainingData td){
printf("starting Train \n");
int index;
for( index = 0 ; index < td->len; index++){
// printf("%d\n",index);
addByteSlice(bt->root,&(td->Data[index]),td->Data+td->len);
if(index % 50000 == 0 && index != 0){
printf("collapsing\n");
GeneDataSlice temp = collapseLeafs(bt->root);
printf("trying to merge\n");
bt->curHighest = mergeGeneDataSlice(temp,bt->curHighest );
printf("curHighest len:%d\n",bt->curHighest->len);
PrintStats(bt->curHighest);
deleteLeafs(bt->root);
// CountedFree(bt->root);
bt->root= newTree();
printf("adding byte\n");
}
}
return;
}
int main(){
printf("starting\n");
BruteTreeTrainer btt = newBTT();
TrainingData TD = newTD();
printf("created btt an td\n");
TD->Data = malloc(sizeof(byte)*60 *1000000);
TD->len = 60 *1000000;
int index =0;
for(index =0; index < TD->len; index++){
TD->Data[index] = (byte)rand();
}
printf("made really big td\n");
BTTrain(btt, TD);
//free(TD->Data);
//free(TD);
return 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment