Skip to content

Instantly share code, notes, and snippets.

@andrewhodel
Created June 20, 2019 19:51
Show Gist options
  • Save andrewhodel/08d225619d3dbcca9f33210f0e868c48 to your computer and use it in GitHub Desktop.
Save andrewhodel/08d225619d3dbcca9f33210f0e868c48 to your computer and use it in GitHub Desktop.
/*
Copyright 2016 Andrew Hodel
[email protected]
LICENSE
This program, design, source code, document or sequence of bits is licensed and the terms issued below must be followed. By using or reading or including this content you are automatically a licensee granted permission by the licensor (Andrew Hodel) under the following terms.
Usage - You may use this content under the following conditions:
1. Every inclusion of this content within another program, design, source code, document or sequence of bits requires the licensee to notify the licensor via email to the email address [email protected]. The message must clearly explain the intended usage, information on the direction of the project to which it is to be included, and the Given Birth Name of the person who is writing the email and using the content as well as the the Company name (if applicable).
*/
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
// a haystack is a uint8_t[] which has the following characteristics
// the first uint32_t (4 bytes) represents the total count of unique words in the haystack
// the next uint64_t (8 bytes) represents the total length of the haystack
// then this structure repeats [string][\0][uint16_t count of string]
uint64_t haystackLen(FILE *haystack) {
//return (haystack[4]<<24) | (haystack[5]<<16) | (haystack[6]<<8) | (haystack[7]) | (haystack[8]<<24) | (haystack[9]<<16) | (haystack[10]<<8) | (haystack[11]);
fseeko(haystack, 4, SEEK_SET);
uint64_t len;
fread(&len, sizeof(len), 1, haystack);
return len;
}
uint32_t haystackCount(FILE *haystack) {
//return (haystack[0]<<24) | (haystack[1]<<16) | (haystack[2]<<8) | (haystack[3]);
fseeko(haystack, 0, SEEK_SET);
uint32_t count;
fread(&count, sizeof(count), 1, haystack);
return count;
}
FILE *haystackAdd(FILE *haystack, char str[], uint64_t fixedLen) {
uint32_t count = haystackCount(haystack);
uint64_t len = haystackLen(haystack);
//printf("adding string %s to haystack\n", str);
//printf("count %u\n", count);
//printf("len %lu\n", len);
// check if the string already exists
fseeko(haystack, 0, SEEK_END);
uint64_t fileSize = ftello(haystack);
fseeko(haystack, 12, SEEK_SET);
uint64_t c = 11;
char word[200];
uint64_t wc = 0;
uint8_t found = 0;
char byte;
while (c < fileSize && found != 1) {
byte = getc(haystack);
if (fixedLen == 0) {
if (byte == '\0') {
// need to have this second block because technically a fixed length string could have \0 as a char
// this is the end of a word which is wc characters long, see if it matches str
uint64_t cc = 0;
uint64_t is = 0;
while (cc<wc) {
if (word[cc] == str[cc]) {
is++;
}
cc++;
}
if (is == cc && is == strlen(str)) {
//printf("\tfound existing word\n");
// word is existing, increment counter which will start at c+12+1
//uint16_t singleCount = (haystack[c+12+1]<<8) | (haystack[c+12+2]<<0);
uint16_t singleCount;
fread(&singleCount, sizeof(singleCount), 1, haystack);
//printf("existing count %u\n", singleCount);
singleCount++;
fseeko(haystack, ftello(haystack)-sizeof(singleCount), SEEK_SET);
fwrite(&singleCount, sizeof(singleCount), 1, haystack);
found = 1;
break;
} else {
// skip past counter
fseeko(haystack, ftello(haystack)+sizeof(uint16_t), SEEK_SET);
wc = 0;
continue;
}
wc = 0;
}
} else {
// this is a fixed length string
if (wc == fixedLen-1) {
// this is the end of a word
// this is the end of a word which is wc characters long, see if it matches str
uint64_t cc = 0;
uint64_t is = 0;
while (cc<wc) {
if (word[cc] == str[cc]) {
is++;
}
cc++;
}
if (is == cc && is == fixedLen-1) {
//printf("\tfound existing word\n");
// word is existing, increment counter which will start at c+12+1
//uint16_t singleCount = (haystack[c+12+1]<<8) | (haystack[c+12+2]<<0);
uint16_t singleCount;
fread(&singleCount, sizeof(singleCount), 1, haystack);
//printf("existing count %u\n", singleCount);
singleCount++;
fseeko(haystack, ftello(haystack)-sizeof(singleCount), SEEK_SET);
fwrite(&singleCount, sizeof(singleCount), 1, haystack);
found = 1;
break;
} else {
// skip past counter
fseeko(haystack, ftello(haystack)+sizeof(uint16_t), SEEK_SET);
wc = 0;
continue;
}
wc = 0;
}
}
word[wc] = byte;
wc++;
c++;
}
if (found == 0) {
fseeko(haystack, 0, SEEK_END);
// add string and 2 byte count of 1 to haystack
// file pointer would be at eof
uint64_t cc = 0;
//printf("adding ");
if (fixedLen == 0) {
while (cc<strlen(str)) {
putc(str[cc], haystack);
//printf("%c", str[cc]);
cc++;
}
putc('\0', haystack);
} else {
while (cc<fixedLen) {
putc(str[cc], haystack);
//printf("%c", str[cc]);
cc++;
}
}
//printf(" ending at %lu\n", ftello(haystack));
// add count of 1
uint16_t singleCount = 1;
fwrite(&singleCount, sizeof(singleCount), 1, haystack);
// increment count
count++;
fseeko(haystack, 0, SEEK_SET);
fwrite(&count, sizeof(count), 1, haystack);
// add cc, NUL and word count to len
len += cc+1+sizeof(singleCount);
if (fixedLen > 0) {
// subtract NUL from len
len--;
}
fseeko(haystack, 4, SEEK_SET);
fwrite(&len, sizeof(len), 1, haystack);
}
return haystack;
}
FILE *haystackRemove(FILE *haystack, char *haystackFilename, char *tempFilename, char *string, uint64_t fixedLen) {
// generate a temporary file to rewrite to
FILE *temp;
temp = fopen(tempFilename, "wb+");
// copy the first 12 bytes from haystack to temp
// this never changes in sort, only when you are adding or removing
fseeko(haystack, 0, SEEK_SET);
uint64_t c = 0;
char byte;
while (c<12) {
byte = getc(haystack);
putc(byte, temp);
c++;
}
uint32_t count = haystackCount(haystack);
uint64_t len = haystackLen(haystack);
//printf("count %u\n", count);
// seek haystack to after count and len
fseeko(haystack, 12, SEEK_SET);
// find word with current greatest count
uint64_t cc = 0LLU;
uint64_t wc = 0LLU;
char word[200];
uint64_t startsAt = 0LLU;
uint64_t didRemove = 0LLU;
// loop through the haystack
while (cc<len) {
if (fixedLen == 0) {
// need to have this second block because technically a fixed length string could have \0 as a char
if (byte == '\0' && cc > 0) {
//printf("remove routine found word %s at %lu\n", word, ftello(haystack)-wc);
// check if the word is the same as string
if (strncmp(word, string, wc) == 0) {
// don't add it
//printf("\t is matching word, removing it\n");
fseeko(haystack, ftello(haystack)+2, SEEK_SET);
didRemove = wc+(uint64_t)2LLU;
} else {
// add it
c = 0;
while (c<wc) {
putc(word[c], temp);
c++;
}
// add counter, 2 bytes
uint16_t localCount;
fread(&localCount, sizeof(localCount), 1, haystack);
fwrite(&localCount, sizeof(localCount), 1, temp);
}
wc = 0;
cc+= 2;
}
} else {
// this is fixed length remove
if (wc == fixedLen) {
// check if the word is the same as string
if (strncmp(word, string, wc) == 0) {
// don't add it
fseeko(haystack, ftello(haystack)+2, SEEK_SET);
didRemove = wc+(uint64_t)2LLU;
} else {
// add it
c = 0;
while (c<wc) {
putc(word[c], temp);
c++;
}
// add counter, 2 bytes
uint16_t localCount;
fread(&localCount, sizeof(localCount), 1, haystack);
fwrite(&localCount, sizeof(localCount), 1, temp);
}
wc = 0;
cc+= 2;
}
}
if (wc == 0) {
startsAt = ftello(haystack);
}
byte = getc(haystack);
word[wc] = byte;
cc++;
wc++;
}
// must erase current haystack
fclose(haystack);
// open wb+ so it creates an empty file
fopen(haystackFilename, "wb+");
if (didRemove > 0) {
// set temp count -1
count = haystackCount(temp);
count--;
fseeko(temp, 0, SEEK_SET);
fwrite(&count, 1, sizeof(count), temp);
// set temp len -didRemove
len = haystackLen(temp);
len -= didRemove;
fseeko(temp, 4, SEEK_SET);
fwrite(&len, 1, sizeof(len), temp);
}
fseeko(temp, 0, SEEK_END);
uint64_t fileSize = ftello(temp);
// now write temp to haystack
fseeko(temp, 0, SEEK_SET);
uint64_t filePos = 0;
while (filePos < fileSize) {
byte = getc(temp);
putc(byte, haystack);
filePos++;
}
// close and remove temp
fclose(temp);
remove(tempFilename);
// then return the modified haystack
return haystack;
}
FILE *haystackSort(FILE *haystack, char *tempFilename, uint64_t fixedLen) {
// generate a temporary file to rewrite to
FILE *temp;
temp = fopen(tempFilename, "wb+");
// go to start of haystack
fseeko(haystack, 0, SEEK_SET);
// copy the first 12 bytes from haystack to temp
// this never changes in sort, only when you are adding or removing
uint64_t c = 0;
char byte;
while (c<12) {
byte = getc(haystack);
putc(byte, temp);
c++;
}
c = 0;
uint64_t currentTempPos = 12;
uint32_t count = haystackCount(haystack);
uint64_t len = haystackLen(haystack);
//printf("count %u\n", count);
// do this haystackCount times
while (c<count) {
// seek haystack to after count and len
fseeko(haystack, 12, SEEK_SET);
// find word with current greatest count
uint64_t cc = 0;
uint64_t wc = 0;
uint16_t currentGreatestCount = 0;
uint64_t greatestCountPosition = 0;
uint8_t word[200];
uint64_t startsAt = 0;
while (cc<len) {
if (fixedLen == 0) {
// need to have this second block because technically a fixed length string could have \0 as a char
if (byte == '\0' && cc > 0) {
// get the count
uint16_t singleCount;
fread(&singleCount, sizeof(singleCount), 1, haystack);
if (singleCount > currentGreatestCount) {
currentGreatestCount = singleCount;
greatestCountPosition = startsAt;
}
wc = 0;
cc+= 2;
}
} else {
// this is fixed length sort
if (wc == fixedLen) {
// get the count
uint16_t singleCount;
fread(&singleCount, sizeof(singleCount), 1, haystack);
if (singleCount > currentGreatestCount) {
currentGreatestCount = singleCount;
greatestCountPosition = startsAt;
}
wc = 0;
cc+= 2;
}
}
if (wc == 0) {
startsAt = ftello(haystack);
}
byte = getc(haystack);
word[wc] = byte;
cc++;
wc++;
}
//printf("greatestCountPosition %lu\n", greatestCountPosition);
//printf("currentGreatestCount %hu\n", currentGreatestCount);
// now add the word at greatestCountPosition to temp
fseeko(haystack, greatestCountPosition, SEEK_SET);
fseeko(temp, currentTempPos, SEEK_SET);
uint64_t cr = 0;
while (1) {
byte = getc(haystack);
//printf("%c %u grabbing from %ld, inserting at %lu\n", byte, byte, ftello(haystack)-1, currentTempPos);
putc(byte, temp);
if (byte == '\0' && fixedLen == 0) {
break;
} else if (cr == fixedLen-1) {
break;
}
cr++;
}
//printf("\n");
// set the current greatest word in haystack's count to 0
uint16_t singleCount = 0;
//printf("setting count to 0 in haystack at %ld\n", ftello(haystack));
fwrite(&singleCount, sizeof(singleCount), 1, haystack);
// then write the currentGreatestCount to temp
fwrite(&currentGreatestCount, sizeof(currentGreatestCount), 1, temp);
currentTempPos = ftello(temp);
c++;
}
fseeko(temp, 0, SEEK_END);
uint64_t fileSize = ftello(temp);
// now write temp to haystack
fseeko(haystack, 0, SEEK_SET);
fseeko(temp, 0, SEEK_SET);
uint64_t filePos = 0;
while (filePos < fileSize) {
byte = getc(temp);
putc(byte, haystack);
filePos++;
}
// close and remove temp
fclose(temp);
remove(tempFilename);
// then return the modified haystack
return haystack;
}
FILE *haystackInit(char *filename) {
FILE *haystack = fopen(filename, "wb+");
uint64_t c = 0;
while (c<12) {
putc(0, haystack);
c++;
}
return haystack;
}
int main() {
FILE *haystack;
// init the haystack, expects a filename for argument
// and returns a file pointer
// it's best to use files which exist on a ramdisk
haystack = haystackInit("ramdisk/temp");
uint8_t showFixed = 1;
if (showFixed == 0) {
// add some variable length strings to the haystack
// strings added multiple times will have their counter incremented
// and can be sorted by that after all the additions are done
uint32_t c = 0;
while (c<500) {
haystack = haystackAdd(haystack, "test", 0);
haystack = haystackAdd(haystack, "test", 0);
haystack = haystackAdd(haystack, "wtf", 0);
haystack = haystackAdd(haystack, "test", 0);
haystack = haystackAdd(haystack, "new", 0);
haystack = haystackAdd(haystack, "wtf", 0);
haystack = haystackAdd(haystack, "wtf", 0);
haystack = haystackAdd(haystack, "wtf", 0);
haystack = haystackAdd(haystack, "wtf", 0);
haystack = haystackAdd(haystack, "wtf", 0);
haystack = haystackAdd(haystack, "wtf", 0);
haystack = haystackRemove(haystack, "ramdisk/temp", "ramdisk/temp1", "test", 0);
c++;
}
haystack = haystackSort(haystack, "ramdisk/temp1", 0);
} else {
// with fixed length strings, they must all be the same length
// any longer will be ignored and the nul character wont be in the final result after each string
uint32_t c = 0;
while (c<5000) {
haystack = haystackAdd(haystack, "3f399feca50bb6be3d8c710d458fb328", 32);
haystack = haystackAdd(haystack, "3f6243c885a3eb38d83fbfa1dad04a51", 32);
haystack = haystackAdd(haystack, "7f3d2d227c6a18135090f93f643ee4ca", 32);
haystack = haystackAdd(haystack, "c390cf342008a6466bcab0d7920eff4c", 32);
haystack = haystackAdd(haystack, "c390cf342008a6466bcab0d7920eff4c", 32);
haystack = haystackRemove(haystack, "ramdisk/temp", "ramdisk/temp1", "3f399feca50bb6be3d8c710d458fb328", 32);
c++;
}
// sort the haystack by count, args haystack, tempFilename, len
haystack = haystackSort(haystack, "ramdisk/temp1", 32);
}
fseeko(haystack, 0, SEEK_END);
uint64_t fileSize = ftello(haystack);
fseeko(haystack, 0, SEEK_SET);
char byte;
uint64_t c = 0;
while (c < fileSize) {
byte = getc(haystack);
printf("%c %u\t pos %lu\t ftello %lu\n", byte, byte, c, ftello(haystack));
c++;
}
fclose(haystack);
return 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment