Created
June 20, 2019 19:51
-
-
Save andrewhodel/08d225619d3dbcca9f33210f0e868c48 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Copyright 2016 Andrew Hodel | |
[email protected] | |
LICENSE | |
This program, design, source code, document or sequence of bits is licensed and the terms issued below must be followed. By using or reading or including this content you are automatically a licensee granted permission by the licensor (Andrew Hodel) under the following terms. | |
Usage - You may use this content under the following conditions: | |
1. Every inclusion of this content within another program, design, source code, document or sequence of bits requires the licensee to notify the licensor via email to the email address [email protected]. The message must clearly explain the intended usage, information on the direction of the project to which it is to be included, and the Given Birth Name of the person who is writing the email and using the content as well as the the Company name (if applicable). | |
*/ | |
#include <inttypes.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <limits.h> | |
// a haystack is a uint8_t[] which has the following characteristics | |
// the first uint32_t (4 bytes) represents the total count of unique words in the haystack | |
// the next uint64_t (8 bytes) represents the total length of the haystack | |
// then this structure repeats [string][\0][uint16_t count of string] | |
uint64_t haystackLen(FILE *haystack) { | |
//return (haystack[4]<<24) | (haystack[5]<<16) | (haystack[6]<<8) | (haystack[7]) | (haystack[8]<<24) | (haystack[9]<<16) | (haystack[10]<<8) | (haystack[11]); | |
fseeko(haystack, 4, SEEK_SET); | |
uint64_t len; | |
fread(&len, sizeof(len), 1, haystack); | |
return len; | |
} | |
uint32_t haystackCount(FILE *haystack) { | |
//return (haystack[0]<<24) | (haystack[1]<<16) | (haystack[2]<<8) | (haystack[3]); | |
fseeko(haystack, 0, SEEK_SET); | |
uint32_t count; | |
fread(&count, sizeof(count), 1, haystack); | |
return count; | |
} | |
FILE *haystackAdd(FILE *haystack, char str[], uint64_t fixedLen) { | |
uint32_t count = haystackCount(haystack); | |
uint64_t len = haystackLen(haystack); | |
//printf("adding string %s to haystack\n", str); | |
//printf("count %u\n", count); | |
//printf("len %lu\n", len); | |
// check if the string already exists | |
fseeko(haystack, 0, SEEK_END); | |
uint64_t fileSize = ftello(haystack); | |
fseeko(haystack, 12, SEEK_SET); | |
uint64_t c = 11; | |
char word[200]; | |
uint64_t wc = 0; | |
uint8_t found = 0; | |
char byte; | |
while (c < fileSize && found != 1) { | |
byte = getc(haystack); | |
if (fixedLen == 0) { | |
if (byte == '\0') { | |
// need to have this second block because technically a fixed length string could have \0 as a char | |
// this is the end of a word which is wc characters long, see if it matches str | |
uint64_t cc = 0; | |
uint64_t is = 0; | |
while (cc<wc) { | |
if (word[cc] == str[cc]) { | |
is++; | |
} | |
cc++; | |
} | |
if (is == cc && is == strlen(str)) { | |
//printf("\tfound existing word\n"); | |
// word is existing, increment counter which will start at c+12+1 | |
//uint16_t singleCount = (haystack[c+12+1]<<8) | (haystack[c+12+2]<<0); | |
uint16_t singleCount; | |
fread(&singleCount, sizeof(singleCount), 1, haystack); | |
//printf("existing count %u\n", singleCount); | |
singleCount++; | |
fseeko(haystack, ftello(haystack)-sizeof(singleCount), SEEK_SET); | |
fwrite(&singleCount, sizeof(singleCount), 1, haystack); | |
found = 1; | |
break; | |
} else { | |
// skip past counter | |
fseeko(haystack, ftello(haystack)+sizeof(uint16_t), SEEK_SET); | |
wc = 0; | |
continue; | |
} | |
wc = 0; | |
} | |
} else { | |
// this is a fixed length string | |
if (wc == fixedLen-1) { | |
// this is the end of a word | |
// this is the end of a word which is wc characters long, see if it matches str | |
uint64_t cc = 0; | |
uint64_t is = 0; | |
while (cc<wc) { | |
if (word[cc] == str[cc]) { | |
is++; | |
} | |
cc++; | |
} | |
if (is == cc && is == fixedLen-1) { | |
//printf("\tfound existing word\n"); | |
// word is existing, increment counter which will start at c+12+1 | |
//uint16_t singleCount = (haystack[c+12+1]<<8) | (haystack[c+12+2]<<0); | |
uint16_t singleCount; | |
fread(&singleCount, sizeof(singleCount), 1, haystack); | |
//printf("existing count %u\n", singleCount); | |
singleCount++; | |
fseeko(haystack, ftello(haystack)-sizeof(singleCount), SEEK_SET); | |
fwrite(&singleCount, sizeof(singleCount), 1, haystack); | |
found = 1; | |
break; | |
} else { | |
// skip past counter | |
fseeko(haystack, ftello(haystack)+sizeof(uint16_t), SEEK_SET); | |
wc = 0; | |
continue; | |
} | |
wc = 0; | |
} | |
} | |
word[wc] = byte; | |
wc++; | |
c++; | |
} | |
if (found == 0) { | |
fseeko(haystack, 0, SEEK_END); | |
// add string and 2 byte count of 1 to haystack | |
// file pointer would be at eof | |
uint64_t cc = 0; | |
//printf("adding "); | |
if (fixedLen == 0) { | |
while (cc<strlen(str)) { | |
putc(str[cc], haystack); | |
//printf("%c", str[cc]); | |
cc++; | |
} | |
putc('\0', haystack); | |
} else { | |
while (cc<fixedLen) { | |
putc(str[cc], haystack); | |
//printf("%c", str[cc]); | |
cc++; | |
} | |
} | |
//printf(" ending at %lu\n", ftello(haystack)); | |
// add count of 1 | |
uint16_t singleCount = 1; | |
fwrite(&singleCount, sizeof(singleCount), 1, haystack); | |
// increment count | |
count++; | |
fseeko(haystack, 0, SEEK_SET); | |
fwrite(&count, sizeof(count), 1, haystack); | |
// add cc, NUL and word count to len | |
len += cc+1+sizeof(singleCount); | |
if (fixedLen > 0) { | |
// subtract NUL from len | |
len--; | |
} | |
fseeko(haystack, 4, SEEK_SET); | |
fwrite(&len, sizeof(len), 1, haystack); | |
} | |
return haystack; | |
} | |
FILE *haystackRemove(FILE *haystack, char *haystackFilename, char *tempFilename, char *string, uint64_t fixedLen) { | |
// generate a temporary file to rewrite to | |
FILE *temp; | |
temp = fopen(tempFilename, "wb+"); | |
// copy the first 12 bytes from haystack to temp | |
// this never changes in sort, only when you are adding or removing | |
fseeko(haystack, 0, SEEK_SET); | |
uint64_t c = 0; | |
char byte; | |
while (c<12) { | |
byte = getc(haystack); | |
putc(byte, temp); | |
c++; | |
} | |
uint32_t count = haystackCount(haystack); | |
uint64_t len = haystackLen(haystack); | |
//printf("count %u\n", count); | |
// seek haystack to after count and len | |
fseeko(haystack, 12, SEEK_SET); | |
// find word with current greatest count | |
uint64_t cc = 0LLU; | |
uint64_t wc = 0LLU; | |
char word[200]; | |
uint64_t startsAt = 0LLU; | |
uint64_t didRemove = 0LLU; | |
// loop through the haystack | |
while (cc<len) { | |
if (fixedLen == 0) { | |
// need to have this second block because technically a fixed length string could have \0 as a char | |
if (byte == '\0' && cc > 0) { | |
//printf("remove routine found word %s at %lu\n", word, ftello(haystack)-wc); | |
// check if the word is the same as string | |
if (strncmp(word, string, wc) == 0) { | |
// don't add it | |
//printf("\t is matching word, removing it\n"); | |
fseeko(haystack, ftello(haystack)+2, SEEK_SET); | |
didRemove = wc+(uint64_t)2LLU; | |
} else { | |
// add it | |
c = 0; | |
while (c<wc) { | |
putc(word[c], temp); | |
c++; | |
} | |
// add counter, 2 bytes | |
uint16_t localCount; | |
fread(&localCount, sizeof(localCount), 1, haystack); | |
fwrite(&localCount, sizeof(localCount), 1, temp); | |
} | |
wc = 0; | |
cc+= 2; | |
} | |
} else { | |
// this is fixed length remove | |
if (wc == fixedLen) { | |
// check if the word is the same as string | |
if (strncmp(word, string, wc) == 0) { | |
// don't add it | |
fseeko(haystack, ftello(haystack)+2, SEEK_SET); | |
didRemove = wc+(uint64_t)2LLU; | |
} else { | |
// add it | |
c = 0; | |
while (c<wc) { | |
putc(word[c], temp); | |
c++; | |
} | |
// add counter, 2 bytes | |
uint16_t localCount; | |
fread(&localCount, sizeof(localCount), 1, haystack); | |
fwrite(&localCount, sizeof(localCount), 1, temp); | |
} | |
wc = 0; | |
cc+= 2; | |
} | |
} | |
if (wc == 0) { | |
startsAt = ftello(haystack); | |
} | |
byte = getc(haystack); | |
word[wc] = byte; | |
cc++; | |
wc++; | |
} | |
// must erase current haystack | |
fclose(haystack); | |
// open wb+ so it creates an empty file | |
fopen(haystackFilename, "wb+"); | |
if (didRemove > 0) { | |
// set temp count -1 | |
count = haystackCount(temp); | |
count--; | |
fseeko(temp, 0, SEEK_SET); | |
fwrite(&count, 1, sizeof(count), temp); | |
// set temp len -didRemove | |
len = haystackLen(temp); | |
len -= didRemove; | |
fseeko(temp, 4, SEEK_SET); | |
fwrite(&len, 1, sizeof(len), temp); | |
} | |
fseeko(temp, 0, SEEK_END); | |
uint64_t fileSize = ftello(temp); | |
// now write temp to haystack | |
fseeko(temp, 0, SEEK_SET); | |
uint64_t filePos = 0; | |
while (filePos < fileSize) { | |
byte = getc(temp); | |
putc(byte, haystack); | |
filePos++; | |
} | |
// close and remove temp | |
fclose(temp); | |
remove(tempFilename); | |
// then return the modified haystack | |
return haystack; | |
} | |
FILE *haystackSort(FILE *haystack, char *tempFilename, uint64_t fixedLen) { | |
// generate a temporary file to rewrite to | |
FILE *temp; | |
temp = fopen(tempFilename, "wb+"); | |
// go to start of haystack | |
fseeko(haystack, 0, SEEK_SET); | |
// copy the first 12 bytes from haystack to temp | |
// this never changes in sort, only when you are adding or removing | |
uint64_t c = 0; | |
char byte; | |
while (c<12) { | |
byte = getc(haystack); | |
putc(byte, temp); | |
c++; | |
} | |
c = 0; | |
uint64_t currentTempPos = 12; | |
uint32_t count = haystackCount(haystack); | |
uint64_t len = haystackLen(haystack); | |
//printf("count %u\n", count); | |
// do this haystackCount times | |
while (c<count) { | |
// seek haystack to after count and len | |
fseeko(haystack, 12, SEEK_SET); | |
// find word with current greatest count | |
uint64_t cc = 0; | |
uint64_t wc = 0; | |
uint16_t currentGreatestCount = 0; | |
uint64_t greatestCountPosition = 0; | |
uint8_t word[200]; | |
uint64_t startsAt = 0; | |
while (cc<len) { | |
if (fixedLen == 0) { | |
// need to have this second block because technically a fixed length string could have \0 as a char | |
if (byte == '\0' && cc > 0) { | |
// get the count | |
uint16_t singleCount; | |
fread(&singleCount, sizeof(singleCount), 1, haystack); | |
if (singleCount > currentGreatestCount) { | |
currentGreatestCount = singleCount; | |
greatestCountPosition = startsAt; | |
} | |
wc = 0; | |
cc+= 2; | |
} | |
} else { | |
// this is fixed length sort | |
if (wc == fixedLen) { | |
// get the count | |
uint16_t singleCount; | |
fread(&singleCount, sizeof(singleCount), 1, haystack); | |
if (singleCount > currentGreatestCount) { | |
currentGreatestCount = singleCount; | |
greatestCountPosition = startsAt; | |
} | |
wc = 0; | |
cc+= 2; | |
} | |
} | |
if (wc == 0) { | |
startsAt = ftello(haystack); | |
} | |
byte = getc(haystack); | |
word[wc] = byte; | |
cc++; | |
wc++; | |
} | |
//printf("greatestCountPosition %lu\n", greatestCountPosition); | |
//printf("currentGreatestCount %hu\n", currentGreatestCount); | |
// now add the word at greatestCountPosition to temp | |
fseeko(haystack, greatestCountPosition, SEEK_SET); | |
fseeko(temp, currentTempPos, SEEK_SET); | |
uint64_t cr = 0; | |
while (1) { | |
byte = getc(haystack); | |
//printf("%c %u grabbing from %ld, inserting at %lu\n", byte, byte, ftello(haystack)-1, currentTempPos); | |
putc(byte, temp); | |
if (byte == '\0' && fixedLen == 0) { | |
break; | |
} else if (cr == fixedLen-1) { | |
break; | |
} | |
cr++; | |
} | |
//printf("\n"); | |
// set the current greatest word in haystack's count to 0 | |
uint16_t singleCount = 0; | |
//printf("setting count to 0 in haystack at %ld\n", ftello(haystack)); | |
fwrite(&singleCount, sizeof(singleCount), 1, haystack); | |
// then write the currentGreatestCount to temp | |
fwrite(¤tGreatestCount, sizeof(currentGreatestCount), 1, temp); | |
currentTempPos = ftello(temp); | |
c++; | |
} | |
fseeko(temp, 0, SEEK_END); | |
uint64_t fileSize = ftello(temp); | |
// now write temp to haystack | |
fseeko(haystack, 0, SEEK_SET); | |
fseeko(temp, 0, SEEK_SET); | |
uint64_t filePos = 0; | |
while (filePos < fileSize) { | |
byte = getc(temp); | |
putc(byte, haystack); | |
filePos++; | |
} | |
// close and remove temp | |
fclose(temp); | |
remove(tempFilename); | |
// then return the modified haystack | |
return haystack; | |
} | |
FILE *haystackInit(char *filename) { | |
FILE *haystack = fopen(filename, "wb+"); | |
uint64_t c = 0; | |
while (c<12) { | |
putc(0, haystack); | |
c++; | |
} | |
return haystack; | |
} | |
int main() { | |
FILE *haystack; | |
// init the haystack, expects a filename for argument | |
// and returns a file pointer | |
// it's best to use files which exist on a ramdisk | |
haystack = haystackInit("ramdisk/temp"); | |
uint8_t showFixed = 1; | |
if (showFixed == 0) { | |
// add some variable length strings to the haystack | |
// strings added multiple times will have their counter incremented | |
// and can be sorted by that after all the additions are done | |
uint32_t c = 0; | |
while (c<500) { | |
haystack = haystackAdd(haystack, "test", 0); | |
haystack = haystackAdd(haystack, "test", 0); | |
haystack = haystackAdd(haystack, "wtf", 0); | |
haystack = haystackAdd(haystack, "test", 0); | |
haystack = haystackAdd(haystack, "new", 0); | |
haystack = haystackAdd(haystack, "wtf", 0); | |
haystack = haystackAdd(haystack, "wtf", 0); | |
haystack = haystackAdd(haystack, "wtf", 0); | |
haystack = haystackAdd(haystack, "wtf", 0); | |
haystack = haystackAdd(haystack, "wtf", 0); | |
haystack = haystackAdd(haystack, "wtf", 0); | |
haystack = haystackRemove(haystack, "ramdisk/temp", "ramdisk/temp1", "test", 0); | |
c++; | |
} | |
haystack = haystackSort(haystack, "ramdisk/temp1", 0); | |
} else { | |
// with fixed length strings, they must all be the same length | |
// any longer will be ignored and the nul character wont be in the final result after each string | |
uint32_t c = 0; | |
while (c<5000) { | |
haystack = haystackAdd(haystack, "3f399feca50bb6be3d8c710d458fb328", 32); | |
haystack = haystackAdd(haystack, "3f6243c885a3eb38d83fbfa1dad04a51", 32); | |
haystack = haystackAdd(haystack, "7f3d2d227c6a18135090f93f643ee4ca", 32); | |
haystack = haystackAdd(haystack, "c390cf342008a6466bcab0d7920eff4c", 32); | |
haystack = haystackAdd(haystack, "c390cf342008a6466bcab0d7920eff4c", 32); | |
haystack = haystackRemove(haystack, "ramdisk/temp", "ramdisk/temp1", "3f399feca50bb6be3d8c710d458fb328", 32); | |
c++; | |
} | |
// sort the haystack by count, args haystack, tempFilename, len | |
haystack = haystackSort(haystack, "ramdisk/temp1", 32); | |
} | |
fseeko(haystack, 0, SEEK_END); | |
uint64_t fileSize = ftello(haystack); | |
fseeko(haystack, 0, SEEK_SET); | |
char byte; | |
uint64_t c = 0; | |
while (c < fileSize) { | |
byte = getc(haystack); | |
printf("%c %u\t pos %lu\t ftello %lu\n", byte, byte, c, ftello(haystack)); | |
c++; | |
} | |
fclose(haystack); | |
return 1; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment