Skip to content

Instantly share code, notes, and snippets.

@tomthorogood
Last active December 16, 2015 00:09
Show Gist options
  • Save tomthorogood/5345311 to your computer and use it in GitHub Desktop.
Save tomthorogood/5345311 to your computer and use it in GitHub Desktop.
#include <cctype>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <limits>
#include <string>
#include <vector>
// Marker pushed in front of every packed string in the token stream.
// NOTE: a numeric token with this same value is indistinguishable from the
// marker once it is in the stream; the decoder below only guards against
// reading out of range, it cannot resolve that ambiguity.
#define PACKING_DELIMITER 999999

// Returns a pointer to the first ' ' at or after `from`, or to the
// terminating NUL when the rest of the input contains no space.
static const char* findTokenEnd(const char* from)
{
    const char* space = std::strchr(from, ' ');
    return space ? space : from + std::strlen(from);
}

// Converts the characters in [begin, end) to an int, clamping values that do
// not fit instead of invoking the undefined behaviour atoi has on overflow.
// Like atoi, parsing stops at the first non-digit ("42abc" yields 42).
static int parseNumber(const char* begin, const char* end)
{
    const std::string digits(begin, end);
    const long value = std::strtol(digits.c_str(), NULL, 10);
    if (value > std::numeric_limits<int>::max())
        return std::numeric_limits<int>::max();
    if (value < std::numeric_limits<int>::min())
        return std::numeric_limits<int>::min();
    return static_cast<int>(value);
}

// Appends the bytes in [begin, end) to `tokens` as:
//   PACKING_DELIMITER, byte count, then the bytes packed four per int,
//   lowest byte first. A trailing partially filled int is emitted too.
static void appendPackedString(std::vector<int>& tokens,
                               const char* begin, const char* end)
{
    tokens.push_back(PACKING_DELIMITER);
    tokens.push_back(static_cast<int>(end - begin));

    unsigned int packed = 0;
    int slot = 0;
    for (const char* p = begin; p != end; ++p)
    {
        // Go through unsigned char first: a plain (signed) char >= 0x80
        // would be sign-extended and overwrite the higher bytes.
        const unsigned int byte = static_cast<unsigned char>(*p);
        packed |= byte << (8 * slot);
        if (++slot == 4)
        {
            tokens.push_back(static_cast<int>(packed));
            packed = 0;
            slot = 0;
        }
    }
    if (slot != 0)
        tokens.push_back(static_cast<int>(packed));
}

// Prints the token stream with packed strings decoded back to text, one
// token per line. A PACKING_DELIMITER that is not followed by a length and
// enough packed ints is printed as an ordinary number instead of being
// decoded, so the function never reads outside the vector.
static void printInterpretedTokens(const std::vector<int>& tokens)
{
    const std::size_t count = tokens.size();
    std::size_t i = 0;
    while (i < count)
    {
        if (tokens[i] == PACKING_DELIMITER && i + 1 < count && tokens[i + 1] >= 0)
        {
            const std::size_t byteCount = static_cast<std::size_t>(tokens[i + 1]);
            const std::size_t wordCount = (byteCount + 3) / 4;
            if (wordCount <= count - (i + 2))
            {
                for (std::size_t b = 0; b < byteCount; ++b)
                {
                    const unsigned int word =
                        static_cast<unsigned int>(tokens[i + 2 + b / 4]);
                    const char unpacked =
                        static_cast<char>((word >> (8 * (b % 4))) & 0xFFu);
                    if (unpacked != 0)
                        std::cout << unpacked;
                }
                std::cout << std::endl;
                i += 2 + wordCount;
                continue;
            }
        }
        std::cout << tokens[i] << std::endl;
        ++i;
    }
}

// Splits `input` into tokens and prints them to std::cout twice: first the
// raw int stream, then the stream decoded back into numbers and strings.
//
// Tokens are separated by ' ' (runs of spaces are skipped):
//   * a token starting with a decimal digit is stored as a single int;
//   * a token starting with '"' runs through the matching closing quote
//     (quotes included), or to the end of input if the quote is never closed;
//   * any other token runs up to the next space or the end of input.
// String tokens are stored via appendPackedString(). A null `input` is
// treated as an empty string.
void tokenize(const char* input)
{
    std::vector<int> tokens;
    const char* cursor = input ? input : "";

    while (*cursor != '\0')
    {
        if (*cursor == ' ')
        {
            ++cursor;
            continue;
        }

        const char* tokenEnd;
        // isdigit requires a value representable as unsigned char.
        if (std::isdigit(static_cast<unsigned char>(*cursor)))
        {
            tokenEnd = findTokenEnd(cursor);
            tokens.push_back(parseNumber(cursor, tokenEnd));
        }
        else
        {
            if (*cursor == '"')
            {
                const char* closing = std::strchr(cursor + 1, '"');
                tokenEnd = closing ? closing + 1 : cursor + std::strlen(cursor);
            }
            else
            {
                tokenEnd = findTokenEnd(cursor);
            }
            appendPackedString(tokens, cursor, tokenEnd);
        }
        // tokenEnd is either a space (skipped on the next pass), the start
        // of the next token, or the terminating NUL; never beyond it.
        cursor = tokenEnd;
    }

    // debug & testing
    std::cout << std::endl << "===INTERNAL TOKENS===" << std::endl;
    for (std::size_t i = 0; i < tokens.size(); ++i)
        std::cout << tokens[i] << std::endl;

    std::cout << std::endl << "===INTERPRETED TOKENS===" << std::endl;
    printInterpretedTokens(tokens);
}
int main()
{
    // Demo input: plain numbers, two words containing non-ASCII characters,
    // and one double-quoted literal with embedded spaces.
    const char sample[] = "42 h✓y hቘha \"this is a string literal\" 56 1 92 5291";

    std::cout << "INPUT STRING: ";
    std::cout << sample << std::endl;
    tokenize(sample);

    // Output recorded by the gist author for the input above. Note that the
    // non-ASCII characters came back as replacement characters in that run.
    /*
INPUT STRING: 42 h✓y hቘha "this is a string literal" 56 1 92 5291
===INTERNAL TOKENS===
42
999999
5
-7576
121
999999
6
-7832
24936
999999
26
1768453154
1936269427
1931501856
1852404340
1768693863
1634887028
8812
56
1
92
5291
===INTERPRETED TOKENS===
42
h���y
h���ha
"this is a string literal"
56
1
92
5291
*/
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment