Last active
April 22, 2024 11:37
-
-
Save mofosyne/81c94740c0f33259606afa823562914c to your computer and use it in GitHub Desktop.
Alternative to strtok with escaped-character support, for delimiting a string on a single char (e.g. CSV or PSV)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
// Tokenise a string in place, strtok-style, with support for a backslash-escaped
// delimiter: the two-character sequence '\' + delim[0] is collapsed to a literal
// delim[0] inside the token instead of ending it.
// https://gist.github.com/mofosyne/81c94740c0f33259606afa823562914c
//
// str   - buffer to tokenise on the first call (it is modified), or NULL to
//         continue with the buffer from the previous call.
// delim - non-NULL string of delimiter characters. Any character in it ends a
//         token, but only its first character can be escaped (the intended use
//         is a single-character delimiter such as "|").
//
// Returns a pointer to the next NUL-terminated token inside the buffer, or NULL
// when nothing is left. An empty field between two adjacent delimiters is
// returned as ""; an empty remainder (empty input, or nothing after a trailing
// delimiter) yields NULL. A backslash that is not followed by delim[0] is kept
// verbatim, so sequences such as "\n" or "\\" pass through untouched.
//
// The position is kept in a static variable, so only one string can be
// tokenised at a time.
char *strtok_escaped(char *str, const char *delim) {
    // Where the next token starts, or NULL once the current string is exhausted.
    static char *resume_at = NULL;

    char *token_start = (str != NULL) ? str : resume_at;
    if (token_start == NULL)
        return NULL;

    // Compact the token in place: src scans the input, dst trails behind it by
    // one position for every escape backslash that has been dropped.
    char *src = token_start;
    char *dst = token_start;
    while (*src != '\0') {
        if (src[0] == '\\' && src[1] != '\0' && src[1] == delim[0]) {
            // Escaped delimiter: keep the delimiter, drop the backslash.
            // The src[1] != '\0' guard stops an empty delim from matching the terminator.
            *dst++ = src[1];
            src += 2;
        } else if (strchr(delim, *src) != NULL) {
            // Unescaped delimiter: terminate the token and remember where to resume.
            *dst = '\0';
            resume_at = src + 1;
            return token_start;
        } else {
            *dst++ = *src++;
        }
    }

    // Reached the end of the string: there is nothing to resume from, so clear
    // the state rather than leaving it pointing into this or an earlier buffer.
    resume_at = NULL;
    if (dst == token_start)
        return NULL;
    *dst = '\0';
    return token_start;
}
// Self-test driver for strtok_escaped(): tokenises each sample input on '|'
// and compares the tokens against the expected list.
// Returns 0 when every case passes and 1 otherwise.
int main(void) {
    enum { MAX_TOKENS = 10, BUF_SIZE = 512 };

    // Define test cases
    typedef struct {
        const char *input;
        const char *expected_tokens[MAX_TOKENS]; // NULL-terminated unless all slots are used
    } Test;

    const char *delim = "|";
    Test tests[] = {
        {"apple|banana|cherry", {"apple", "banana", "cherry", NULL}},
        {"apple\\|banana|cherry", {"apple|banana", "cherry", NULL}},
        {"apple|banana|cherry\\", {"apple", "banana", "cherry\\", NULL}},
        {"apple\\|banana|cherry\\", {"apple|banana", "cherry\\", NULL}},
        {"apple|banana\\|cherry", {"apple", "banana|cherry", NULL}},
        {"apple\\|banana\\|cherry", {"apple|banana|cherry", NULL}},
        {"apple\\\\|banana|cherry", {"apple\\|banana", "cherry", NULL}},
        {"apple\\|banana\\|cherry\\", {"apple|banana|cherry\\", NULL}},
        {"apple|ba\\nana|cherry", {"apple", "ba\\nana", "cherry", NULL}},
        {"", {NULL}} // Empty string test case
    };
    int num_tests = (int)(sizeof(tests) / sizeof(tests[0]));
    int failed_tests = 0;

    // Iterate through each test case
    for (int i = 0; i < num_tests; i++) {
        printf("\nTest Case %d: '%s'\n", i, tests[i].input);
        int failed_sub_tests = 0;

        // Copy the input into a writable buffer: the tokenizer modifies its
        // argument and the inputs are string literals.
        char str[BUF_SIZE];
        int written = snprintf(str, sizeof str, "%s", tests[i].input);
        if (written < 0 || (size_t)written >= sizeof str) {
            printf("Input too long for buffer\n");
            failed_sub_tests++;
        }

        // Tokenize the string and compare tokens with expected tokens.
        // The index bound keeps a fully populated expected list from being
        // read past its last slot.
        int token_index = 0;
        char *token = strtok_escaped(str, delim);
        while (token != NULL && token_index < MAX_TOKENS &&
               tests[i].expected_tokens[token_index] != NULL) {
            if (strcmp(token, tests[i].expected_tokens[token_index]) != 0) {
                printf("Token Mismatch - got '%s' but expecting '%s' - failed\n", token, tests[i].expected_tokens[token_index]);
                failed_sub_tests++;
            } else {
                printf("Token: '%s' - ok\n", token);
            }
            token_index++;
            token = strtok_escaped(NULL, delim);
        }

        // The counts differ if either side still has something left:
        // an unmatched expected token (too few) or an unconsumed token (too many).
        int expected_left = token_index < MAX_TOKENS &&
                            tests[i].expected_tokens[token_index] != NULL;
        if (expected_left || token != NULL) {
            printf("Incorrect number of tokens\n");
            failed_sub_tests++;
        }

        if (failed_sub_tests) {
            failed_tests++;
            printf("FAILED\n");
        } else {
            printf("PASSED\n");
        }
    }

    if (failed_tests > 0) {
        printf("\n%d test(s) failed.\n", failed_tests);
        return 1;
    }
    printf("\nAll tests passed.\n");
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
When running the above you will get something like below
Note: I do not unescape any other characters (such as \n) because in the intended application (specifically https://github.com/psv-format/psv.c) the resulting token is simply copied into a JSON data field.