Created
August 7, 2024 14:03
-
-
Save daviddavo/02162ec6793889a47b917adb5a0bbdba to your computer and use it in GitHub Desktop.
Example splitting huge CSV files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
This program should split xblock-eth's CSV files into smaller files that fit into
memory.
The program checks the two numbers in the filename and takes N extra digits.
With N=1, it converts a file like 1000to1099_... into 10 files: 1000to1019_..., etc.
*/
#include <stdlib.h> | |
#include <stdio.h> | |
#include <string.h> | |
#include <errno.h> | |
// #include "daverror.h" | |
// Size of every line buffer. `wc --max-line-length` on the input reported a
// longest line of 296 characters, so 512 leaves comfortable headroom.
#define MAX_LINE_LENGTH 512
// Size (terminating NUL included) of the buffers that hold the digit strings
// taken from the file name and from the first CSV column.
#define MAX_DIGITS_LENGTH 16
/*
 * Exchange the contents of two lvalues through a byte-wise temporary.
 *
 * Both arguments must be lvalues of the same size: a size mismatch turns the
 * temporary into an array of negative length, which is a compile-time error.
 * Each argument is evaluated more than once, so it must not have side effects.
 */
#define swap(x, y) do \
    { unsigned char swap_temp[sizeof(x) == sizeof(y) ? (signed)sizeof(x) : -1]; \
      memcpy(swap_temp, &(y), sizeof(x)); \
      memcpy(&(y), &(x), sizeof(x)); \
      memcpy(&(x), swap_temp, sizeof(x)); \
    } while (0)
/* Check a pointer-valued expression and abort, reporting the call site, if it is NULL. */
#define DAVO_CHECKPTR(n) __davo_checkfile((n), __FILE__, __LINE__)

/*
 * Terminate the program when `ptr` is NULL.
 *
 *   ptr  pointer to test; nothing happens when it is non-NULL.
 *   f    source file name to report.
 *   l    source line number to report.
 *
 * On NULL, the text and number of the current errno plus the given location
 * are written to stderr and the process exits with EXIT_FAILURE.
 *
 * NOTE(review): identifiers containing "__" are reserved for the
 * implementation; the name is kept so existing references keep working.
 */
void __davo_checkfile(const void *ptr, const char f[], const int l) {
    /* Capture errno first so the stdio calls below cannot disturb it. */
    const int saved_errno = errno;

    if (ptr != NULL) {
        return;
    }

    fprintf(stderr, "Error: %s (%d)\n", strerror(saved_errno), saved_errno);
    fprintf(stderr, "@ %s:%d\n", f, l);
    exit(EXIT_FAILURE);
}
/*
 * Split a file name of the form "<from>to<to>_<rest>" into its three parts.
 *
 *   fname  NUL-terminated name to parse, e.g. "1000to1099_blocks.csv".
 *   from   receives the text before the first 't';
 *          must have room for MAX_DIGITS_LENGTH bytes.
 *   to     receives the text between the two-character separator and the
 *          next '_'; must have room for MAX_DIGITS_LENGTH bytes.
 *   rest   receives everything after that '_';
 *          must have room for MAX_LINE_LENGTH bytes.
 *
 * All three outputs are always NUL-terminated. A part that is missing from
 * `fname` yields an empty string, and a part that is too long for its buffer
 * is truncated instead of overflowing it.
 */
void parse_fname(const char *fname, char *from, char *to, char *rest) {
    const size_t digits_cap = MAX_DIGITS_LENGTH - 1;
    const size_t rest_cap = MAX_LINE_LENGTH - 1;
    const char *p = fname;
    size_t n;

    /* First number: everything up to the 't' that starts the separator. */
    n = 0;
    while (*p != '\0' && *p != 't') {
        if (n < digits_cap) {
            from[n++] = *p;
        }
        p++;
    }
    from[n] = '\0';

    /* Skip the 't' and the single character after it (the 'o' of "to"). */
    if (*p != '\0') {
        p++;
    }
    if (*p != '\0') {
        p++;
    }

    /* Second number: everything up to the '_' that introduces the rest. */
    n = 0;
    while (*p != '\0' && *p != '_') {
        if (n < digits_cap) {
            to[n++] = *p;
        }
        p++;
    }
    to[n] = '\0';

    if (*p == '_') {
        p++;
    }

    /* Remainder of the name, copied verbatim. */
    n = 0;
    while (*p != '\0') {
        if (n < rest_cap) {
            rest[n++] = *p;
        }
        p++;
    }
    rest[n] = '\0';
}
/*
 * Count how many leading characters two strings have in common.
 *
 *   str1, str2  NUL-terminated strings to compare.
 *   max_n       upper limit on the number of characters examined.
 *
 * Returns the length of the shared prefix, never more than max_n. The scan
 * stops at the end of either string, so equal strings shorter than max_n
 * return their own length and no byte past a terminator is read.
 */
int common_prefix_length(const char *str1, const char *str2, const int max_n) {
    int i = 0;

    /* Test the bound and the terminator before comparing the characters. */
    while (i < max_n && str1[i] != '\0' && str1[i] == str2[i]) {
        i++;
    }
    return i;
}
/*
 * Build the output file name "<from>to<to>_<rest>" for a CSV line.
 *
 *   line          CSV line; its first field (up to the first ',') is used.
 *   fname         destination buffer; must have room for MAX_LINE_LENGTH bytes.
 *   extra_digits  how many trailing characters of the first field are
 *                 replaced by '9' to form <to>; clamped to the range
 *                 [0, length of the field].
 *   rest          suffix appended after the '_'.
 *
 * <from> is the first field itself. The field scan also stops at a newline,
 * at the end of the string, and once MAX_DIGITS_LENGTH - 1 characters have
 * been copied, so the local buffers cannot overflow.
 *
 * Returns 0 on success, or -1 if the name did not fit into `fname`, in which
 * case `fname` holds a truncated, NUL-terminated name.
 */
int line_to_fname(const char *line, char *fname, int extra_digits, const char *rest) {
    char from[MAX_DIGITS_LENGTH];
    char to[MAX_DIGITS_LENGTH];
    int len = 0;

    while (len < MAX_DIGITS_LENGTH - 1 && line[len] != ',' && line[len] != '\n'
           && line[len] != '\0') {
        from[len] = to[len] = line[len];
        len++;
    }
    from[len] = to[len] = '\0';

    /* Keep the '9' fill inside the field, whatever the caller asked for. */
    if (extra_digits < 0) {
        extra_digits = 0;
    }
    if (extra_digits > len) {
        extra_digits = len;
    }
    memset(to + (len - extra_digits), '9', (size_t)extra_digits);

    const int written = snprintf(fname, MAX_LINE_LENGTH, "%sto%s_%s", from, to, rest);
    if (written < 0 || written >= MAX_LINE_LENGTH) {
        return -1;
    }
    return 0;
}
int main(int argc, char* argv[]) { | |
FILE * fi = NULL, *fo = NULL; | |
int required_cpl, extra_digits, index, lastindex = -1; | |
char header[MAX_LINE_LENGTH]; | |
char auxbuf1[MAX_LINE_LENGTH] = "\0"; | |
char auxbuf2[MAX_LINE_LENGTH] = "\0"; | |
char * line = auxbuf1, * lastline = auxbuf2; | |
char from[MAX_DIGITS_LENGTH], to[MAX_DIGITS_LENGTH], rest[MAX_LINE_LENGTH]; | |
if (argc < 2) { | |
fprintf(stderr, "Usage: %s filename [extra_digits=1]\n", argv[0]); | |
return EXIT_FAILURE; | |
} | |
parse_fname(argv[1], from, to, rest); | |
extra_digits = (argc==3)?atoi(argv[2]):1; | |
required_cpl = extra_digits + common_prefix_length(from, to, MAX_DIGITS_LENGTH); | |
printf("from: %s, to: %s, rest: %s, cpl: %d, rcpl: %d\n", from, to, rest, common_prefix_length(from, to, MAX_DIGITS_LENGTH), required_cpl); | |
DAVO_CHECKPTR(fi = fopen(argv[1], "r")); | |
// Save header to variable | |
DAVO_CHECKPTR(fgets(header, MAX_LINE_LENGTH, fi)); | |
printf("Saved header: %s", header); | |
while (fgets(line, MAX_LINE_LENGTH, fi) != NULL) { | |
// If first line or should split | |
if (lastline[0] == '\0' || (lastline[0] != '\0' && common_prefix_length(line, lastline, required_cpl) < required_cpl)) { | |
char output_fname[MAX_LINE_LENGTH]; | |
if (fo != NULL) fclose(fo); | |
// Open new file and write header | |
line_to_fname(line, output_fname, extra_digits, rest); | |
printf("Split to %s\n", output_fname); | |
DAVO_CHECKPTR(fo = fopen(output_fname, "w")); | |
// Write header | |
fputs(header, fo); | |
} | |
fputs(line, fo); | |
swap(line, lastline); | |
} | |
fclose(fi); | |
fclose(fo); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment