Created
May 7, 2015 04:49
-
-
Save lachesis/39cf67044f651ae24bcc to your computer and use it in GitHub Desktop.
Naive implementation of CSV parsing using nested strtok and a mmap'd file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- a/Makefile 2010-08-23 08:46:24.000000000 -0700 | |
+++ b/Makefile 2015-05-06 21:20:20.177910661 -0700 | |
@@ -100,15 +100,15 @@ | |
################ | |
## CHANGE NAME OF ANSI COMPILER HERE | |
################ | |
-CC = | |
+CC = gcc | |
# Current values for DATABASE are: INFORMIX, DB2, TDAT (Teradata) | |
# SQLSERVER, SYBASE, ORACLE | |
# Current values for MACHINE are: ATT, DOS, HP, IBM, ICL, MVS, | |
# SGI, SUN, U2200, VMS, LINUX, WIN32 | |
# Current values for WORKLOAD are: TPCH | |
-DATABASE= | |
-MACHINE = | |
-WORKLOAD = | |
+DATABASE= ORACLE | |
+MACHINE = LINUX | |
+WORKLOAD = TPCH | |
# | |
CFLAGS = -g -DDBNAME=\"dss\" -D$(MACHINE) -D$(DATABASE) -D$(WORKLOAD) -DRNG_TEST -D_FILE_OFFSET_BITS=64 | |
LDFLAGS = -O |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Naive implementation of CSV parsing using nested strtok and a mmap'd file. | |
To compile, save as "strtok.c" and run "gcc -O3 -o strtok strtok.c" | |
To generate the test data file, follow these instructions: | |
1. Fetch the TPCH tools from "http://www.tpc.org/tpch/spec/tpch_2_12_0_b5.zip" | |
2. Unzip them into a new, empty directory (the archive has no top-level directory, so its files land directly in the current directory)
3. Copy "makefile.suite" to "Makefile" | |
4. Apply the patch given below to the "Makefile" | |
5. Run make to build dbgen | |
6. Generate the lineitem.tbl file: ./dbgen -T L | |
The output file should be 759,863,287 bytes. Update the LENGTH in this file's source if it's different. | |
Runs in 2.267 seconds (when writing to /dev/null) on my machine. You don't have to use a GPU or write hundreds of lines of code to get good performance. | |
*/ | |
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#define LENGTH 759863287 | |
int main(int argc, char *argv[]) | |
{ | |
char *addr; | |
int fd; | |
char *str1, *str2, *token, *subtoken; | |
char *saveptr1, *saveptr2; | |
// Open and map the file | |
fd = open("lineitem.tbl", O_RDONLY); | |
addr = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); | |
// Two-level strtok - newlines, then | | |
for (str1 = addr; ; str1 = NULL) { | |
token = strtok_r(str1, "\n", &saveptr1); | |
if (token == NULL) | |
break; | |
for (str2 = token; ; str2 = NULL) { | |
subtoken = strtok_r(str2, "|", &saveptr2); | |
if (subtoken == NULL) | |
break; | |
// Process the tokens here | |
printf("%s\n", subtoken); | |
//printf(" --> %s\n", subtoken); | |
} | |
} | |
exit(EXIT_SUCCESS); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment