Last active
July 3, 2022 15:09
-
-
Save wernsey/721865a68d15e32281ffb6e295bc54b8 to your computer and use it in GitHub Desktop.
CSV stream reader - single header library
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
#include <errno.h> | |
#define CSV_STATIC | |
#define CSV_IMPLEMENTATION | |
#include "csvstrm.h" | |
int main(int argc, char *argv[]) { | |
CsvContext csv; | |
FILE *f; | |
if(argc < 2) { | |
fprintf(stderr, "CSV file expected\n"); | |
return 1; | |
} | |
f = fopen(argv[1], "r"); | |
if(!f) { | |
fprintf(stderr, "Unable to open '%s': %s\n", argv[1], strerror(errno)); | |
return 1; | |
} | |
#if 1 | |
csv_context_file(&csv, f); | |
#else | |
struct csv_read_limit ll; | |
ll.f = f; | |
ll.limit = 41; | |
csv_context_file_limit(&csv, &ll); | |
#endif | |
while(csv_read_record(&csv)) { | |
int j; | |
if(csv_get_error(&csv) != CSV_OK) { | |
fprintf(stderr, "error: %d\n", csv_get_error(&csv)); | |
break; | |
} | |
for(j = 0; j < csv_count(&csv); j++) { | |
printf("[%s]", csv_field(&csv,j)); | |
} | |
printf("\n"); | |
} | |
fclose(f); | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef CSV_STREAM_H
#define CSV_STREAM_H /* completes the include guard so a second inclusion is skipped */
#ifdef __cplusplus | |
extern "C" { | |
#endif | |
/** | |
* **CSV stream reader** | |
* | |
* Single header library to read a [CSV file][wiki] row-by-row. | |
* | |
* To use this library, define `CSV_IMPLEMENTATION` before including | |
* **csvstrm.h** in _one_ of your C files (other C files include | |
* **csvstrm.h** normally), like so: | |
* | |
* ```c | |
* #include <stdio.h> | |
* | |
* #define CSV_IMPLEMENTATION | |
* #include "csvstrm.h" | |
* ``` | |
* | |
* It will parse CSV documents as specified by [RFC4180][RFC], but it | |
* follows the rule of _"be liberal in what you accept from others"_, | |
* so there are a couple of deviations: | |
* | |
* * Leading and trailing whitespaces in each field are trimmed by default. | |
* * This behaviour can be changed by defining `CSV_TRIM` as 0. | |
* * You can have spaces before and after the quotes in a quoted field. | |
* * Double quotes inside unquoted fields are allowed. | |
* * Records can end with CRLF or with LF character sequences. | |
* * It does not enforce that all records (rows) have the same number of | |
* fields. That is an application concern. | |
* * It does not specify whether the first row contains headers. That is | |
* left up to the application. | |
* | |
* It also took some ideas from the [Repici][] document cited by the [RFC][]. | |
* | |
* The documentation for this file is generated by extracting the comments | |
* into a [Markdeep][] document. Here is an [Awk script][doc-script] that | |
* does this. | |
* | |
* # Basic usage example | |
* | |
* Here is a simple usage example. Some error handling code has been omitted. | |
* | |
* ```c | |
* int j; | |
* CsvContext csv; | |
* FILE *f = fopen(argv[1], "r"); | |
* | |
* // Call csv_context_file() to initialise the CsvContext object | |
* // to read CSV data from an open file.
* csv_context_file(&csv, f); | |
* | |
* // csv_read_record() reads a row from the file. | |
* // It will return 0 when it reaches the end of the file | |
* while(csv_read_record(&csv)) { | |
* // You can use csv_count() to retrieve the number of fields | |
* // read from the file. | |
* // csv_field() can then be used to access an individual field. | |
* for(j = 0; j < csv_count(&csv); j++) { | |
* printf("[%s]", csv_field(&csv,j)); | |
* } | |
* printf("\n"); | |
* } | |
* fclose(f); | |
* ``` | |
* | |
* # License | |
* | |
* This code is dedicated to the public domain by the author, Werner Stoop. | |
* | |
* If, for some reason, you cannot use the above public domain dedication | |
* then [the following license][fsfap] can be applied: | |
* | |
* ```txt | |
* (c) 2022 Werner Stoop | |
* Copying and distribution of this file, with or without modification, | |
* are permitted in any medium without royalty provided the copyright | |
* notice and this notice are preserved. This file is offered as-is, | |
* without any warranty. | |
* ``` | |
* | |
* [wiki]: https://en.wikipedia.org/wiki/Comma-separated_values | |
* [RFC]: https://datatracker.ietf.org/doc/html/rfc4180 | |
* [Repici]: https://www.creativyst.com/Doc/Articles/CSV/CSV01.shtml | |
* [fsfap]: https://en.wikipedia.org/wiki/GNU_All-permissive_License | |
* [Markdeep]: https://casual-effects.com/markdeep/ | |
* [doc-script]: https://gist.github.com/wernsey/de253d42a8df6f3b21358e4b5422b955 | |
*/ | |
/** | |
* # Configuration | |
* | |
* These macros can be defined before including **csvstrm.h** in your | |
* C file to control the behaviour of the library. | |
* | |
* `CSV_DELIMITER` | |
* : The delimiter to separate fields (columns) in each record (row) | |
* It defaults to `','`. | |
* `CSV_BUFFER_SIZE` | |
* : While each record is being read, the characters from the file | |
* are copied to an internal buffer. This controls the size of | |
* that internal buffer. | |
* `CSV_READ_BUFFER_SIZE` | |
* : This controls the size of the second internal buffer that | |
* stores raw bytes as they are read from the input before they're | |
* processed. | |
* `CSV_MAX_FIELDS` | |
* : The maximum number of fields expected per record. | |
* `CSV_TRIM` | |
* : Determines whether leading and trailing whitespace characters will | |
* be trimmed from fields by the parser. | |
* For example, consider a CSV section `..., foo ,...`. If `CSV_TRIM` is | |
* non-zero the field will be returned as `"foo"`. If it is 0 then the
* whitespace will be left intact, so it will be returned as `" foo "` | |
* | |
* These macros _must_ be the same in all files that include **csvstrm.h**.
*/ | |
/* Defaults for every configuration macro the user did not define
   before including this header. */
#if !defined(CSV_DELIMITER)
#  define CSV_DELIMITER ','
#endif

#if !defined(CSV_BUFFER_SIZE)
#  define CSV_BUFFER_SIZE 256
#endif

#if !defined(CSV_READ_BUFFER_SIZE)
#  define CSV_READ_BUFFER_SIZE 64
#endif

#if !defined(CSV_MAX_FIELDS)
#  define CSV_MAX_FIELDS 32
#endif

#if !defined(CSV_TRIM)
#  define CSV_TRIM 1
#endif
/** | |
* # Definitions | |
* | |
* ## `csv_read_data_fun` | |
* | |
* `typedef int (*csv_read_data_fun)(char *b, int n, void *d);` | |
* | |
* Prototype for functions that can read CSV data. | |
* | |
* `b` is a pointer to a buffer that will be filled with chars from | |
* the input files. `n` contains the size in bytes of the buffer. | |
* `d` is a pointer to some structure where the data is read from. | |
* | |
* For example, when reading a CSV file from a ZIP archive, `d` might | |
* point to the structure that the ZIP library uses to encapsulate the archive.
* | |
* The function should return 0 if it reaches the end of the input data, | |
* non-zero otherwise. | |
* | |
* See `csv_context_custom()` in Section [initialising the csvcontext] | |
*/ | |
/* NOTE: the parser treats a '\0' byte in the buffer as the end of the
   data that was delivered, so an implementation must NUL-terminate what
   it writes into `b` (i.e. store at most `n - 1` data bytes). */
typedef int (*csv_read_data_fun)(char *b, int n, void *d);
/** | |
* ## `enum csv_error_code` | |
* | |
* `CSV_OK` | |
* : No error | |
* `CSV_ERR_BUFFER` | |
* : The buffer used to store field data internally is full. | |
* It is too small for the record (row) you're reading. | |
* Increase `CSV_BUFFER_SIZE`. | |
* `CSV_ERR_FIELDS` | |
* : There are too many fields (columns) in the record. | |
* Increase `CSV_MAX_FIELDS`. | |
* `CSV_ERR_BAD_QUOTE` | |
* : A quoted field is incorrectly formatted. | |
* `CSV_ERR_LINE_END` | |
* : There is a problem with a line ending. | |
* | |
*/ | |
enum csv_error_code {
    CSV_OK            = 0,  /* no error */
    CSV_ERR_BUFFER    = 1,  /* field buffer is full; increase CSV_BUFFER_SIZE */
    CSV_ERR_FIELDS    = 2,  /* too many fields; increase CSV_MAX_FIELDS */
    CSV_ERR_BAD_QUOTE = 3,  /* a quoted field is incorrectly formatted */
    CSV_ERR_LINE_END  = 4   /* there is a problem with a line ending */
};
/** | |
* ## `typedef struct CsvContext CsvContext;` | |
* | |
* Structure that contains the state of the CSV stream parser. | |
* | |
* The fields in the structure should not be manipulated directly, | |
* but these are some members of interest: | |
* | |
* `char *fields[CSV_MAX_FIELDS]` | |
* : The array of pointers that contain the fields after parsing a record. | |
* Rather use `csv_field()` to access the individual fields. | |
* `int nf` | |
* : The number of fields parsed from a record. | |
* Rather use `csv_count()` to read this value. | |
* `enum csv_error_code err` | |
* : An error code that may have resulted from parsing the record. | |
* Rather use `csv_get_error()` to retrieve this value. | |
* | |
* Section [initialising the csvcontext] below describes how to | |
* initialise the structure to read CSV data. | |
*/ | |
typedef struct CsvContext { | |
/* Determines where the data is read from */ | |
csv_read_data_fun get_data; | |
void *data; | |
/* The internal buffer, where bytes are read into | |
from the file, but before they're processed. */ | |
char raw_buffer[CSV_READ_BUFFER_SIZE]; | |
int in_pos; | |
int last_char; | |
/* Where the data for the fields are stored. | |
The values in `fields` are a pointers into this buffer */ | |
char buffer[CSV_BUFFER_SIZE]; | |
/* The fields that have been parsed from the file */ | |
char *fields[CSV_MAX_FIELDS]; | |
int nf; | |
/* Error code? */ | |
enum csv_error_code err; | |
} CsvContext; | |
#ifdef EOF /* EOF will be defined if <stdio.h> is #included */ | |
/** | |
* # Initialising the `CsvContext` | |
* | |
* `void csv_context_file(CsvContext *csv, FILE *file)` | |
* | |
* Initialises a `CsvContext` structure to read data from a file | |
* pointed to by `file`. | |
*/ | |
void csv_context_file(CsvContext *csv, FILE *file); | |
/** | |
* `void csv_context_file_limit(CsvContext *csv, struct csv_read_limit *ll)` | |
* | |
* Initialises a `CsvContext` structure to read data from a file, but it will | |
* only read a limited number of bytes from the file. | |
* | |
* (The intended use-case is where a CSV file has been concatenated with other | |
* files into an archive file) | |
* | |
* The `csv_read_limit` structure is defined as follows: | |
* | |
* ``` | |
* struct csv_read_limit { | |
* FILE *f; | |
* int limit; | |
* }; | |
* ``` | |
* | |
* where `f` is the file to read from and `limit` is the maximum | |
* number of bytes that will be read from a file. | |
*/ | |
struct csv_read_limit {
    FILE *f;    /* the file to read from */
    int limit;  /* bytes that may still be read; decreases as data is consumed */
};
void csv_context_file_limit(CsvContext *csv, struct csv_read_limit *ll); | |
#endif | |
/** | |
* `void csv_context_custom(CsvContext *csv, csv_read_data_fun fun, void *data)` | |
* | |
* Initialises a `CsvContext` with a custom function `fun` that will read bytes | |
* from an object `data`. | |
*/ | |
void csv_context_custom(CsvContext *csv, csv_read_data_fun fun, void *data); | |
/** | |
* # Reading records | |
* | |
* `int csv_read_record(CsvContext *csv)` | |
* | |
* Reads a record from the CSV file. | |
* | |
* It returns the number of fields that were read from the record, or 0
* when the end of the input has been reached. If parsing stops early
* because of an error, only the fields completed so far are counted and
* `csv_get_error()` can be used to retrieve the error code.
* | |
* `int csv_count(CsvContext *csv)` | |
* | |
* Get the number of fields in the last record that was read by | |
* `csv_read_record()`. | |
* | |
* `const char *csv_field(CsvContext *csv, int i)` | |
* | |
* Get the `i`'th field of the last record that was read by | |
* `csv_read_record()`. | |
* | |
* `enum csv_error_code csv_get_error(CsvContext *csv)` | |
* | |
* Retrieves an error code (if any) from the `CsvContext`. | |
* The error codes are described in Subsection [enum csv_error_code]. | |
*/ | |
int csv_read_record(CsvContext *csv); | |
int csv_count(CsvContext *csv); | |
const char *csv_field(CsvContext *csv, int i); | |
enum csv_error_code csv_get_error(CsvContext *csv); | |
/* *********************************************************************** */ | |
# ifdef CSV_IMPLEMENTATION | |
#include <stdio.h> | |
#include <string.h> | |
#include <ctype.h> | |
#include <assert.h> | |
/* CAST(type, expr): explicit cast when compiled as C++ (which has no
   implicit conversion from `void *`), plain expression when compiled as C.
   The argument and the expansion are parenthesised so the macro is safe
   inside larger expressions. */
#ifdef __cplusplus
#  define CAST(x, y) ((x)(y))
#else
#  define CAST(x, y) (y)
#endif
/*
 * Returns the next input character as an `unsigned char` converted to
 * `int` (the same convention as `getc()`), or EOF once the input is
 * exhausted.
 *
 * The byte is deliberately not passed through a plain `char`: where
 * `char` is signed, the byte 0xFF would sign-extend to -1 and be
 * mistaken for EOF. As a consequence, bytes >= 0x80 are returned as
 * positive values, so `CSV_DELIMITER` is expected to be an ASCII
 * character.
 */
static int get_char(CsvContext *csv) {
    int c;

    /* End of input is sticky */
    if(csv->last_char == EOF) {
        return EOF;
    }

    /* A character pushed back by unget_char() takes priority */
    if(csv->last_char) {
        c = csv->last_char;
        csv->last_char = 0;
        return c;
    }

    /* Refill the raw buffer when it has been consumed completely, or when
       the '\0' that terminates the data delivered so far is reached */
    if(csv->in_pos >= CSV_READ_BUFFER_SIZE || csv->raw_buffer[csv->in_pos] == '\0') {
        if(!csv->get_data(csv->raw_buffer, CSV_READ_BUFFER_SIZE, csv->data)) {
            csv->last_char = EOF;
            return EOF;
        }
        csv->in_pos = 0;
    }

    return (unsigned char)csv->raw_buffer[csv->in_pos++];
}
/* Stores `c` in the context's single push-back slot (`last_char`),
   replacing whatever was stored there before. */
static void unget_char(CsvContext *csv, int c) {
    csv->last_char = c;
}
/*
 * Reads one record (row) from the input into `csv`.
 *
 * Returns the number of fields parsed, or 0 when the end of the input
 * has been reached. If an error occurs, `csv->err` is set and the
 * number of fields completed before the error is returned (which may
 * be 0), so callers should also check `csv_get_error()`.
 *
 * The parser is a small state machine; `bump` is the next free position
 * in `csv->buffer`, and `start` marks where the current unquoted field
 * begins so that trailing whitespace can be trimmed.
 */
int csv_read_record(CsvContext *csv) {
    int c = 0;
    size_t start = 0, bump = 0;
    enum parse_state {
        RECORD_START, FIELD_START, FIELD, QUOTE, FIELD_END, RECORD_END
    } state = RECORD_START;

    if(csv->last_char == EOF) return 0;

    csv->nf = 0;
    csv->err = CSV_OK;

    for(;;) {
        switch(state) {
            case RECORD_START:
                /* Peek one character to detect the end of the input */
                c = get_char(csv);
                if(c == EOF)
                    return 0;
                state = FIELD_START;
                unget_char(csv, c);
                break;

            case FIELD_START:
                if(csv->nf == CSV_MAX_FIELDS) {
                    csv->err = CSV_ERR_FIELDS;
                    return csv->nf;
                }
                c = get_char(csv);
#if CSV_TRIM
                /* Skip leading whitespace */
                while(strchr(" \t\v\f", c))
                    c = get_char(csv);
#endif
                csv->fields[csv->nf] = &csv->buffer[bump];
                if(c == '\"')
                    state = QUOTE;
                else {
                    unget_char(csv, c);
                    start = bump;
                    state = FIELD;
                }
                break;

            case FIELD:
                c = get_char(csv);
                if(c == '\r') {
                    /* A CR is only valid as the first half of CRLF */
                    c = get_char(csv);
                    if(c != '\n') {
                        csv->err = CSV_ERR_LINE_END;
                        return csv->nf;
                    }
                }
                if(c == EOF || c == '\n' || c == CSV_DELIMITER) {
#if CSV_TRIM
                    /* Drop trailing whitespace */
                    while(bump > start && strchr(" \t\v\f", csv->buffer[bump-1]))
                        bump--;
#endif
                    state = c == CSV_DELIMITER ? FIELD_END : RECORD_END;
                } else {
                    /* Keep the last byte free for the terminator */
                    if(bump == CSV_BUFFER_SIZE - 1) {
                        csv->err = CSV_ERR_BUFFER;
                        return csv->nf;
                    }
                    csv->buffer[bump++] = c;
                }
                break;

            case QUOTE:
                c = get_char(csv);
                if(c == EOF) {
                    /* Input ended inside a quoted field */
                    csv->err = CSV_ERR_BAD_QUOTE;
                    return csv->nf;
                }
                if(c == '\"') {
                    c = get_char(csv);
                    if(c != '\"') {
                        /* That was the closing quote; only whitespace may
                           precede the delimiter or the end of the record */
#if CSV_TRIM
                        while(strchr(" \t\v\f", c))
                            c = get_char(csv);
#endif
                        /* Accept CRLF after a quoted field, exactly as the
                           FIELD state does for unquoted fields */
                        if(c == '\r') {
                            c = get_char(csv);
                            if(c != '\n') {
                                csv->err = CSV_ERR_LINE_END;
                                return csv->nf;
                            }
                        }
                        if(c == EOF || c == '\n') {
                            state = RECORD_END;
                        } else if(c == CSV_DELIMITER) {
                            state = FIELD_END;
                        } else {
                            csv->err = CSV_ERR_BAD_QUOTE;
                            return csv->nf;
                        }
                        break;
                    }
                    /* Otherwise it was an escaped quote (""): fall out of
                       the `if` and store a single '"' */
                }
                if(bump == CSV_BUFFER_SIZE - 1) {
                    csv->err = CSV_ERR_BUFFER;
                    return csv->nf;
                }
                csv->buffer[bump++] = c;
                break;

            case FIELD_END:
            case RECORD_END:
                if(bump == CSV_BUFFER_SIZE - 1) {
                    csv->err = CSV_ERR_BUFFER;
                    return csv->nf;
                }
                /* Terminate the field and count it */
                csv->buffer[bump++] = '\0';
                csv->nf++;
                if(state == RECORD_END)
                    return csv->nf;
                else
                    state = FIELD_START;
                break;
        }
    }
}
/* Prepares `csv` to pull its input from `fun`, which will be called
   with `data` as its last argument. */
void csv_context_custom(CsvContext *csv, csv_read_data_fun fun, void *data) {
    /* No record parsed yet, no error */
    csv->err = CSV_OK;
    csv->nf = 0;

    /* Nothing pushed back; positioning the read index at the end of the
       raw buffer makes the first get_char() call fetch data */
    csv->last_char = 0;
    csv->in_pos = CSV_READ_BUFFER_SIZE;

    /* Where the bytes come from */
    csv->data = data;
    csv->get_data = fun;
}
static int file_input_get_line(char *str, int num, void *data) { | |
size_t read; | |
FILE *file = CAST(FILE*, data); | |
if(feof(file)) | |
return 0; | |
read = fread(str, 1, num-1, file); | |
str[read] = '\0'; | |
if(!read) | |
return 0; | |
return 1; | |
} | |
/* Initialises `csv` to read its data from the open file `file`. */
void csv_context_file(CsvContext *csv, FILE *file) {
    assert(file != NULL);
    csv_context_custom(csv, file_input_get_line, file);
}
static int file_input_get_line_limit(char *str, int num, void *data) { | |
size_t read; | |
struct csv_read_limit *ll = CAST(struct csv_read_limit *, data); | |
if(!ll->limit) return 0; | |
num--; | |
if(num > ll->limit) | |
num = ll->limit; | |
read = fread(str, 1, num, ll->f); | |
str[read] = '\0'; | |
if(!read) | |
return 0; | |
ll->limit -= strlen(str); | |
return 1; | |
} | |
/* Initialises `csv` to read from `ll->f`, consuming at most `ll->limit`
   bytes. The parser keeps the pointer `ll` and updates `ll->limit`
   while reading. */
void csv_context_file_limit(CsvContext *csv, struct csv_read_limit *ll) {
    assert(ll->f != NULL);
    assert(ll->limit >= 1);
    csv_context_custom(csv, file_input_get_line_limit, ll);
}
/* Returns the number of fields stored in `csv` by the most recent
   csv_read_record() call. */
int csv_count(CsvContext *csv) {
    return csv->nf;
}
/* Returns field `i` of the current record, or an empty string when `i`
   is outside the range [0, csv_count(csv)). */
const char *csv_field(CsvContext *csv, int i) {
    if(i >= 0 && i < csv->nf) {
        return csv->fields[i];
    }
    return "";
}
/* Returns the error code stored in `csv`; CSV_OK means no error. */
enum csv_error_code csv_get_error(CsvContext *csv) {
    return csv->err;
}
# endif /* CSV_IMPLEMENTATION */ | |
#ifdef __cplusplus | |
} /* extern "C" */ | |
#endif | |
#endif /* CSV_STREAM_H */ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment