Skip to content

Instantly share code, notes, and snippets.

@Wollw
Created March 19, 2012 04:11
Show Gist options
  • Save Wollw/2094483 to your computer and use it in GitHub Desktop.
Save Wollw/2094483 to your computer and use it in GitHub Desktop.
/*
* This is a parser intended for Common Log Format access logs.
* It splits each line on spaces and tabs, except where that whitespace
* is part of a field delimited by square brackets or double quotes.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "parser.h"
#define BUFFER_SIZE 5000
/*
 * Reads the file named by argv[1] line by line, tokenizes each line with
 * parse_line() and prints every token on its own line.
 *
 * Returns 1 when no file name is given, EXIT_FAILURE when the file cannot
 * be opened or the line buffer cannot be allocated, EXIT_SUCCESS otherwise.
 *
 * Lines longer than BUFFER_SIZE - 1 characters are read (and tokenized) in
 * several chunks, because fgets() stops when the buffer is full.
 */
int main(int argc, char *argv[]) {
    if (argc < 2) {
        printf("Please provide a file name.\n");
        return 1;
    }

    FILE *fp = fopen(argv[1], "r");
    if (fp == NULL) {
        perror("Failed to open file.");
        return EXIT_FAILURE;
    }

    char *line = malloc(BUFFER_SIZE * sizeof(char));
    if (line == NULL) {
        /* Explicit check: must not depend on assert(), which NDEBUG removes. */
        perror("Failed to allocate line buffer.");
        fclose(fp);
        return EXIT_FAILURE;
    }

    while (fgets(line, BUFFER_SIZE, fp) != NULL) {
        /*
         * Cut the line at its newline, if it has one. strcspn() returns the
         * string length when no newline is present, so this never writes
         * before the buffer (empty string) and never drops a real character
         * (last line without a trailing newline, or an over-long line).
         */
        line[strcspn(line, "\n")] = '\0';

        char **toks = parse_line(line);
        for (int i = 0; toks[i] != NULL; i++) {
            printf("%s\n", toks[i]);
        }
        parse_free(toks);
    }

    free(line);
    fclose(fp);
    return EXIT_SUCCESS;
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "parser.h"
/*
 * Appends the character c to the heap-allocated, NUL-terminated string s.
 *
 * s must have been obtained from malloc/calloc/realloc; it is resized and
 * the (possibly moved) string is returned, so the caller must replace its
 * own pointer with the return value.
 *
 * Running out of memory terminates the program: through the assert
 * diagnostic in debug builds, and through abort() when NDEBUG is defined.
 */
char * _parse_append_char(char *s, char c) {
    size_t len = strlen(s);

    /* Room for the existing characters, the new one and the terminator. */
    char *grown = realloc(s, len + 2);
    if (grown == NULL) {
        assert("NULL Pointer." && grown);
        abort();
    }

    grown[len] = c;
    grown[len + 1] = '\0';
    return grown;
}
/*
 * Releases a NULL-terminated array of heap-allocated strings: every string
 * up to the terminating NULL entry is freed, then the array itself.
 *
 * Passing NULL is allowed and does nothing, mirroring free(NULL).
 */
void parse_free(char **p) {
    if (p == NULL) {
        return;
    }

    for (char **tok = p; *tok != NULL; tok++) {
        free(*tok);
    }
    free(p);
}
/*
 * Splits line into tokens and returns them as a NULL-terminated array of
 * newly allocated strings (release it with parse_free()).
 *
 * - Outside a token, spaces and tabs are skipped.
 * - A token that opens with '"' runs to the next unescaped '"'; one that
 *   opens with '[' runs to the next unescaped ']'. The delimiters are not
 *   stored. Inside such a token a backslash makes the following character
 *   literal and is itself dropped.
 * - Any other token runs up to the next space or tab.
 *
 * The input string is only read. Allocation failures are caught by assert.
 */
char ** parse_line(char line[]) {
    char **tokens = NULL;
    int tok_n = -1;              /* index of the token being built */
    unsigned int state = 0x00;   /* combination of the _f_* flags */
    char closer = '\0';          /* character that ends a delimited token */

    for (const char *cursor = line; *cursor != '\0'; ++cursor) {
        const char ch = *cursor;
        const int is_blank = (ch == ' ' || ch == '\t');

        /* Not inside any token and looking at a non-blank: open a token. */
        if ((state & (_f_delimited_tok | _f_bare_tok)) == 0 && !is_blank) {
            tok_n += 1;
            tokens = realloc(tokens, (tok_n + 1) * sizeof(char *));
            assert("NULL Pointer." && tokens);
            tokens[tok_n] = calloc(1, sizeof(char));
            assert("NULL Pointer." && tokens[tok_n]);

            if (ch == '"' || ch == '[') {
                /* The opening delimiter is consumed, not stored. */
                state = _f_delimited_tok;
                closer = (ch == '"') ? '"' : ']';
                continue;
            }
            /* The first character of a bare token is appended below. */
            state = _f_bare_tok;
        }

        if (state & _f_delimited_tok) {
            if (state & _f_escaped) {
                /* Character after a backslash is always taken literally. */
                tokens[tok_n] = _parse_append_char(tokens[tok_n], ch);
                state &= ~_f_escaped;
            } else if (ch == '\\') {
                state |= _f_escaped;
            } else if (ch == closer) {
                state &= ~_f_delimited_tok;
            } else {
                tokens[tok_n] = _parse_append_char(tokens[tok_n], ch);
            }
        } else if (state & _f_bare_tok) {
            if (is_blank) {
                state &= ~_f_bare_tok;
            } else {
                tokens[tok_n] = _parse_append_char(tokens[tok_n], ch);
            }
        }
    }

    /* One extra slot for the NULL that terminates the array. */
    tok_n += 1;
    tokens = realloc(tokens, (tok_n + 1) * sizeof(char *));
    assert("NULL Pointer." && tokens);
    tokens[tok_n] = NULL;
    return tokens;
}
#ifndef PARSER_H
#define PARSER_H
/*
 * Flags for keeping track of the parser's state.
 * Each value is a distinct bit; parse_line() sets, tests and clears them
 * in a single unsigned int with |, & and ~.
 */
enum {
/* Inside a token that was opened by '"' or '['. */
_f_delimited_tok = 1,
/* Inside a token that is ended by a space or a tab. */
_f_bare_tok = 2,
/* Inside a delimited token, right after a backslash: the next character
 * is appended as-is. */
_f_escaped = 4
};
/*
 * Parses a line into tokens. It allocates its own memory.
 *
 * Tokens are separated by spaces and tabs; a token opened by '"' or '['
 * extends to the matching '"' or ']' and may contain whitespace, with a
 * backslash escaping the character that follows it.
 *
 * Returns a NULL-terminated array of strings; pass it to parse_free()
 * once it is no longer needed.
 */
char ** parse_line(char l[]);
/*
 * Frees the parsed tokens: every string in the NULL-terminated array,
 * followed by the array itself.
 */
void parse_free(char **data);
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment