Skip to content

Instantly share code, notes, and snippets.

@UnkindPartition
Created March 19, 2016 13:48
Show Gist options
  • Save UnkindPartition/4d2d7effb72b5077e2e5 to your computer and use it in GitHub Desktop.
Save UnkindPartition/4d2d7effb72b5077e2e5 to your computer and use it in GitHub Desktop.
Transform combined http log into an R-compatible CSV file
#!/usr/bin/perl
# (c) Roman Cheplyaka, 2016
# License: MIT
# Usage:
#
# combined_to_csv FILE1 FILE2 ... > FILE.csv
#
# The resulting CSV file can be read in R with:
#
# read.csv('FILE.csv',colClasses=c('factor','factor','POSIXct','factor','factor','factor','integer','integer','factor','factor','factor'))
use warnings;
use strict;
use Time::Piece;
use URI::URL;
# Output conventions for every print below: records end with a newline and
# list elements are joined with commas, so `print @list` emits one CSV row.
$\ = "\n";
$, = ",";

# Header row. The first ten names correspond to the ten regex captures; the
# eleventh, ref_host, is derived from the referrer in the main loop.
my @columns = qw(
    address user time method request proto
    status bytes referrer agent ref_host
);
print @columns;
# Pattern for one line of a "combined" format access log.
# see http://nginx.org/en/docs/http/ngx_http_log_module.html
#
# On success it yields ten captures, in this order:
#   address, user, time, method, request path, protocol,
#   status, bytes, referrer, user agent
# The method and protocol groups are optional and come back undef when the
# request line does not contain them. The query string is matched but not
# captured. The trailing anchor tolerates the newline left on unchomped input.
#
# The bytes field accepts either digits or a single dash: nginx always logs a
# number, but Apache writes a dash when the response had no body, and such
# lines would otherwise be rejected as unparseable.
my $regex = qr@^
(\S+) # remote_addr
\s \S+ # -
\s (\S+) # remote_user
\s \[([^\]]+)\] # time_local
\s "([A-Z]+)? # method
\s* ([^"?]*?) (?: \? \S*)? # url path, without ?foo=bar query params
\s* (HTTP/[\d.]+)?" # protocol
\s (\d+) # status
\s (\d+|-) # body_bytes_sent, or a dash when nothing was sent
\s "([^"]*)" # http_referer
\s "([^"]*)" # http_user_agent
$@x;
# Main loop: turn every log line from the files named on the command line (or
# from STDIN when none are given) into one CSV record on STDOUT. A line that
# cannot be converted is reported on STDERR and skipped; it never aborts the
# run.
LINE:
while (my $line = <>) {
    # Ten captures: address, user, time, method, request, proto, status,
    # bytes, referrer, agent. Optional groups that did not match are undef.
    my @fields = $line =~ /$regex/;
    unless (@fields) {
        warn "Could not parse: $line";
        next LINE;
    }

    # Reformat the timestamp into the 'YYYY-MM-DD HH:MM:SS' form R can read as
    # POSIXct. strptime dies on input it cannot parse, so trap that and skip
    # the line like any other unparseable record instead of killing the run.
    # NOTE(review): Time::Piece appears to normalise the value to UTC when %z
    # is parsed, and the zone is dropped from the output -- confirm this is
    # what the R side expects.
    my $timestamp = eval {
        Time::Piece->strptime($fields[2], '%d/%b/%Y:%H:%M:%S %z')
            ->strftime('%Y-%m-%d %H:%M:%S');
    };
    unless (defined $timestamp) {
        warn "Could not parse time: $line";
        next LINE;
    }
    $fields[2] = $timestamp;

    # Append the referrer's host as an eleventh column (ref_host). URL objects
    # without a host method yield '-', which becomes NA below; so does an
    # undefined host.
    my $ref_url = url($fields[8]);
    push @fields, ($ref_url->can('host') ? $ref_url->host : '-');

    # Encode each field for CSV. The loop variable aliases the array element,
    # so assignments modify @fields in place.
    for my $field (@fields) {
        if (!defined $field || $field eq '-') {
            # Missing values and the log format's '-' placeholder become R's NA.
            $field = 'NA';
        }
        elsif ($field !~ /^\d+$/) {
            # https://tools.ietf.org/html/rfc4180 permits quoting every
            # field, but R refuses quoted values in integer columns, see
            # https://stat.ethz.ch/pipermail/r-devel/2013-September/067603.html
            # Hence only non-numeric fields are quoted, with embedded quotes
            # doubled.
            $field =~ s/"/""/g;
            $field = qq{"$field"};
        }
    }

    # Relies on the global output field and record separators set earlier in
    # the script to join the fields with commas and end the row.
    print @fields;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment