Skip to content

Instantly share code, notes, and snippets.

@dgulinobw
Last active February 14, 2023 10:15
Show Gist options
  • Save dgulinobw/7a986a350ae7cffeeac7de2658e164b4 to your computer and use it in GitHub Desktop.
Save dgulinobw/7a986a350ae7cffeeac7de2658e164b4 to your computer and use it in GitHub Desktop.
Convert Nginx combined access log to CSV format, splitting the path and parameters into columns, in a memory-efficient manner.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# nginx_accesslog2csv: Convert nginx default, combined access log to CSV format
import os
import csv
import re
import sys
import datetime
import traceback
# Pattern and column names for nginx's default access-log layout.
# Neither is selected as the active format below.
nginx_default_pattern = re.compile(r'(\d+.\d+.\d+.\d+)\s-\s-\s\[(.+)\]\s"(.*)\s\/(.*)\s(......\..)"\s([0-9]*)\s([0-9]*)\s"(.*)"\s"(.*)"\s"(.*)"')
nginx_default_static_headers = ['host', 'time', 'verb', 'url', 'httpver', 'status', 'size', 'referer', 'useragent', 'gzip_ratio']

# "combined+" layout: one header name per capture group of combined_pattern.
combined_pattern = re.compile(r'^(\S+) (\S+) (\S+) \[(.*?)\] "(\S+)(.*)? (.*)?" (\d+) (\d+) "(.*?)" "(.*?)" "(.*?)" ([0-9]*\.[0-9]*) (.)')
combined_static_headers = ['host', 'username', 'identity', 'time', 'verb', 'url', 'httpver', 'status', 'size', 'referer', 'useragent', 'x_host', 'responsetime', 'timeunit']

# Active log format used by convert_line().
pattern = combined_pattern
# Work on a copy: inserting 'timestamp' below must not alter
# combined_static_headers, which has to keep matching the regex groups.
static_headers = list(combined_static_headers)

# Both indexes are taken BEFORE 'timestamp' is inserted, so they address
# positions in the tuple returned by pattern.match(...).groups().
time_field = static_headers.index('time')
url_field = static_headers.index('url')
# Output rows carry a reformatted timestamp just before the raw time column.
static_headers.insert(time_field, 'timestamp')

sys.stderr.write(f"pattern: {pattern}\n")

# Dynamic column names discovered from URLs; split_url() appends to this list.
keys = ["path"]
def split_url(url):
    """Split a request URL into per-column values aligned with ``keys``.

    The part before the first "?" is the path; it is stored whole under
    "path" and segment by segment under "path_1", "path_2", ... (splitting
    on "/"). The part after the first "?" is split on "&" into name/value
    pairs. Every column name not seen before is appended to the module-level
    ``keys`` list, so the set of columns grows as new URLs are processed.

    No URL-decoding is performed. A query parameter whose name equals an
    existing column name (e.g. "path") overwrites that column's value for
    this URL.

    Args:
        url: The request target as captured from the log line.

    Returns:
        A list with one string per entry of ``keys`` (as it stands when the
        function returns); columns absent from this URL are "".
    """
    # partition() splits on the first "?" only, so a literal "?" inside the
    # query string stays part of the query instead of truncating it.
    base_url, _, query = url.partition("?")
    path = base_url.split("&")[0]

    columns = {"path": path}
    for position, segment in enumerate(path.split("/"), start=1):
        pathname = "path_" + str(position)
        if pathname not in keys:
            keys.append(pathname)
        columns[pathname] = segment

    for parm in query.split("&"):
        # Skip empty chunks produced by "?", "&&" or a trailing "&"; they
        # would otherwise register an empty column name.
        if not parm:
            continue
        # partition() keeps "=" characters inside the value and yields an
        # empty value for bare flags such as "?debug" instead of raising.
        name, _, value = parm.partition("=")
        if name not in keys:
            keys.append(name)
        columns[name] = value

    return [columns.get(key, "") for key in keys]
def prepend_line(file_name, line):
    """Insert the given string as a new first line of a file.

    The new content is streamed into a sibling "<file_name>.bak" file and
    that file is then moved over the original, so the original is never
    loaded into memory as a whole.

    Args:
        file_name: Path of the file to modify.
        line: Text of the new first line, without a trailing newline.
    """
    dummy_file = file_name + '.bak'
    with open(file_name, 'r') as read_obj, open(dummy_file, 'w') as write_obj:
        write_obj.write(line + '\n')
        # Copy the original content line by line after the new first line.
        for existing_line in read_obj:
            write_obj.write(existing_line)
    # os.replace swaps the new file into place in one step, so there is no
    # moment at which file_name is missing (unlike remove-then-rename).
    os.replace(dummy_file, file_name)
def convert_line(line, csv_out):
    """Parse one access-log line and write it to ``csv_out`` as a CSV row.

    The row consists of the regex capture groups, with a reformatted
    timestamp ("%m/%d/%y %H:%M:%S") inserted just before the raw time
    value, followed by the URL-derived columns from split_url().

    Args:
        line: One raw log line; surrounding whitespace is stripped.
        csv_out: A csv writer (anything providing ``writerow``).

    Raises:
        AttributeError: If the line does not match ``pattern``.
        ValueError: If the time field does not parse as
            '%d/%b/%Y:%H:%M:%S %z'.
    """
    match = pattern.match(line.strip())
    # A non-matching line leaves match as None; the attribute access below
    # then raises AttributeError, which the callers treat as a parse error.
    groups = match.groups()

    raw_time = groups[time_field]
    parsed_time = datetime.datetime.strptime(raw_time, '%d/%b/%Y:%H:%M:%S %z')

    row = list(groups)
    row.insert(time_field, parsed_time.strftime("%m/%d/%y %H:%M:%S"))

    # url_field indexes the untouched groups tuple, not the shifted row.
    row.extend(split_url(groups[url_field]))
    csv_out.writerow(row)
def convert_file(log_file_name, csv_file_name):
    """Convert the access log at ``log_file_name`` into a CSV file.

    Lines are processed one at a time. Lines that cannot be parsed are
    reported on stderr and skipped.

    NOTE(review): unlike convert_stream(), no header row is written here;
    prepend_line() looks intended for that but is not called — confirm the
    desired behaviour before adding it.

    Args:
        log_file_name: Path of the access log to read.
        csv_file_name: Path of the CSV file to create or overwrite.
    """
    # Both files are managed by the with-statement so the input handle is
    # closed too. newline='' lets the csv writer control line endings itself,
    # as the csv module documentation requires for files passed to a writer.
    with open(log_file_name) as log_in, open(csv_file_name, 'w', newline='') as out:
        csv_out = csv.writer(out, delimiter=',')
        for line in log_in:
            try:
                convert_line(line, csv_out)
            # ValueError covers an unparseable timestamp, so one bad line
            # no longer aborts the whole conversion.
            except (AttributeError, IndexError, ValueError):
                sys.stderr.write("Error parsing line: ")
                sys.stderr.write(line + "\n")
def _write_header(csv_out):
    """Write the column names (static headers, then discovered keys) as a CSV row."""
    # Going through the csv writer quotes names that contain commas or
    # quotes, which a plain ",".join() would not.
    csv_out.writerow(static_headers + keys)


def convert_stream():
    """Convert access-log lines read from stdin into CSV rows on stdout.

    Lines are read and converted one at a time. Lines that cannot be parsed
    are reported on stderr and skipped. Because the set of URL-derived
    columns is only known once all input has been seen, the header row is
    written LAST, after the data rows.

    On Ctrl-C the header for the columns seen so far is written, stdout is
    flushed, and the process exits with status 130.
    """
    csv_out = csv.writer(sys.stdout, delimiter=',')
    try:
        # readline() (rather than iterating sys.stdin) keeps the original
        # line-at-a-time reading; '' signals end of input.
        for line in iter(sys.stdin.readline, ''):
            try:
                convert_line(line, csv_out)
            # ValueError covers an unparseable timestamp.
            except (AttributeError, IndexError, ValueError):
                sys.stderr.write("Error parsing line: ")
                sys.stderr.write(line + "\n")
    except KeyboardInterrupt:
        sys.stderr.write("header:\n")
        _write_header(csv_out)
        # Flush explicitly and exit via SystemExit. The previous bare
        # "except:" swallowed the SystemExit from sys.exit(130) and fell
        # through to os._exit(0), which skips flushing and reports success.
        sys.stdout.flush()
        sys.exit(130)
    _write_header(csv_out)
def usage(progname):
    """Write a two-line usage message for ``progname`` to stderr.

    Args:
        progname: Program name to show in the example command lines.
    """
    # Each line ends with a newline so the shell prompt is not glued to the
    # end of the message.
    sys.stderr.write(
        f"Usage: {progname} <access.log> <accesslog.csv>\n"
        f"Or pipe stdout in: 'cat <file> | {progname}'\n"
    )
def version():
    """Print the tool's version line to stdout."""
    print("version: 1.1")
def main(argv, stdout, environ):
    """Dispatch to file conversion, stream conversion, or the usage text.

    * Two positional arguments: convert the log file into the CSV file.
    * Otherwise, if stdin is not a terminal: convert stdin to stdout.
    * Otherwise: print version and usage, then exit with status 0.

    Args:
        argv: Command-line arguments, with the program name at index 0.
        stdout: Accepted for interface compatibility; not used here.
        environ: Accepted for interface compatibility; not used here.
    """
    progname = argv[0]
    # Explicit file arguments win even when stdin is not a TTY (cron, CI,
    # subprocess); previously they were silently ignored in that case.
    # The paths come from the argv parameter, not from sys.argv.
    if len(argv) == 3:
        log_file_name = argv[1]
        csv_file_name = argv[2]
        convert_file(log_file_name, csv_file_name)
    elif not sys.stdin.isatty():
        convert_stream()
    else:
        version()
        usage(progname)
        sys.exit(0)
# Script entry point: hand the process's argv, stdout and environment to main().
if __name__ == "__main__":
    main(argv=sys.argv, stdout=sys.stdout, environ=os.environ)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment