Last active
February 14, 2023 10:15
-
-
Save dgulinobw/7a986a350ae7cffeeac7de2658e164b4 to your computer and use it in GitHub Desktop.
Convert Nginx combined access log to CSV format, splitting path and parameters into columns, in a memory-efficient manner.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# nginx_accesslog2csv: Convert nginx default, combined access log to CSV format | |
import os | |
import csv | |
import re | |
import sys | |
import datetime | |
import traceback | |
# --- Layout 1: nginx default access log (ten capture groups) ---------------
nginx_default_pattern = re.compile(r'(\d+.\d+.\d+.\d+)\s-\s-\s\[(.+)\]\s"(.*)\s\/(.*)\s(......\..)"\s([0-9]*)\s([0-9]*)\s"(.*)"\s"(.*)"\s"(.*)"')
nginx_default_static_headers = [
    'host', 'time', 'verb', 'url', 'httpver',
    'status', 'size', 'referer', 'useragent', 'gzip_ratio',
]

# --- Layout 2: combined log plus extra trailing fields (fourteen groups) ---
# The URL group follows the verb group with no separator between them, so
# when the verb group consumes the whole first token the captured URL keeps
# the space that precedes it in the log line.
combined_pattern = re.compile(r'^(\S+) (\S+) (\S+) \[(.*?)\] "(\S+)(.*)? (.*)?" (\d+) (\d+) "(.*?)" "(.*?)" "(.*?)" ([0-9]*\.[0-9]*) (.)')
combined_static_headers = [
    'host', 'username', 'identity', 'time', 'verb', 'url', 'httpver',
    'status', 'size', 'referer', 'useragent', 'x_host',
    'responsetime', 'timeunit',
]

# --- Active layout ---------------------------------------------------------
pattern = combined_pattern
# Same list object as combined_static_headers: the insert below changes both.
static_headers = combined_static_headers

# Both indexes are taken BEFORE 'timestamp' is inserted, so they address the
# raw regex groups of a match, not the final header list.
url_field = static_headers.index('url')
time_field = static_headers.index('time')
static_headers.insert(time_field, 'timestamp')

sys.stderr.write(f"pattern: {pattern}\n")

# Names of the URL-derived columns seen so far; grows while lines are parsed.
keys = ["path"]
def split_url(url, key_registry=None):
    """Split a request URL into one CSV cell per known path/parameter column.

    The part before the first "?" (cut at the first "&", if any) is the path.
    It yields a "path" column plus "path_1", "path_2", ... for each piece
    obtained by splitting the path on "/". The part after the first "?" is
    split on "&" into name/value pairs at the first "=" of each pair.

    Every column name not yet present in *key_registry* is appended to it, so
    the registry accumulates the union of all columns seen across calls.

    Args:
        url: Request target as captured from the log line.
        key_registry: Ordered list of column names to fill and extend.
            Defaults to the module-level ``keys`` list.

    Returns:
        A list with one string per entry of the registry, in registry order;
        columns this URL does not provide are empty strings.
    """
    if key_registry is None:
        key_registry = keys

    # partition() keeps any further "?" inside the query instead of dropping it.
    base_url, _, query = url.partition("?")
    path = base_url.split("&")[0]

    columns = {"path": path}
    for position, segment in enumerate(path.split("/"), start=1):
        columns[f"path_{position}"] = segment

    for pair in query.split("&"):
        if not pair:
            # Empty query, trailing "&" or "&&": nothing to record.
            continue
        # Split only at the first "=", so values containing "=" stay intact
        # and a bare flag ("?debug") becomes a column with an empty value
        # rather than raising IndexError.
        name, _, value = pair.partition("=")
        columns[name] = value

    # dicts keep insertion order, so new columns are registered in the order
    # they first appeared in this URL.
    for name in columns:
        if name not in key_registry:
            key_registry.append(name)

    return [columns.get(name, "") for name in key_registry]
def prepend_line(file_name, line):
    """Insert *line* as a new first line of the file *file_name*.

    The new content is built in a sibling file named ``file_name + '.bak'``
    and then moved over the original, so the file is never loaded into
    memory as a whole.

    Args:
        file_name: Path of the text file to modify.
        line: Text of the new first line, without a trailing newline.
    """
    temp_name = file_name + '.bak'
    with open(file_name, 'r') as source, open(temp_name, 'w') as target:
        target.write(line + '\n')
        # Copy the existing content line by line after the new first line.
        target.writelines(source)
    # os.replace overwrites the destination in one step; a separate
    # remove-then-rename would leave a window with no file at file_name.
    os.replace(temp_name, file_name)
def convert_line(line, csv_out):
    """Parse one access-log line and write it to *csv_out* as a single row.

    The row holds the regex capture groups, with a reformatted timestamp
    inserted just before the raw time value, followed by the URL-derived
    columns returned by split_url().

    Args:
        line: One raw log line; surrounding whitespace is stripped.
        csv_out: Object with a ``writerow`` method (e.g. a csv writer).

    Raises:
        AttributeError: if the line does not match ``pattern``.
        ValueError: if the time field does not fit '%d/%b/%Y:%H:%M:%S %z'.
    """
    groups = pattern.match(line.strip()).groups()

    raw_time = groups[time_field]
    parsed_time = datetime.datetime.strptime(raw_time, '%d/%b/%Y:%H:%M:%S %z')
    timestamp = parsed_time.strftime("%m/%d/%y %H:%M:%S")

    row = list(groups)
    row.insert(time_field, timestamp)

    # url_field indexes the raw groups, so read it from the untouched tuple.
    url_columns = split_url(groups[url_field])
    csv_out.writerow(row + url_columns)
def convert_file(log_file_name, csv_file_name):
    """Convert the log file *log_file_name* into the CSV file *csv_file_name*.

    Lines are processed one at a time. A line that cannot be converted is
    reported on stderr and skipped; conversion continues with the next line.

    NOTE(review): no header row is written to the CSV here, unlike
    convert_stream(); prepend_line() looks intended for that - confirm.

    Args:
        log_file_name: Path of the access log to read.
        csv_file_name: Path of the CSV file to create or overwrite.
    """
    # Both files are context-managed so the input handle is closed as well.
    # newline='' is what the csv module expects of files it writes to;
    # without it, platforms that translate "\n" emit "\r\r\n" row endings.
    with open(log_file_name) as log_in, \
            open(csv_file_name, 'w', newline='') as out:
        csv_out = csv.writer(out, delimiter=',')
        for line in log_in:
            try:
                convert_line(line, csv_out)
            except (AttributeError, IndexError, ValueError):
                # AttributeError: line did not match the pattern.
                # ValueError: time field could not be parsed.
                sys.stderr.write("Error parsing line: ")
                sys.stderr.write(line + "\n")
def _header_line():
    """Return the comma-joined header: static columns, then URL columns."""
    return ",".join(static_headers + keys)


def convert_stream():
    """Convert log lines from stdin to CSV rows on stdout.

    Rows are written as lines arrive. The header is written last, because
    the URL-derived columns are only known once all lines have been seen.
    A line that cannot be converted is reported on stderr and skipped.

    On Ctrl-C the header collected so far is written as its own line
    (announced by "header:" on stderr) and the process exits with status 130.
    """
    csv_out = csv.writer(sys.stdout, delimiter=',')
    try:
        for line in sys.stdin:
            try:
                convert_line(line, csv_out)
            except (AttributeError, IndexError, ValueError):
                # AttributeError: line did not match the pattern.
                # ValueError: time field could not be parsed.
                sys.stderr.write("Error parsing line: ")
                sys.stderr.write(line + "\n")
    except KeyboardInterrupt:
        sys.stderr.write("header:\n")
        sys.stdout.write(_header_line() + "\n")
        # Exit through SystemExit so buffered stdout is flushed and the
        # status really is 130. Wrapping this in a bare "except:" would
        # swallow the SystemExit itself.
        sys.exit(130)
    sys.stdout.write(_header_line())
def usage(progname):
    """Write the two-line usage message for *progname* to stderr.

    Args:
        progname: Program name to show in the examples.
    """
    # The message ends with a newline so the shell prompt starts on its own
    # line after the text.
    sys.stderr.write(
        f"Usage: {progname} <access.log> <accesslog.csv>\n"
        f"Or pipe stdout in: 'cat <file> | {progname}'\n"
    )
def version():
    """Write the tool's version line to stdout."""
    banner = "version: 1.1\n"
    sys.stdout.write(banner)
def main(argv, stdout, environ):
    """Command-line entry point.

    When stdin is not a terminal, log lines are read from stdin and CSV is
    written to stdout. Otherwise exactly two arguments are expected (input
    log path, output CSV path); with any other argument count the version
    and usage text are printed and the process exits with status 0.

    NOTE(review): file arguments are ignored whenever stdin is not a
    terminal (e.g. stdin redirected) - confirm that is intended.

    Args:
        argv: Argument vector; argv[0] is the program name.
        stdout: Accepted but not used by this function.
        environ: Accepted but not used by this function.
    """
    if not sys.stdin.isatty():
        convert_stream()
        return

    if len(argv) != 3:
        version()
        usage(argv[0])
        sys.exit(0)

    # Read the file names from the argv parameter that was length-checked
    # above, not from sys.argv, so a caller-supplied argv is honoured.
    convert_file(argv[1], argv[2])


if __name__ == "__main__":
    main(sys.argv, sys.stdout, os.environ)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment