Incremental backup script for NGINX log files. Stores log entries in an SQLite3 database. Ignores existing entries using SHA-256.
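For example, to back up the current and rotated access logs into a database (the script filename here is illustrative):

    python3 nginx_logs_backup.py access.db /var/log/nginx/access.log /var/log/nginx/access.log.1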
""" | |
Incremential backup script for NGINX logs files. | |
""" | |
import re | |
import sqlite3 | |
import argparse | |
from hashlib import sha256 | |
import pdb | |
import sys | |
import traceback | |
# Nginx access log format regex | |
nginx_access_re = re.compile(r'(?P<remote_addr>[^\s]+)\s-\s(?P<remote_user>[^\s]+)\s\[(?P<time_local>[^\]]+)\]\s"(?P<request>[^\"]*)"\s(?P<status>[^\s]+)\s(?P<body_bytes_sent>[^\s]+)\s"(?P<http_referer>[^\"]*)"\s"(?P<http_user_agent>[^\"]*)"') | |
datetime_re = re.compile(r'(?P<day>[0-9]{2})\/(?P<month>[A-Za-z]{3})\/(?P<year>[0-9]{4}):(?P<hours>[0-9]{2}):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2})\s(?P<timezone>[\+\-0-9]{5})') | |
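# Example of a matching line (nginx "combined" log format; values illustrative):
# 203.0.113.7 - - [25/Sep/2017:19:45:12 +0200] "GET /index.html HTTP/1.1" 200 612 "-" "Mozilla/5.0"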
# Nginx month labels
month_labels = {
    'Jan': '01',
    'Feb': '02',
    'Mar': '03',
    'Apr': '04',
    'May': '05',
    'Jun': '06',
    'Jul': '07',
    'Aug': '08',
    'Sep': '09',
    'Oct': '10',
    'Nov': '11',
    'Dec': '12',
}
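# (Hand-written mapping: month parsing stays independent of the process locale.)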
def process_log(cursor, log_file, check_collision=False):
    """ Read log file line by line and store content in database. """
    # Open the log file
    print('Opening "{}"...'.format(log_file))
    with open(log_file) as fi:
        counter = 0
        ignored = 0
        inserted = 0
        # Process each line
        print('Processing log entries (may take some time)...')
        for line in fi:
            # Get line footprint
            m = sha256()
            m.update(line.encode())
            footprint = m.digest()
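            # Note: the footprint is the raw 32-byte SHA-256 digest of the whole
            # line; it is stored as a BLOB and serves as the deduplication key
            # (see the UNIQUE constraint in the table schema below).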
            # Parse log line
            m = nginx_access_re.match(line)
            if m is None:
                print('Error: Cannot parse line:')
                print(line)
                input('>>> Press enter to continue')
                continue
            # Unpack values
            (
                remote_addr,
                remote_user,
                time_local,
                request,
                status,
                body_bytes_sent,
                http_referer,
                http_user_agent
            ) = m.groups()
            # Fix missing values
            if remote_user == '-':
                remote_user = ''
            if http_referer == '-':
                http_referer = ''
            if http_user_agent == '-':
                http_user_agent = ''
            # Unpack request
            parts = request.split()
            if len(parts) >= 2:
                request_cmd, request_path, *_ = parts
            else:
                request_cmd = ''
                request_path = request
            # Parse datetime
            day, month, year, hours, minutes, seconds, timezone = datetime_re.match(time_local).groups()
            timezone = timezone[:3] + ':' + timezone[-2:]
            time_local = '%s-%s-%s %s:%s:%s%s' % (year, month_labels[month], day, hours, minutes, seconds, timezone)
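            # e.g. "25/Sep/2017:19:45:12 +0200" becomes "2017-09-25 19:45:12+02:00",
            # which sorts chronologically as plain text (for a fixed timezone).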
            # Store everything
            cursor.execute("INSERT OR IGNORE INTO logs VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                           (footprint, remote_addr, remote_user, time_local, request_cmd, request_path,
                            status, body_bytes_sent, http_referer, http_user_agent))
            if cursor.rowcount:
                inserted += 1
            else:
                # Detect SHA collision
                if check_collision:
                    cursor.execute("SELECT * FROM logs WHERE footprint=?", (footprint, ))
                    row = cursor.fetchone()
                    if row['remote_addr'] != remote_addr \
                            or row['remote_user'] != remote_user \
                            or row['time_local'] != time_local \
                            or row['request_cmd'] != request_cmd \
                            or row['request_path'] != request_path \
                            or str(row['status']) != status \
                            or str(row['body_bytes_sent']) != body_bytes_sent \
                            or row['http_referer'] != http_referer \
                            or row['http_user_agent'] != http_user_agent:
                        print('Warning: collision found!')
                        print('Expected:\n',
                              footprint, remote_addr, remote_user, time_local,
                              request_cmd, request_path, status, body_bytes_sent,
                              http_referer, http_user_agent)
                        print('Got:\n',
                              row['footprint'], row['remote_addr'], row['remote_user'], row['time_local'],
                              row['request_cmd'], row['request_path'], row['status'], row['body_bytes_sent'],
                              row['http_referer'], row['http_user_agent'])
                        input('>>> Press enter to continue')
                ignored += 1
            # Update counter
            counter += 1
            if counter % 1024 == 0:
                print('.', end='', flush=True)
        # End stats
        print()
        print('Total: {} entries processed, {} inserted, {} ignored.'.format(counter, inserted, ignored))
        return counter, inserted, ignored
def process_logs(database_file, *log_files, check_collision=False):
    """ Process each log file and store its entries in the database. """
    # Open the database
    print('Opening database file...')
    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row
    c = conn.cursor()
    # Create the table (the UNIQUE footprint column is what lets
    # INSERT OR IGNORE silently skip entries already in the database)
    print('Creating database schema...')
    c.execute(
        '''CREATE TABLE IF NOT EXISTS logs (
            footprint BLOB UNIQUE,
            remote_addr TEXT,
            remote_user TEXT,
            time_local TEXT,
            request_cmd TEXT,
            request_path TEXT,
            status INTEGER,
            body_bytes_sent INTEGER,
            http_referer TEXT,
            http_user_agent TEXT)'''
    )
    # Process each file
    counter = inserted = ignored = 0
    for log_file in log_files:
        _counter, _inserted, _ignored = process_log(c, log_file, check_collision)
        counter += _counter
        inserted += _inserted
        ignored += _ignored
    print('Processing done.')
    print('Grand total: {} entries processed, {} inserted, {} ignored.'.format(counter, inserted, ignored))
    # Save the changes
    print('Saving the database...')
    conn.commit()
    # Close the connection
    conn.close()
# Main entry point
if __name__ == '__main__':
    # CLI interface
    parser = argparse.ArgumentParser(description='Update web logs database.',
                                     epilog='N.B. Duplicate log entries are ignored.')
    parser.add_argument('database', metavar='DATABASE',
                        help='The SQLite database file path to work with.')
    parser.add_argument('log_files', metavar='FILE', nargs='+',
                        help='Paths of all log files to be processed.')
    parser.add_argument('-c', '--check-collision', action='store_true',
                        help='Enable full collision checking for existing entries in the database.')
    args = parser.parse_args()
    # Run the program
    try:
        process_logs(args.database, *args.log_files,
                     check_collision=args.check_collision)
    except Exception:
        # Drop into the post-mortem debugger on any unexpected error
        _, _, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)
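Once populated, the database can be queried directly with Python's standard sqlite3 module. A minimal sketch, assuming the database file is named access.db:

    import sqlite3

    # Count stored log entries per HTTP status code
    conn = sqlite3.connect('access.db')
    for status, count in conn.execute(
            'SELECT status, COUNT(*) FROM logs GROUP BY status ORDER BY status'):
        print(status, count)
    conn.close()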