Skip to content

Instantly share code, notes, and snippets.

@TimKraemer
Last active November 30, 2023 13:36
Show Gist options
  • Save TimKraemer/6847dad78c3f45f06d5f05c83883853e to your computer and use it in GitHub Desktop.
Save TimKraemer/6847dad78c3f45f06d5f05c83883853e to your computer and use it in GitHub Desktop.
Create an access-log report in CSV format from nginx-proxy log files
import json
import csv
import argparse
import re
from datetime import datetime
def extract_data(log_data, domain, poi):
    """Parse one decoded JSON log record into a CSV row for a GET request.

    Args:
        log_data: Decoded JSON object for one log line; the raw access-log
            text is read from its "log" key.
        domain: If truthy, this literal text must appear in the line before
            the bracketed timestamp; otherwise no domain filtering is done.
        poi: If truthy, only requests whose path starts with this prefix
            are kept.

    Returns:
        A ``(timestamp, request, referrer, user_agent)`` tuple with the
        timestamp reformatted as ``DD.MM.YYYY HH:MM:SS``, or ``None`` when
        the record has no usable "log" text, does not match the expected
        line format, or is filtered out by ``poi``.
    """
    # A record without a textual "log" field cannot be an access-log line;
    # skip it instead of raising KeyError/TypeError and aborting the run.
    if not isinstance(log_data, dict):
        return None
    line = log_data.get("log")
    if not isinstance(line, str):
        return None

    pattern_str = (
        # Timestamp; the UTC offset may be negative (e.g. -0500), so accept
        # both signs rather than only "+".
        r'.*?\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})\]'
        r'\s"GET\s(.*?)\sHTTP/.*?"'  # Capture requested resource
        r'\s\d{3}\s\d+\s"(.*?)"'  # Status, size, then capture referrer
        r'\s"(.*?)"'  # Capture user-agent
    )
    if domain:
        # Escape the domain so its dots match literally.
        pattern_str = re.escape(domain) + pattern_str

    matches = re.search(pattern_str, line)
    if matches is None:
        return None

    raw_timestamp, request, referrer, user_agent = matches.groups()
    if poi and not request.startswith(poi):
        return None  # Skip if request does not start with POI

    parsed = datetime.strptime(raw_timestamp, "%d/%b/%Y:%H:%M:%S %z")
    timestamp = parsed.strftime("%d.%m.%Y %H:%M:%S")
    if referrer == '-':
        referrer = ''  # A lone '-' in the referrer field means "no referrer"
    return timestamp, request, referrer, user_agent
def log_generator(input_file, domain, poi):
    """Yield parsed rows from a file holding one JSON log record per line.

    Args:
        input_file: Path of the log file to read.
        domain: Passed through to ``extract_data`` as its domain filter.
        poi: Passed through to ``extract_data`` as its request-prefix filter.

    Yields:
        Each non-``None`` result of ``extract_data`` for a line that decodes
        as JSON. Lines that fail to decode are reported with a printed
        warning and skipped.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Keep the try body limited to the only call that can raise
            # JSONDecodeError.
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Could not decode line: {line}")
                continue
            data = extract_data(record, domain, poi)
            if data is not None:
                yield data
def main():
    """Parse command-line arguments and write the filtered log rows as CSV.

    Reads the positional ``input_file`` plus the optional ``--out``,
    ``--domain`` and ``--poi`` arguments, writes a header row to the output
    file, then one row per entry produced by ``log_generator``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_file")
    parser.add_argument("--out", default="output.csv")
    parser.add_argument("--domain", default=None)
    parser.add_argument("--poi", help="Point of Interest for filtering requests", default=None)
    args = parser.parse_args()

    # newline="" lets the csv module write its own line terminators; without
    # it, platforms that translate "\n" end up with blank rows between records.
    # UTF-8 keeps non-ASCII referrers and user-agents writable on any platform.
    with open(args.out, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["time", "request", "referrer", "user-agent"])  # Header row
        writer.writerows(log_generator(args.input_file, args.domain, args.poi))
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
@TimKraemer
Copy link
Author

TimKraemer commented Aug 16, 2023

example usage:

Find out where your container's log files are:
/var/lib/docker/containers/<container_id>/<container_id>-json.log

You can find the container_id with docker ps.

Then pass this path as the first argument:

python3 create-domain-report.py /var/lib/docker/containers/e00fef44b877af46e86dac72a55fa91999f92b48822edf7e241b598080c84f39/e00fef44b877af46e86dac72a55fa91999f92b48822edf7e241b598080c84f39-json.log --out output.csv --domain your-domain.com --poi /your/sub/folder

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment