Last active
November 30, 2023 13:36
-
-
Save TimKraemer/6847dad78c3f45f06d5f05c83883853e to your computer and use it in GitHub Desktop.
Create an access-log report in CSV format from nginx-proxy log files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import csv | |
import argparse | |
import re | |
from datetime import datetime | |
def extract_data(log_data, domain, poi):
    """Extract the report fields from one decoded JSON log record.

    Args:
        log_data: One decoded JSON record; the access-log line is read from
            its "log" key.
        domain: Optional literal text (regex-escaped) that must appear in the
            line somewhere before the bracketed timestamp. Falsy disables
            the filter.
        poi: Optional prefix the requested resource must start with. Falsy
            disables the filter.

    Returns:
        A ``(timestamp, request, referrer, user_agent)`` tuple of strings,
        with the timestamp reformatted as ``DD.MM.YYYY HH:MM:SS``, or None
        when the record is not a matching GET access-log line.
    """
    # Valid JSON is not necessarily a record with a textual "log" field;
    # treat anything else as "no match" instead of raising KeyError/TypeError
    # and aborting the whole report.
    if not isinstance(log_data, dict):
        return None
    line = log_data.get("log")
    if not isinstance(line, str):
        return None

    # Adjust the regex to better handle the log line format
    pattern_str = r'.*?\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} \+\d{4})\]'  # Match timestamp
    pattern_str += r'\s"GET\s(.*?)\sHTTP/.*?"'  # Capture requested resource
    pattern_str += r'\s\d{3}\s\d+\s"(.*?)"'  # Capture referrer
    pattern_str += r'\s"(.*?)"'  # Capture user-agent

    if domain:
        # The domain is matched literally and must precede the timestamp.
        pattern_str = re.escape(domain) + pattern_str

    matches = re.compile(pattern_str).search(line)
    if not matches:
        return None

    raw_timestamp, request, raw_referrer, user_agent = matches.groups()
    if poi and not request.startswith(poi):
        return None  # Skip if request does not start with POI

    # The UTC offset is parsed but not rendered: the report shows the
    # wall-clock time exactly as it was logged.
    timestamp = datetime.strptime(raw_timestamp, "%d/%b/%Y:%H:%M:%S %z")
    timestamp = timestamp.strftime("%d.%m.%Y %H:%M:%S")
    referrer = '' if raw_referrer == '-' else raw_referrer  # Replace '-' with ''
    return timestamp, request, referrer, user_agent
def log_generator(input_file, domain, poi):
    """Yield the extracted fields for every matching line of a JSON log file.

    Each line of ``input_file`` is decoded as one JSON document and handed to
    ``extract_data`` together with ``domain`` and ``poi``. Lines that are not
    valid JSON produce a warning on stdout and are skipped; records for which
    ``extract_data`` returns None are skipped silently.

    Args:
        input_file: Path of the line-delimited JSON log file to read.
        domain: Forwarded unchanged to ``extract_data``.
        poi: Forwarded unchanged to ``extract_data``.

    Yields:
        Whatever non-None value ``extract_data`` returns for a record.
    """
    # Read as UTF-8 regardless of the machine's locale, and replace
    # undecodable bytes instead of raising: a single stray byte in a request
    # line must not abort the whole report.
    with open(input_file, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            # Keep the try body minimal so only JSON decoding errors are
            # turned into a warning.
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Could not decode line: {line}")
                continue
            data = extract_data(record, domain, poi)
            if data is not None:
                yield data
def main():
    """Parse the command line and write the access-log report as CSV.

    Command-line arguments:
        input_file: Path of the JSON log file to read.
        --out: Path of the CSV file to write (default "output.csv").
        --domain: Optional domain filter passed to ``log_generator``.
        --poi: Optional request-prefix filter passed to ``log_generator``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_file")
    parser.add_argument("--out", default="output.csv")
    parser.add_argument("--domain", default=None)
    parser.add_argument("--poi", help="Point of Interest for filtering requests", default=None)
    args = parser.parse_args()

    # The csv module does its own newline handling and requires newline="";
    # without it, platforms that translate "\n" to "\r\n" emit a blank row
    # after every record. UTF-8 keeps non-ASCII referrers and user agents
    # writable whatever the locale encoding is.
    with open(args.out, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["time", "request", "referrer", "user-agent"])  # Include request
        writer.writerows(log_generator(args.input_file, args.domain, args.poi))


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example usage:
First, find out where your container's log files are:
/var/lib/docker/containers/<container_id>/<container_id>-json.log
You can find the container_id with:
docker ps
Then pass this path as the first parameter:
python3 create-domain-report.py /var/lib/docker/containers/e00fef44b877af46e86dac72a55fa91999f92b48822edf7e241b598080c84f39/e00fef44b877af46e86dac72a55fa91999f92b48822edf7e241b598080c84f39-json.log --out output.csv --domain your-domain.com --poi /your/sub/folder