Skip to content

Instantly share code, notes, and snippets.

@tbbooher
Created March 13, 2025 23:18
Show Gist options
  • Save tbbooher/e51570ed62b5d3960dec6fcc1e9cb960 to your computer and use it in GitHub Desktop.
Save tbbooher/e51570ed62b5d3960dec6fcc1e9cb960 to your computer and use it in GitHub Desktop.
import re
import csv
from datetime import datetime
# Regex for Nginx access-log lines.
#
# Matches the standard "combined" layout:
#   remote_addr - remote_user [timestamp] "request" status bytes "referrer" "user_agent"
# plus an optional trailing quoted field (captured as `upstream`) that some
# log formats append after the user agent. When that field is absent the
# `upstream` group is None.
#
# `remote_user` is captured instead of being hard-coded to "-" so entries
# logged for authenticated users are not rejected.
log_pattern = re.compile(
    r'(?P<remote_addr>\S+) - (?P<remote_user>\S+) \[(?P<timestamp>.+?)\] "(?P<request>.+?)" '
    r'(?P<status_code>\d+) (?P<bytes_sent>\d+) "(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
    r'(?: "(?P<upstream>.*?)")?'
)
# Function to split request into method, URI, and protocol
def parse_request(request):
    """Split an HTTP request line into its method, URI, and protocol.

    Args:
        request: The raw request string taken from a log entry,
            e.g. ``"GET /index.html HTTP/1.1"``.

    Returns:
        A ``(method, uri, protocol)`` tuple of strings. When the request
        contains any character outside printable ASCII (code points
        32-126), or has fewer than three whitespace-separated parts,
        a diagnostic is printed and ``(None, None, None)`` is returned
        so the caller can skip the entry.
    """
    # Skip non-printable requests (likely binary or TLS handshake data)
    if not all(32 <= ord(ch) < 127 for ch in request):
        print(f"Non-text or binary request encountered: {request}")
        return None, None, None

    parts = request.split()
    if len(parts) < 3:
        # Report the problem and return None values for error handling
        print(f"Unexpected request format: {request}")
        return None, None, None

    # First token is the method and the last is the protocol; everything
    # in between belongs to the URI and is re-joined with single spaces.
    method, *uri_parts, protocol = parts
    return method, " ".join(uri_parts), protocol
# Input and output files
log_file = "access.log"
csv_file = "nginx_logs.csv"

# Both files use an explicit UTF-8 encoding so the result does not depend on
# the platform's default locale. Undecodable bytes in the log are replaced
# rather than raising, so a single corrupt line cannot abort the conversion.
with open(log_file, 'r', encoding='utf-8', errors='replace') as f, \
        open(csv_file, 'w', newline='', encoding='utf-8') as csv_out:
    writer = csv.writer(csv_out)

    # Write CSV header
    writer.writerow([
        "remote_addr", "timestamp", "request_method", "request_uri",
        "request_protocol", "status_code", "bytes_sent", "referrer", "user_agent"
    ])

    for line in f:
        match = log_pattern.match(line.strip())
        if not match:
            # Lines that do not fit the expected log layout are ignored
            continue

        data = match.groupdict()
        method, uri, protocol = parse_request(data['request'])

        # Skip log entries with an unexpected or binary request format
        if method is None:
            continue

        # Convert timestamp to ISO format; if there's an error, skip this line
        try:
            timestamp = datetime.strptime(data['timestamp'], '%d/%b/%Y:%H:%M:%S %z').isoformat()
        except ValueError as e:
            print(f"Error parsing timestamp {data['timestamp']}: {e}")
            continue

        # A "-" placeholder for referrer / user agent is written as an empty
        # CSV cell (csv.writer renders None as an empty string).
        writer.writerow([
            data['remote_addr'],
            timestamp,
            method,
            uri,
            protocol,
            data['status_code'],
            data['bytes_sent'],
            data['referrer'] if data['referrer'] != '-' else None,
            data['user_agent'] if data['user_agent'] != '-' else None
        ])

print(f"CSV file '{csv_file}' created successfully.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment