Created
March 13, 2025 23:18
-
-
Save tbbooher/e51570ed62b5d3960dec6fcc1e9cb960 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import csv | |
from datetime import datetime | |
# Regex for the Nginx "combined" access-log format:
#   $remote_addr - $remote_user [$time_local] "$request" $status
#   $body_bytes_sent "$http_referer" "$http_user_agent"
# optionally followed by one extra quoted field, captured as "upstream"
# (which nginx variable that field holds depends on the server's own
# log_format directive -- not visible from this script).
#
# - remote_user is captured instead of being hard-coded to "-", so requests
#   made with HTTP basic auth are not dropped.
# - The trailing field is optional: either it is present, or the line must
#   end right after the user agent.  Using "|$" (rather than "?") keeps the
#   captures for lines that do carry the extra field exactly as before.
log_pattern = re.compile(
    r'(?P<remote_addr>\S+) - (?P<remote_user>\S+) \[(?P<timestamp>.+?)\] "(?P<request>.+?)" '
    r'(?P<status_code>\d+) (?P<bytes_sent>\d+) "(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
    r'(?: "(?P<upstream>.*?)"|$)'
)
# Function to split request into method, URI, and protocol | |
def parse_request(request):
    """Break an HTTP request line into ``(method, uri, protocol)``.

    The first whitespace-separated token is the method, the last one is the
    protocol, and everything in between is re-joined with single spaces to
    form the URI.

    Returns ``(None, None, None)`` (after printing a diagnostic) when the
    request contains any character outside printable ASCII (code points
    32..126) or has fewer than three whitespace-separated tokens.
    """
    # Anything outside printable ASCII is treated as binary noise
    # (e.g. a TLS handshake sent to a plain-HTTP port).
    has_unprintable = any(ord(char) < 32 or ord(char) >= 127 for char in request)
    if has_unprintable:
        print(f"Non-text or binary request encountered: {request}")
        return None, None, None

    tokens = request.split()
    if len(tokens) >= 3:
        method, *uri_tokens, protocol = tokens
        return method, " ".join(uri_tokens), protocol

    # Too few tokens to identify method, URI and protocol.
    print(f"Unexpected request format: {request}")
    return None, None, None
# Input and output files
log_file = "access.log"
csv_file = "nginx_logs.csv"

# Convert every parsable access-log line into one CSV row.
#
# The log is decoded as UTF-8 with errors='replace' so that stray raw bytes
# cannot abort the whole run with UnicodeDecodeError; any replaced character
# is non-ASCII, so a request containing one is rejected by parse_request().
# The CSV is written as UTF-8 explicitly so the output does not depend on the
# platform's default encoding.
unmatched_lines = 0
with open(log_file, 'r', encoding='utf-8', errors='replace') as f, \
        open(csv_file, 'w', newline='', encoding='utf-8') as csv_out:
    writer = csv.writer(csv_out)
    # Write CSV header
    writer.writerow([
        "remote_addr", "timestamp", "request_method", "request_uri",
        "request_protocol", "status_code", "bytes_sent", "referrer", "user_agent"
    ])
    for line in f:
        match = log_pattern.match(line.strip())
        if not match:
            # Keep a tally instead of dropping the line without a trace.
            unmatched_lines += 1
            continue
        data = match.groupdict()
        method, uri, protocol = parse_request(data['request'])
        # Skip log entries with an unexpected or binary request format
        if method is None:
            continue
        # Convert timestamp to ISO format; if there's an error, skip this line.
        # strptime signals a format mismatch with ValueError, so only that is
        # caught -- unrelated failures are allowed to surface.
        try:
            timestamp = datetime.strptime(data['timestamp'], '%d/%b/%Y:%H:%M:%S %z').isoformat()
        except ValueError as e:
            print(f"Error parsing timestamp {data['timestamp']}: {e}")
            continue
        # A "-" placeholder means "no value"; csv.writer emits None as an
        # empty field.
        writer.writerow([
            data['remote_addr'],
            timestamp,
            method,
            uri,
            protocol,
            data['status_code'],
            data['bytes_sent'],
            data['referrer'] if data['referrer'] != '-' else None,
            data['user_agent'] if data['user_agent'] != '-' else None
        ])

print(f"CSV file '{csv_file}' created successfully.")
if unmatched_lines:
    print(f"Skipped {unmatched_lines} line(s) that did not match the expected log format.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment