Created
March 11, 2025 08:35
-
-
Save wolph/ae9176a9fa6a5e01ed25f550ba53bf3c to your computer and use it in GitHub Desktop.
Script to split (large) existing nginx logs per year/month, with colourful verbose output and progress bars.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Script to split existing nginx logs per year/month with verbose output. | |
This script accepts a configurable directory (defaulting to the current working | |
directory) of nginx logs. It looks for files named <domain>.access_log and | |
<domain>.error_log. Each log line is expected to contain a timestamp inside | |
square brackets, e.g., [10/Oct/2000:13:55:36 -0700]. The script extracts the | |
date and writes each line into a separate file under a subdirectory formatted | |
as YYYY-MM. | |
Verbose output is provided via rich progress bars and spinners. | |
Usage: | |
python split_nginx_logs.py --dir /path/to/logs | |
""" | |
import argparse | |
import re | |
from datetime import datetime | |
from pathlib import Path | |
from typing import TextIO, Dict, Optional | |
from rich.console import Console | |
from rich.progress import ( | |
Progress, | |
SpinnerColumn, | |
BarColumn, | |
TextColumn, | |
TimeRemainingColumn, | |
TaskID, | |
) | |
# Regular expression to extract the date part from a log entry. It captures a
# 'DD/Mon/YYYY' token that directly follows '[' and is directly followed by
# ':' (e.g. '10/Oct/2000' in '[10/Oct/2000:13:55:36 -0700]').
DATE_REGEX = re.compile(r'\[([\d]{2}/[A-Za-z]{3}/[\d]{4}):')
# Module-level rich console; the functions below print their status messages
# through it.
console = Console()
# nginx error logs start every line with 'YYYY/MM/DD HH:MM:SS [level] ...'
# instead of the bracketed access-log timestamp matched by DATE_REGEX.
_ERROR_LOG_DATE_REGEX = re.compile(r'(\d{4}/\d{2}/\d{2}) ')


def _month_key(line: str) -> str:
    """Return the 'YYYY-MM' bucket for a log line, or 'unknown'.

    The bracketed access-log timestamp is tried first; if the line has none,
    the leading 'YYYY/MM/DD ' timestamp used by nginx error logs is tried.
    A timestamp that matches the pattern but is not a real date (for example
    day 99) yields 'unknown'.
    """
    match = DATE_REGEX.search(line)
    if match:
        date_format = '%d/%b/%Y'
    else:
        match = _ERROR_LOG_DATE_REGEX.match(line)
        if match is None:
            return 'unknown'
        date_format = '%Y/%m/%d'
    try:
        date_obj = datetime.strptime(match.group(1), date_format)
    except ValueError:
        return 'unknown'
    return f'{date_obj.year}-{date_obj.month:02d}'


def process_log_file(
    file_path: Path,
    base_dir: Path,
    progress: Optional[Progress] = None,
    task_id: Optional[TaskID] = None,
) -> None:
    """Split a given nginx log file per year/month.

    Reads the log file line by line, determines the month of each line from
    its timestamp, and writes the line to ``base_dir/<YYYY-MM>/<file name>``.
    Lines without a recognisable timestamp go to ``base_dir/unknown/``.
    Existing output files with the same name are overwritten.

    Lines are copied byte-for-byte: bytes that are not valid UTF-8 and
    carriage returns inside a line are preserved rather than raising or being
    rewritten.

    Args:
        file_path: Path to the original log file.
        base_dir: Base directory for output files.
        progress: Optional rich Progress instance for updating progress.
        task_id: Optional rich TaskID for progress update.

    Raises:
        OSError: If the input cannot be read or an output file or directory
            cannot be created or written.
    """
    # One output handle per month bucket; all are kept open until the input
    # has been fully consumed.
    out_files: Dict[str, TextIO] = {}
    try:
        # surrogateescape lets undecodable bytes round-trip unchanged, and
        # newline='\n' stops a stray '\r' from splitting or rewriting a line.
        with file_path.open(
            'r', encoding='utf-8', errors='surrogateescape', newline='\n'
        ) as fin:
            for line in fin:
                key = _month_key(line)
                fout = out_files.get(key)
                if fout is None:
                    out_path = base_dir / key / file_path.name
                    out_path.parent.mkdir(parents=True, exist_ok=True)
                    fout = out_path.open(
                        'w',
                        encoding='utf-8',
                        errors='surrogateescape',
                        newline='',
                    )
                    out_files[key] = fout
                fout.write(line)
                if progress is not None and task_id is not None:
                    # Advance by the number of bytes this line occupied in
                    # the input so the total matches the file size.
                    progress.advance(
                        task_id,
                        len(line.encode('utf-8', 'surrogateescape')),
                    )
    finally:
        # Ensure all open file handles are closed, even on error.
        for f in out_files.values():
            f.close()
def split_nginx_logs(directory: Path) -> None:
    """Split all nginx log files in the given directory per year/month.

    Looks for regular files named ``<domain>.access_log`` and
    ``<domain>.error_log`` directly inside ``directory`` (not in
    subdirectories) and passes each one to ``process_log_file`` with
    ``directory`` as the output base, showing a progress bar per file.

    Args:
        directory: The directory where the log files are located.
    """
    # Imported here so this function does not rely on changes to the
    # module-level import block.
    from rich.markup import escape

    # Sorted for a deterministic processing order; directories that happen to
    # match the patterns are skipped because they cannot be opened as logs.
    log_files = sorted(
        path
        for pattern in ('*.access_log', '*.error_log')
        for path in directory.glob(pattern)
        if path.is_file()
    )
    if not log_files:
        console.print('[bold red]No log files found.[/bold red]')
        return
    console.print(f'[bold green]Found {len(log_files)} log files.[/bold green]')
    # Create a progress bar for file processing.
    with Progress(
        SpinnerColumn(),
        '[progress.description]{task.description}',
        BarColumn(),
        '[progress.percentage]{task.percentage:>3.0f}%',
        TimeRemainingColumn(),
        console=console,
        transient=True,
    ) as progress:
        for log_file in log_files:
            try:
                file_size = log_file.stat().st_size
            except OSError:
                # Size is only used for the progress total; fall back to 0.
                file_size = 0
            # Escape the name so a '[' in it is not parsed as rich markup.
            task_description = (
                f'Processing [cyan]{escape(log_file.name)}[/cyan]'
            )
            task_id = progress.add_task(task_description, total=file_size)
            try:
                process_log_file(log_file, directory, progress, task_id)
            finally:
                # Drop the task even if processing raised.
                progress.remove_task(task_id)
            console.print(
                f'[green]Finished processing:[/green] {escape(str(log_file))}'
            )
    console.print('[bold blue]All files processed successfully.[/bold blue]')
def parse_arguments() -> argparse.Namespace:
    """Read the command-line options for the log splitter.

    Returns:
        A namespace whose ``dir`` attribute holds the value given to
        ``--dir`` as a string, or ``'.'`` when the option is omitted.
    """
    dir_help = (
        'Directory containing the nginx log files '
        '(default: current working directory)'
    )
    cli = argparse.ArgumentParser(
        description='Split nginx logs per year/month with verbose logging.',
    )
    cli.add_argument('--dir', default='.', type=str, help=dir_help)
    namespace = cli.parse_args()
    return namespace
def main() -> None:
    """Main entry point for splitting nginx logs.

    Parses the command line, resolves the requested directory to an absolute
    path and splits the logs found in it.

    Raises:
        SystemExit: With status 1 when the requested path is not an existing
            directory, so callers such as shells and cron see the failure.
    """
    # Imported here so this function does not rely on changes to the
    # module-level import block.
    from rich.markup import escape

    args = parse_arguments()
    directory = Path(args.dir).resolve()
    # is_dir() is already False for paths that do not exist.
    if not directory.is_dir():
        # Escape the path so a '[' in it is not parsed as rich markup.
        console.print(
            f'[bold red]Error:[/bold red] {escape(str(directory))} '
            'is not a valid directory.'
        )
        raise SystemExit(1)
    console.print('[bold green]Starting log splitting...[/bold green]')
    split_nginx_logs(directory)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment