Skip to content

Instantly share code, notes, and snippets.

@wolph
Created March 11, 2025 08:35
Show Gist options
  • Save wolph/ae9176a9fa6a5e01ed25f550ba53bf3c to your computer and use it in GitHub Desktop.
Save wolph/ae9176a9fa6a5e01ed25f550ba53bf3c to your computer and use it in GitHub Desktop.
Script to split (large) existing nginx logs per year/month with colourful verbose output and progressbars.
#!/usr/bin/env python3
"""
Script to split existing nginx logs per year/month with verbose output.
This script accepts a configurable directory (defaulting to the current working
directory) of nginx logs. It looks for files named <domain>.access_log and
<domain>.error_log. Each log line is expected to contain a timestamp inside
square brackets, e.g., [10/Oct/2000:13:55:36 -0700]. The script extracts the
date and writes each line into a separate file under a subdirectory formatted
as YYYY-MM.
Verbose output is provided via rich progress bars and spinners.
Usage:
python split_nginx_logs.py --dir /path/to/logs
"""
import argparse
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional, Sequence, TextIO

from rich.console import Console
from rich.progress import (
    BarColumn,
    Progress,
    SpinnerColumn,
    TaskID,
    TextColumn,
    TimeRemainingColumn,
)
# Regular expression to extract the date part from a log entry. It captures
# the 'DD/Mon/YYYY' text that follows an opening '[' and is terminated by the
# ':' before the time, e.g. '10/Oct/2000' in '[10/Oct/2000:13:55:36 -0700]'.
DATE_REGEX = re.compile(r'\[([\d]{2}/[A-Za-z]{3}/[\d]{4}):')
# Shared rich console: status messages are printed through it and the progress
# display is attached to it.
console = Console()
# nginx error logs begin each entry with 'YYYY/MM/DD HH:MM:SS' rather than the
# bracketed access-log timestamp, so they are recognised by a second pattern.
_ERROR_LOG_DATE_REGEX = re.compile(r'^(\d{4}/\d{2}/\d{2}) ')


def _month_key(date_str: str, date_format: str) -> str:
    """Convert a captured date string into its 'YYYY-MM' bucket name.

    Args:
        date_str: Date text captured from a log line.
        date_format: The strptime format that describes date_str.

    Returns:
        The 'YYYY-MM' key, or 'unknown' when date_str is not a valid date.
    """
    try:
        date_obj = datetime.strptime(date_str, date_format)
    except ValueError:
        return 'unknown'
    return f'{date_obj.year}-{date_obj.month:02d}'


def process_log_file(
    file_path: Path,
    base_dir: Path,
    progress: Optional[Progress] = None,
    task_id: Optional[TaskID] = None,
) -> None:
    """Split a given nginx log file per year/month.

    Reads the log file line by line, extracts the date, and writes each line
    to its corresponding output file. The output file is created under a
    subdirectory of the base directory named <year>-<month> and retains the
    original file name. Lines without a recognisable or valid date are
    written to the 'unknown' subdirectory.

    Two date layouts are recognised: the bracketed access-log form
    ('[10/Oct/2000:13:55:36 -0700]') anywhere in the line and, if that is
    absent, the error-log form ('2000/10/10 13:55:36') at the start of the
    line.

    Existing output files with the same name are overwritten.

    Args:
        file_path: Path to the original log file.
        base_dir: Base directory for output files.
        progress: Optional rich Progress instance for updating progress.
        task_id: Optional rich TaskID for progress update.
    """
    out_files: Dict[str, TextIO] = {}
    # A log contains few distinct dates compared to its number of lines, so
    # each date string is parsed once and the resulting key is reused.
    key_cache: Dict[str, str] = {}
    try:
        # surrogateescape lets bytes that are not valid UTF-8 pass through
        # unchanged instead of aborting the split; newline='\n' disables
        # newline translation so lines are copied byte for byte and the
        # progress byte count matches the file size.
        with file_path.open(
            'r', encoding='utf-8', errors='surrogateescape', newline='\n'
        ) as fin:
            for line in fin:
                match = DATE_REGEX.search(line)
                date_format = '%d/%b/%Y'
                if match is None:
                    match = _ERROR_LOG_DATE_REGEX.match(line)
                    date_format = '%Y/%m/%d'
                if match is None:
                    key = 'unknown'
                else:
                    date_str = match.group(1)
                    key = key_cache.get(date_str)
                    if key is None:
                        key = _month_key(date_str, date_format)
                        key_cache[date_str] = key
                out_file = out_files.get(key)
                if out_file is None:
                    out_path = base_dir / key / file_path.name
                    out_path.parent.mkdir(parents=True, exist_ok=True)
                    out_file = out_path.open(
                        'w',
                        encoding='utf-8',
                        errors='surrogateescape',
                        newline='\n',
                    )
                    out_files[key] = out_file
                out_file.write(line)
                if progress is not None and task_id is not None:
                    # Advance progress by the number of bytes processed.
                    consumed = len(
                        line.encode('utf-8', errors='surrogateescape')
                    )
                    progress.advance(task_id, consumed)
    finally:
        # Ensure all open file handles are closed, even after an error.
        for handle in out_files.values():
            handle.close()
def split_nginx_logs(directory: Path) -> None:
    """Split all nginx log files in the given directory per year/month.

    It looks for regular files named <domain>.access_log and
    <domain>.error_log directly inside the specified directory and processes
    them in sorted order with verbose output.

    Args:
        directory: The directory where the log files are located.
    """
    # Directories that happen to match a pattern are skipped because they
    # cannot be opened as log files; sorting makes the order reproducible.
    log_files = sorted(
        path
        for pattern in ('*.access_log', '*.error_log')
        for path in directory.glob(pattern)
        if path.is_file()
    )
    if not log_files:
        console.print('[bold red]No log files found.[/bold red]')
        return
    console.print(
        f'[bold green]Found {len(log_files)} log files.[/bold green]'
    )
    # Create a progress bar for file processing.
    with Progress(
        SpinnerColumn(),
        '[progress.description]{task.description}',
        BarColumn(),
        '[progress.percentage]{task.percentage:>3.0f}%',
        TimeRemainingColumn(),
        console=console,
        transient=True,
    ) as progress:
        for log_file in log_files:
            try:
                file_size = log_file.stat().st_size
            except OSError:
                # The size only drives the progress bar, so fall back to 0
                # rather than failing before the file is even opened.
                file_size = 0
            task_description = f'Processing [cyan]{log_file.name}[/cyan]'
            task_id = progress.add_task(task_description, total=file_size)
            process_log_file(log_file, directory, progress, task_id)
            progress.remove_task(task_id)
            console.print(f'[green]Finished processing:[/green] {log_file}')
    console.print('[bold blue]All files processed successfully.[/bold blue]')
def parse_arguments(
    argv: Optional[Sequence[str]] = None,
) -> argparse.Namespace:
    """Parse command-line arguments.

    Args:
        argv: Argument strings to parse instead of the process command line.
            When None, argparse falls back to sys.argv[1:].

    Returns:
        The parsed arguments with the directory path available as `dir`.
    """
    parser = argparse.ArgumentParser(
        description='Split nginx logs per year/month with verbose logging.'
    )
    parser.add_argument(
        '--dir',
        type=str,
        default='.',
        help='Directory containing the nginx log files (default: current '
        'working directory)',
    )
    return parser.parse_args(argv)
def main() -> None:
    """Main entry point for splitting nginx logs.

    Resolves the directory given on the command line and splits the nginx
    logs found in it.

    Raises:
        SystemExit: With status 1 when the given path is not an existing
            directory, so that shells and calling scripts see the failure.
    """
    args = parse_arguments()
    directory = Path(args.dir).resolve()
    # is_dir() is already False for a missing path, so a single check covers
    # both "does not exist" and "is not a directory".
    if not directory.is_dir():
        console.print(
            f'[bold red]Error:[/bold red] {directory} is not a valid directory.'
        )
        raise SystemExit(1)
    console.print('[bold green]Starting log splitting...[/bold green]')
    split_nginx_logs(directory)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment