@rochacbruno
Created September 2, 2025 17:52
Generate Random Huge 2GB Sample Log for Nginx
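Each generated line follows nginx's combined log format with a trailing response time:

<ip> - - [<DD/Mon/YYYY:HH:MM:SS +0000>] "<METHOD> <path> HTTP/1.1" <status> <bytes> "<referer>" "<user agent>" <response_time>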
#!/usr/bin/env python3
import asyncio
import random
import datetime
import time
import os
from concurrent.futures import ThreadPoolExecutor

# Configuration
BATCH_SIZE = 10000
CONCURRENT_BATCHES = 10
WRITE_BUFFER_SIZE = 50 * 1024 * 1024  # 50MB
TARGET_SIZE_GB = 2
NUM_THREADS = 8
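
# Rough sizing estimate: with the format below, lines average roughly
# 150-200 bytes, so a 2GB target is on the order of 11-14 million lines.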

# Pre-generate data pools
IPS = [
    f"{random.randint(1, 255)}.{random.randint(0, 255)}."
    f"{random.randint(0, 255)}.{random.randint(1, 255)}"
    for _ in range(1000)
]
METHODS = ["GET", "POST", "PUT", "DELETE", "HEAD", "OPTIONS"]
PATHS = [
    "/",
    "/api/users",
    "/api/products",
    "/api/orders",
    "/api/auth",
    "/login",
    "/logout",
    "/dashboard",
    "/profile",
    "/settings",
    "/static/css/main.css",
    "/static/js/app.js",
    "/static/js/vendor.js",
    "/images/logo.png",
    "/images/banner.jpg",
    "/favicon.ico",
    "/api/v1/data",
    "/api/v2/users",
    "/health",
    "/metrics",
    "/admin/dashboard",
    "/admin/users",
    "/admin/settings",
    "/products/1234",
    "/products/5678",
    "/cart",
    "/checkout",
]
STATUS_CODES = [
    200,
    201,
    204,
    301,
    302,
    304,
    400,
    401,
    403,
    404,
    500,
    502,
    503,
]
USER_AGENTS = [
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0.4472.124",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Safari/605.1.15",
    "curl/7.68.0",
    "Python/3.9 aiohttp/3.7.4",
    "PostmanRuntime/7.28.4",
]
REFERERS = [
    "-",
    "https://example.com",
    "https://google.com",
    "https://github.com",
]


def generate_log_lines(count: int) -> str:
    """Generate multiple log lines as a single string."""
    lines = []
    for _ in range(count):
        ip = random.choice(IPS)
        # Use a UTC timestamp so the hardcoded +0000 offset below is accurate
        timestamp = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(
            days=random.randint(0, 30),
            hours=random.randint(0, 23),
            minutes=random.randint(0, 59),
        )
        timestamp_str = timestamp.strftime("%d/%b/%Y:%H:%M:%S +0000")
        method = random.choice(METHODS)
        path = random.choice(PATHS)
        status = random.choice(STATUS_CODES)
        size = random.randint(100, 50000)
        referer = random.choice(REFERERS)
        user_agent = random.choice(USER_AGENTS)
        response_time = round(random.uniform(0.001, 5.0), 3)
        lines.append(
            f'{ip} - - [{timestamp_str}] "{method} {path} HTTP/1.1" {status} '
            f'{size} "{referer}" "{user_agent}" {response_time}\n'
        )
    return "".join(lines)


async def generate_batch(executor: ThreadPoolExecutor, batch_size: int) -> str:
    """Generate a batch of log lines using the thread pool."""
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(executor, generate_log_lines, batch_size)


async def write_to_file(file_handle, data: str, stats: dict):
    """Write data via the default executor so the event loop stays responsive."""
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, file_handle.write, data)
    stats["bytes_written"] += len(data.encode("utf-8"))
    stats["lines_written"] += data.count("\n")


async def generate_nginx_log(
    filename: str = "nginx_sample.log", target_size_gb: float = 2
):
    """Main async function to generate the log file."""
    target_size = int(target_size_gb * 1024 * 1024 * 1024)
    stats = {"bytes_written": 0, "lines_written": 0}

    print(f"Generating {target_size_gb}GB nginx log file...")
    print(f"Using {CONCURRENT_BATCHES} concurrent generators, {NUM_THREADS} threads")

    start_time = time.time()
    last_report_time = start_time
    last_bytes = 0

    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        with open(filename, "w", buffering=WRITE_BUFFER_SIZE) as f:
            pending_tasks = []
            while stats["bytes_written"] < target_size:
                # Keep queue filled with concurrent batch generations
                while (
                    len(pending_tasks) < CONCURRENT_BATCHES
                    and stats["bytes_written"] < target_size
                ):
                    task = asyncio.create_task(
                        generate_batch(executor, BATCH_SIZE)
                    )
                    pending_tasks.append(task)

                # Wait for at least one batch to complete
                done, pending = await asyncio.wait(
                    pending_tasks, return_when=asyncio.FIRST_COMPLETED
                )
                pending_tasks = list(pending)

                # Write completed batches
                for task in done:
                    batch_data = await task
                    await write_to_file(f, batch_data, stats)

                # Progress reporting
                current_time = time.time()
                if current_time - last_report_time >= 0.5:
                    elapsed = current_time - start_time
                    speed = (stats["bytes_written"] - last_bytes) / (
                        (current_time - last_report_time) * 1024 * 1024
                    )
                    avg_speed = stats["bytes_written"] / (elapsed * 1024 * 1024)
                    progress = (stats["bytes_written"] / target_size) * 100
                    gb_written = stats["bytes_written"] / (1024**3)
                    print(
                        f"\rProgress: {progress:.1f}% ({gb_written:.2f}GB) | "
                        f"Spd: {speed:.0f} MB/s | Avg: {avg_speed:.0f} MB/s | "
                        f"Lines: {stats['lines_written']:,}",
                        end="",
                        flush=True,
                    )
                    last_report_time = current_time
                    last_bytes = stats["bytes_written"]

            # Process any remaining tasks
            if pending_tasks:
                remaining = await asyncio.gather(*pending_tasks)
                for batch_data in remaining:
                    if stats["bytes_written"] < target_size:
                        await write_to_file(f, batch_data, stats)

    elapsed = time.time() - start_time
    final_size = os.path.getsize(filename)
    print(f"\n✓ Completed in {elapsed:.2f} seconds")
    print(f"Final file size: {final_size / (1024**3):.2f}GB")
    print(f"Average speed: {(final_size / (1024**2)) / elapsed:.0f} MB/s")
    print(f"Total lines: {stats['lines_written']:,}")


async def main():
    await generate_nginx_log("nginx_sample.log", 2)


if __name__ == "__main__":
    asyncio.run(main())
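
To sanity-check the output, here is a minimal sketch (assuming the default nginx_sample.log filename) that samples the first few lines and validates them against the format generate_log_lines() emits:

#!/usr/bin/env python3
import itertools
import re

# Pattern for: <ip> - - [<timestamp>] "<method> <path> HTTP/1.1" <status>
# <size> "<referer>" "<user agent>" <response_time>
LINE_RE = re.compile(
    r'^\d{1,3}(?:\.\d{1,3}){3} - - '
    r'\[[^\]]+\] '
    r'"[A-Z]+ \S+ HTTP/1\.1" '
    r'\d{3} \d+ '
    r'"[^"]*" "[^"]*" '
    r'\d+\.\d+$'
)

with open("nginx_sample.log") as f:
    for line in itertools.islice(f, 5):
        line = line.rstrip("\n")
        print("OK " if LINE_RE.match(line) else "BAD", line[:80])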