@rochacbruno
Created September 2, 2025 17:52
Generate Random Huge 2GB Sample Log for Nginx
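Each generated line follows nginx's combined log format with a trailing response time:

<ip> - - [<DD/Mon/YYYY:HH:MM:SS +0000>] "<METHOD> <path> HTTP/1.1" <status> <bytes> "<referer>" "<user agent>" <response_time>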
#!/usr/bin/env python3
import asyncio
import random
import datetime
import time
import os
from concurrent.futures import ThreadPoolExecutor

# Configuration
BATCH_SIZE = 10000
CONCURRENT_BATCHES = 10
WRITE_BUFFER_SIZE = 50 * 1024 * 1024  # 50MB
TARGET_SIZE_GB = 2
NUM_THREADS = 8
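
# Rough sizing estimate: with the format below, lines average roughly
# 150-200 bytes, so a 2GB target is on the order of 11-14 million lines.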

# Pre-generate data pools
IPS = [
    f"{random.randint(1, 255)}.{random.randint(0, 255)}."
    f"{random.randint(0, 255)}.{random.randint(1, 255)}"
    for _ in range(1000)
]
METHODS = ["GET", "POST", "PUT", "DELETE", "HEAD", "OPTIONS"]
PATHS = [
    "/",
    "/api/users",
    "/api/products",
    "/api/orders",
    "/api/auth",
    "/login",
    "/logout",
    "/dashboard",
    "/profile",
    "/settings",
    "/static/css/main.css",
    "/static/js/app.js",
    "/static/js/vendor.js",
    "/images/logo.png",
    "/images/banner.jpg",
    "/favicon.ico",
    "/api/v1/data",
    "/api/v2/users",
    "/health",
    "/metrics",
    "/admin/dashboard",
    "/admin/users",
    "/admin/settings",
    "/products/1234",
    "/products/5678",
    "/cart",
    "/checkout",
]
STATUS_CODES = [
    200,
    201,
    204,
    301,
    302,
    304,
    400,
    401,
    403,
    404,
    500,
    502,
    503,
]
USER_AGENTS = [
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0.4472.124",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Safari/605.1.15",
    "curl/7.68.0",
    "Python/3.9 aiohttp/3.7.4",
    "PostmanRuntime/7.28.4",
]
REFERERS = [
    "-",
    "https://example.com",
    "https://google.com",
    "https://github.com",
]


def generate_log_lines(count: int) -> str:
    """Generate multiple log lines as a single string."""
    lines = []
    for _ in range(count):
        ip = random.choice(IPS)
        # Use a UTC timestamp so the hardcoded +0000 offset below is accurate
        timestamp = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(
            days=random.randint(0, 30),
            hours=random.randint(0, 23),
            minutes=random.randint(0, 59),
        )
        timestamp_str = timestamp.strftime("%d/%b/%Y:%H:%M:%S +0000")
        method = random.choice(METHODS)
        path = random.choice(PATHS)
        status = random.choice(STATUS_CODES)
        size = random.randint(100, 50000)
        referer = random.choice(REFERERS)
        user_agent = random.choice(USER_AGENTS)
        response_time = round(random.uniform(0.001, 5.0), 3)
        lines.append(
            f'{ip} - - [{timestamp_str}] "{method} {path} HTTP/1.1" {status} '
            f'{size} "{referer}" "{user_agent}" {response_time}\n'
        )
    return "".join(lines)


async def generate_batch(executor: ThreadPoolExecutor, batch_size: int) -> str:
    """Generate a batch of log lines using the thread pool."""
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(executor, generate_log_lines, batch_size)


async def write_to_file(file_handle, data: str, stats: dict):
    """Write data via the default executor so the event loop stays responsive."""
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, file_handle.write, data)
    stats["bytes_written"] += len(data.encode("utf-8"))
    stats["lines_written"] += data.count("\n")


async def generate_nginx_log(
    filename: str = "nginx_sample.log", target_size_gb: float = 2
):
    """Main async function to generate the log file."""
    target_size = int(target_size_gb * 1024 * 1024 * 1024)
    stats = {"bytes_written": 0, "lines_written": 0}

    print(f"Generating {target_size_gb}GB nginx log file...")
    print(f"Using {CONCURRENT_BATCHES} concurrent generators, {NUM_THREADS} threads")

    start_time = time.time()
    last_report_time = start_time
    last_bytes = 0

    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        with open(filename, "w", buffering=WRITE_BUFFER_SIZE) as f:
            pending_tasks = []
            while stats["bytes_written"] < target_size:
                # Keep queue filled with concurrent batch generations
                while (
                    len(pending_tasks) < CONCURRENT_BATCHES
                    and stats["bytes_written"] < target_size
                ):
                    task = asyncio.create_task(
                        generate_batch(executor, BATCH_SIZE)
                    )
                    pending_tasks.append(task)

                # Wait for at least one batch to complete
                done, pending = await asyncio.wait(
                    pending_tasks, return_when=asyncio.FIRST_COMPLETED
                )
                pending_tasks = list(pending)

                # Write completed batches
                for task in done:
                    batch_data = await task
                    await write_to_file(f, batch_data, stats)

                # Progress reporting
                current_time = time.time()
                if current_time - last_report_time >= 0.5:
                    elapsed = current_time - start_time
                    speed = (stats["bytes_written"] - last_bytes) / (
                        (current_time - last_report_time) * 1024 * 1024
                    )
                    avg_speed = stats["bytes_written"] / (elapsed * 1024 * 1024)
                    progress = (stats["bytes_written"] / target_size) * 100
                    gb_written = stats["bytes_written"] / (1024**3)
                    print(
                        f"\rProgress: {progress:.1f}% ({gb_written:.2f}GB) | "
                        f"Spd: {speed:.0f} MB/s | Avg: {avg_speed:.0f} MB/s | "
                        f"Lines: {stats['lines_written']:,}",
                        end="",
                        flush=True,
                    )
                    last_report_time = current_time
                    last_bytes = stats["bytes_written"]

            # Process any remaining tasks
            if pending_tasks:
                remaining = await asyncio.gather(*pending_tasks)
                for batch_data in remaining:
                    if stats["bytes_written"] < target_size:
                        await write_to_file(f, batch_data, stats)

    elapsed = time.time() - start_time
    final_size = os.path.getsize(filename)
    print(f"\n✓ Completed in {elapsed:.2f} seconds")
    print(f"Final file size: {final_size / (1024**3):.2f}GB")
    print(f"Average speed: {(final_size / (1024**2)) / elapsed:.0f} MB/s")
    print(f"Total lines: {stats['lines_written']:,}")


async def main():
    await generate_nginx_log("nginx_sample.log", 2)


if __name__ == "__main__":
    asyncio.run(main())
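
To sanity-check the output, here is a minimal sketch (assuming the default nginx_sample.log filename) that samples the first few lines and validates them against the format generate_log_lines() emits:

#!/usr/bin/env python3
import itertools
import re

# Pattern for: <ip> - - [<timestamp>] "<method> <path> HTTP/1.1" <status>
# <size> "<referer>" "<user agent>" <response_time>
LINE_RE = re.compile(
    r'^\d{1,3}(?:\.\d{1,3}){3} - - '
    r'\[[^\]]+\] '
    r'"[A-Z]+ \S+ HTTP/1\.1" '
    r'\d{3} \d+ '
    r'"[^"]*" "[^"]*" '
    r'\d+\.\d+$'
)

with open("nginx_sample.log") as f:
    for line in itertools.islice(f, 5):
        line = line.rstrip("\n")
        print("OK " if LINE_RE.match(line) else "BAD", line[:80])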