Skip to content

Instantly share code, notes, and snippets.

@fffej
Created March 2, 2025 13:20
Show Gist options
Save fffej/39516c2c7d035b2f8d867534e22bfda3 to your computer and use it in GitHub Desktop.
code for a blog post (ably assisted with AI, but mistakes are mine!)
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
def simulate_batch(N, T, p, batch_size):
    """Simulate processing N tasks in batches, with per-batch testing.

    A batch must be redone in full (paying its processing cost plus the
    test time T on every attempt) until every task in it passes at once,
    which happens with probability p ** batch_size per attempt.

    Parameters:
        N: Total number of tasks
        T: Time for testing a batch
        p: Probability a single task passes its test
        batch_size: Maximum tasks in a batch

    Returns:
        Total time taken to complete all N tasks
    """
    elapsed = 0
    remaining = N
    while remaining > 0:
        # Last batch may be smaller than batch_size.
        size = min(batch_size, remaining)
        passed = False
        while not passed:
            # Redo the whole batch, then re-test it.
            elapsed += size + T
            passed = random.random() < p ** size
        remaining -= size
    return elapsed
def expected_time_per_task(T, p, batch_size, max_value=1000):
    """Calculate the expected time per task for a given batch size.

    A batch of b tasks passes testing with probability p**b, so the
    expected number of attempts is p**(-b) (geometric distribution);
    each attempt costs b + T, giving (b + T) * p**(-b) / b per task.

    Parameters:
        T: Testing time per batch
        p: Probability a single task passes its test
        batch_size: Number of tasks in the batch
        max_value: Cap applied to the attempt count and the result so
            plots stay readable when p**(-b) blows up

    Returns:
        Expected time per task, capped at max_value
    """
    # Near-perfect reliability: a single attempt, with the test cost
    # amortized over the whole batch.
    if p >= 0.9999:
        return 1 + T / batch_size
    try:
        # Expected attempts until success: 1 / p**batch_size, computed
        # in log space to delay overflow, then capped.
        expected_attempts = min(np.exp(-batch_size * np.log(p)), max_value)
        result = ((batch_size + T) * expected_attempts) / batch_size
        return min(result, max_value)
    except (OverflowError, ValueError, ZeroDivisionError):
        # Numerically out of range (e.g. p <= 0, huge batch): treat as
        # "effectively infinite". Was a bare `except:`, which also
        # swallowed KeyboardInterrupt/SystemExit — now narrowed.
        return max_value
def find_optimal_batch_size(T, p, max_batch_size=20):
    """Return (batch_size, expected_time) minimizing expected time per task.

    Exhaustively evaluates batch sizes 1..max_batch_size; ties go to the
    smallest batch size.
    """
    candidates = range(1, max_batch_size + 1)
    # Pair cost with size so min() breaks ties on the smaller size,
    # matching a first-strict-minimum linear scan.
    best_cost, best_size = min(
        (expected_time_per_task(T, p, size), size) for size in candidates
    )
    return best_size, best_cost
def theoretical_optimal_batch_size(T, p):
    """Calculate the theoretical optimal batch size, sqrt(T / -ln(p)).

    Returns infinity for p == 1 (with perfect reliability, larger
    batches are always better); otherwise the rounded optimum, floored
    at 1.
    """
    if p == 1:
        return float('inf')
    continuous_optimum = (T / -np.log(p)) ** 0.5
    return max(1, int(round(continuous_optimum)))
def create_batch_size_analysis_plot(T_samples, p_samples, max_batch_size=20):
    """Plot expected time per task against batch size for each (T, p) pair.

    Line style encodes T, color encodes p. Saves the figure to
    'batch_size_effect.png' in the current directory.
    """
    batch_sizes = np.arange(1, max_batch_size + 1)
    styles = ['-', '--', '-.', ':']
    palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

    plt.figure(figsize=(10, 6))
    for t_idx, T in enumerate(T_samples):
        for p_idx, p in enumerate(p_samples):
            expected = [expected_time_per_task(T, p, b) for b in batch_sizes]
            plt.plot(batch_sizes, expected,
                     marker='o', markersize=4,
                     linestyle=styles[t_idx % len(styles)],
                     color=palette[p_idx % len(palette)],
                     label=f'T={T}, p={p}')

    plt.xlabel('Batch Size')
    plt.ylabel('Expected Time per Task')
    plt.title('Effect of Batch Size on Expected Time')
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.savefig('batch_size_effect.png', dpi=300)
def create_optimal_batch_size_heatmap(T_range, p_range):
    """Heatmap of the optimal batch size over a 40x40 grid of T and p.

    Rows correspond to p values, columns to T values. Saves the figure
    to 'optimal_batch_size.png' in the current directory.
    """
    T_values = np.linspace(T_range[0], T_range[1], 40)
    p_values = np.linspace(p_range[0], p_range[1], 40)
    T_grid, p_grid = np.meshgrid(T_values, p_values)

    optimal = np.zeros_like(T_grid)
    for row in range(optimal.shape[0]):
        for col in range(optimal.shape[1]):
            # Discard the associated cost; only the size is plotted.
            optimal[row, col], _ = find_optimal_batch_size(
                T_grid[row, col], p_grid[row, col])

    plt.figure(figsize=(10, 8))
    # Thin the tick labels to every 5th grid point for legibility.
    sns.heatmap(optimal,
                xticklabels=np.round(T_values[::5], 1),
                yticklabels=np.round(p_values[::5], 3),
                cmap='viridis',
                cbar_kws={'label': 'Optimal Batch Size'})
    plt.xlabel('Test Time (T)')
    plt.ylabel('Probability of Success (p)')
    plt.title('Optimal Batch Size for Minimizing Expected Time per Task')
    plt.savefig('optimal_batch_size.png', dpi=300)
def run_simulations_and_plot(N, T, p, max_batch_size, num_simulations=100):
    """Simulate total completion time across batch sizes; plot mean ± 95% CI.

    Runs num_simulations trials per batch size and saves the figure to
    'simulation_results.png' in the current directory.
    """
    sizes = range(1, min(max_batch_size, N) + 1)
    records = [
        {'Batch Size': size, 'Total Time': simulate_batch(N, T, p, size)}
        for size in sizes
        for _ in range(num_simulations)
    ]
    df = pd.DataFrame(records)

    grouped = df.groupby('Batch Size')['Total Time']
    means = grouped.mean()
    half_ci = grouped.sem() * 1.96  # half-width of the 95% confidence interval

    plt.figure(figsize=(10, 6))
    plt.errorbar(x=means.index, y=means.values, yerr=half_ci,
                 fmt='ro-', markersize=6, linewidth=1.5, capsize=4,
                 label='Mean ± 95% CI')

    # Roughly 10 x-ticks, always including the last batch size.
    step = max(1, max_batch_size // 10)
    ticks = list(range(1, max_batch_size + 1, step))
    if max_batch_size not in ticks:
        ticks.append(max_batch_size)
    plt.xticks(ticks)

    plt.xlabel('Batch Size')
    plt.ylabel('Total Time')
    plt.title(f'Distribution of Total Time vs Batch Size (N={N}, T={T}, p={p})')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('simulation_results.png')
if __name__ == "__main__":
    import os

    # NOTE(review): a 'results' directory is created, but the plotting
    # functions save into the current directory — confirm intent.
    os.makedirs('results', exist_ok=True)

    # Example parameters: task count, testing time, success probability.
    N, T, p = 100, 10, 0.95

    # Report the closed-form optimum for these parameters.
    optimal = theoretical_optimal_batch_size(T, p)
    print(f"Theoretical optimal batch size for T={T}, p={p}: {optimal}")

    # Analytical plots over a spread of T and p values.
    create_batch_size_analysis_plot([1, 5, 10, 20], [0.8, 0.9, 0.95, 0.99])
    create_optimal_batch_size_heatmap([1, 20], [0.8, 0.999])

    # Monte Carlo check of the analytical result.
    run_simulations_and_plot(N, T, p, 20)

    print("Analysis completed. Results saved in the current directory.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment