code for a blog post (ably assisted with AI, but mistakes are mine!)

import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

def simulate_batch(N, T, p, batch_size):
    """
    Simulate processing N tasks in batches with testing.

    Parameters:
        N: Total number of tasks
        T: Time for testing a batch
        p: Probability a single task passes its test
        batch_size: Maximum tasks in a batch

    Returns:
        Total time taken
    """
    total_time = 0
    tasks_remaining = N
    while tasks_remaining > 0:
        current_batch = min(batch_size, tasks_remaining)
        # Process batch until it passes testing
        while True:
            # Time to process tasks + test
            total_time += current_batch + T
            # Check if batch passes (all tasks must pass)
            if random.random() < (p ** current_batch):
                break
        tasks_remaining -= current_batch
    return total_time
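
# Note: the retry loop above makes the number of attempts for a batch geometric
# with success probability p ** current_batch, so a batch needs on average
# 1 / p ** current_batch attempts -- the same quantity expected_time_per_task
# uses below.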

def expected_time_per_task(T, p, batch_size, max_value=1000):
    """
    Calculate expected time per task for a given batch size.

    Parameters:
        T: Testing time
        p: Success probability
        batch_size: Batch size
        max_value: Cap on the returned value, to keep plots readable

    Returns:
        Expected time per task
    """
    # Handle the (near-)perfect reliability edge case
    if p >= 0.9999:
        return 1 + T / batch_size
    try:
        # Expected attempts until success: 1 / (p ** batch_size)
        expected_attempts = min(np.exp(-batch_size * np.log(p)), max_value)
        # Expected time per task
        result = ((batch_size + T) * expected_attempts) / batch_size
        return min(result, max_value)
    except (OverflowError, ValueError):
        return max_value
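
# Worked example: with T=10, p=0.95 and batch_size=10, expected_attempts is
# 0.95 ** -10, about 1.67, so the expected time per task is
# (10 + 10) * 1.67 / 10, roughly 3.3.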

def find_optimal_batch_size(T, p, max_batch_size=20):
    """Find batch size that minimizes expected time per task."""
    min_time = float('inf')
    optimal_batch = 1
    for batch_size in range(1, max_batch_size + 1):
        time = expected_time_per_task(T, p, batch_size)
        if time < min_time:
            min_time = time
            optimal_batch = batch_size
    return optimal_batch, min_time

def theoretical_optimal_batch_size(T, p):
    """Calculate theoretical optimal batch size."""
    if p == 1:
        return float('inf')  # With perfect reliability, larger batches are always better
    optimal = np.sqrt(T / (-np.log(p)))
    return max(1, int(round(optimal)))
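
# Where the square-root rule comes from (a sketch, valid when p is close to 1):
# the expected time per task is f(b) = (b + T) / (b * p**b). Writing p**-b as
# exp(-b * ln p) and keeping only the first-order term gives
#     f(b) ~ (1 + T/b) * (1 - b * ln p) ~ 1 + T/b - b * ln p  (plus a constant),
# and T/b + b * (-ln p) is minimized at b = sqrt(T / (-ln p)).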

def create_batch_size_analysis_plot(T_samples, p_samples, max_batch_size=20):
    """Create plot showing effect of batch size on expected time."""
    batch_sizes = np.arange(1, max_batch_size + 1)
    plt.figure(figsize=(10, 6))
    line_styles = ['-', '--', '-.', ':']
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
    for i, T in enumerate(T_samples):
        for j, p in enumerate(p_samples):
            times = [expected_time_per_task(T, p, b) for b in batch_sizes]
            plt.plot(batch_sizes, times,
                     marker='o', markersize=4,
                     linestyle=line_styles[i % len(line_styles)],
                     color=colors[j % len(colors)],
                     label=f'T={T}, p={p}')
    plt.xlabel('Batch Size')
    plt.ylabel('Expected Time per Task')
    plt.title('Effect of Batch Size on Expected Time')
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.savefig('batch_size_effect.png', dpi=300)

def create_optimal_batch_size_heatmap(T_range, p_range):
    """Create heatmap showing optimal batch size for different T and p values."""
    T_values = np.linspace(T_range[0], T_range[1], 40)
    p_values = np.linspace(p_range[0], p_range[1], 40)
    T_grid, p_grid = np.meshgrid(T_values, p_values)
    optimal_batch = np.zeros_like(T_grid)
    for i in range(len(p_values)):
        for j in range(len(T_values)):
            optimal_batch[i, j], _ = find_optimal_batch_size(
                T_grid[i, j], p_grid[i, j])
    plt.figure(figsize=(10, 8))
    # Label the grid with the actual T and p values, showing every 5th tick
    heatmap_data = pd.DataFrame(optimal_batch,
                                index=np.round(p_values, 3),
                                columns=np.round(T_values, 1))
    sns.heatmap(heatmap_data,
                xticklabels=5,
                yticklabels=5,
                cmap='viridis',
                cbar_kws={'label': 'Optimal Batch Size'})
    plt.xlabel('Test Time (T)')
    plt.ylabel('Probability of Success (p)')
    plt.title('Optimal Batch Size for Minimizing Expected Time per Task')
    plt.savefig('optimal_batch_size.png', dpi=300)

def run_simulations_and_plot(N, T, p, max_batch_size, num_simulations=100):
    """Run simulations for different batch sizes and plot the results."""
    batch_sizes = list(range(1, min(max_batch_size, N) + 1))
    # Run simulations
    data = []
    for batch_size in batch_sizes:
        batch_times = [simulate_batch(N, T, p, batch_size) for _ in range(num_simulations)]
        for time in batch_times:
            data.append({'Batch Size': batch_size, 'Total Time': time})
    df = pd.DataFrame(data)
    # Calculate statistics
    means = df.groupby('Batch Size')['Total Time'].mean()
    errors = df.groupby('Batch Size')['Total Time'].sem() * 1.96  # 95% confidence interval
    # Create plot
    plt.figure(figsize=(10, 6))
    plt.errorbar(x=means.index, y=means.values, yerr=errors,
                 fmt='ro-', markersize=6, linewidth=1.5, capsize=4, label='Mean ± 95% CI')
    # Set x-axis labels
    step = max(1, max_batch_size // 10)
    tick_positions = list(range(1, max_batch_size + 1, step))
    if max_batch_size not in tick_positions:
        tick_positions.append(max_batch_size)
    plt.xticks(tick_positions)
    # Add labels and title
    plt.xlabel('Batch Size')
    plt.ylabel('Total Time')
    plt.title(f'Distribution of Total Time vs Batch Size (N={N}, T={T}, p={p})')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('simulation_results.png')
| if __name__ == "__main__": | |
| # Create directory for results | |
| import os | |
| os.makedirs('results', exist_ok=True) | |
| # Example parameters | |
| N = 100 # Total tasks | |
| T = 10 # Testing time | |
| p = 0.95 # Success probability | |
| # Show optimal batch size calculation | |
| optimal = theoretical_optimal_batch_size(T, p) | |
| print(f"Theoretical optimal batch size for T={T}, p={p}: {optimal}") | |
| # Create batch size analysis plot | |
| T_samples = [1, 5, 10, 20] | |
| p_samples = [0.8, 0.9, 0.95, 0.99] | |
| create_batch_size_analysis_plot(T_samples, p_samples) | |
| # Create optimal batch size heatmap | |
| create_optimal_batch_size_heatmap([1, 20], [0.8, 0.999]) | |
| # Run simulations | |
| run_simulations_and_plot(N, T, p, 20) | |
| print("Analysis completed. Results saved in the current directory.") |