# Source: GitHub Gist by @drbh (drbh/8300a39f57316a25497b0b4a71bf756b),
# last active March 20, 2025 18:51.
# /// script
# dependencies = [
# "requests<3",
# ]
# ///
import concurrent.futures
import copy
import os
import threading
import time
from datetime import datetime

import requests
# --- Benchmark configuration -------------------------------------------------
# Every setting that is read from the environment falls back to the value the
# script originally hard-coded, so running it with no variables set behaves as
# before.

# URL that each benchmark request is POSTed to.
ENDPOINT_URL = os.environ.get("ENDPOINT_URL", "http://localhost:3000/generate")

# Optional bearer token; None when the HF_TOKEN variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Total number of requests to send (set NUM_REQUESTS=40 for a quick smoke run).
NUM_REQUESTS = int(os.environ.get("NUM_REQUESTS", "100000"))

# Size of the thread pool, i.e. the maximum number of in-flight requests.
MAX_WORKERS = int(os.environ.get("MAX_WORKERS", "20"))

# Seed written into every request's "parameters" so all requests are identical.
FIXED_SEED = 1

# Only attach the Authorization header when a token is available; formatting
# an unset token would otherwise send the literal string "Bearer None".
headers = {"Content-Type": "application/json"}
if HF_TOKEN:
    headers["Authorization"] = f"Bearer {HF_TOKEN}"

# Request body template. It is copied per request and the seed is filled in
# on the copy, so this object itself is never mutated.
payload = {"inputs": "ok bip bop [MASK]", "parameters": {"seed": None}}

# Running latency extremes (seconds), updated by process_request.
min_time = float("inf")
max_time = float("-inf")
# Serializes updates to the shared min_time / max_time globals: the function
# below runs on many worker threads at once, and an unguarded
# "read, compare, write back" sequence can lose an update.
_timing_lock = threading.Lock()


def process_request(request_num, *, timeout=120.0):
    """Send one benchmark request and log a single result line for it.

    A copy of the module-level ``payload`` is POSTed to ``ENDPOINT_URL`` with
    its seed set to ``FIXED_SEED``. The elapsed time of every request that
    receives an HTTP response (whatever its status) is folded into the
    module-level ``min_time`` / ``max_time``.

    Args:
        request_num: Sequence number of this request; used only in log output.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block its worker thread
            indefinitely.

    Returns:
        True if the server answered with HTTP 200. False for any other status
        and for transport-level failures (connection error, timeout, ...),
        which are logged instead of raised.
    """
    global min_time, max_time

    # Work on a private copy so concurrent workers never share a mutable body.
    payload_clone = copy.deepcopy(payload)
    payload_clone["parameters"]["seed"] = FIXED_SEED

    # perf_counter is monotonic, so the measurement cannot be skewed (or go
    # negative) if the system clock is adjusted mid-run.
    start_time = time.perf_counter()
    try:
        response = requests.post(
            ENDPOINT_URL, headers=headers, json=payload_clone, timeout=timeout
        )
    except requests.RequestException as exc:
        elapsed = time.perf_counter() - start_time
        # No response was received, so this attempt is reported as a failure
        # and deliberately left out of the min/max latency figures.
        print(
            f"[{request_num:06d}] ERROR: request failed after {elapsed:.3f}s "
            f"({type(exc).__name__}: {exc})"
        )
        return False
    elapsed = time.perf_counter() - start_time

    status = response.status_code
    timestamp = datetime.now().isoformat(sep=" ", timespec="milliseconds")
    # response.content is the raw body, so its length really is a byte count
    # (response.text is decoded and would count characters instead).
    print(
        f"[{request_num:06d}] [{timestamp}] [Status: {status}] [{elapsed:07.3f}s] "
        f"[Size: {len(response.content)} bytes] [Seed: {FIXED_SEED}]"
    )
    if status != 200:
        print(f"[{request_num:06d}] ERROR: Received status {status}")

    with _timing_lock:
        min_time = min(min_time, elapsed)
        max_time = max(max_time, elapsed)
    return status == 200
# Run benchmark: fan NUM_REQUESTS calls to process_request out over a thread
# pool, count the successes, then print a summary.
print(f"Starting {NUM_REQUESTS} requests to {ENDPOINT_URL} with {MAX_WORKERS} workers")
success_count = 0
# perf_counter is monotonic, so the total is immune to system clock changes.
start_time = time.perf_counter()
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [
        executor.submit(process_request, request_num)
        for request_num in range(1, NUM_REQUESTS + 1)
    ]
    for future in concurrent.futures.as_completed(futures):
        try:
            succeeded = future.result()
        except Exception as exc:
            # Top-level boundary: a request that raised counts as a failure
            # rather than aborting the run and losing the summary below.
            print(f"ERROR: request raised {type(exc).__name__}: {exc}")
            succeeded = False
        if succeeded:
            success_count += 1
total_time = time.perf_counter() - start_time

print("\nSummary:")
print(f"Total requests: {NUM_REQUESTS}")
print(f"Successful: {success_count}")
print(f"Failed: {NUM_REQUESTS - success_count}")
print(f"Total time: {total_time:.2f} seconds")
# min_time / max_time keep their inf / -inf sentinels when no request was
# timed; report that explicitly instead of printing "inf seconds".
if min_time <= max_time:
    print(f"Min time: {min_time:.3f} seconds")
    print(f"Max time: {max_time:.3f} seconds")
else:
    print("Min time: n/a")
    print("Max time: n/a")
# Guard the division so an empty or instantaneous run cannot raise.
requests_per_second = NUM_REQUESTS / total_time if total_time > 0 else 0.0
print(f"Requests per second: {requests_per_second:.2f}")
# (GitHub page footer: sign in on GitHub to comment on this gist.)