Last active
March 20, 2025 18:51
-
-
Save drbh/8300a39f57316a25497b0b4a71bf756b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script
# dependencies = [
#   "requests<3",
# ]
# ///
import concurrent.futures
import copy
import os
import threading
import time
from datetime import datetime

import requests
# Target URL for every request; override with the ENDPOINT_URL environment variable.
ENDPOINT_URL = os.environ.get("ENDPOINT_URL", "http://localhost:3000/generate")
# Optional bearer token read from the HF_TOKEN environment variable (None when unset).
HF_TOKEN = os.environ.get("HF_TOKEN")

# Total number of requests to send; lower it (e.g. to 40) for a quick smoke run.
NUM_REQUESTS = 100_000
# Size of the thread pool that issues the requests.
MAX_WORKERS = 20
# Seed written into every request's "parameters" before it is sent.
FIXED_SEED = 1

# Only attach the Authorization header when a token is configured; otherwise
# the f-string would send the literal text "Bearer None".
headers = {"Content-Type": "application/json"}
if HF_TOKEN:
    headers["Authorization"] = f"Bearer {HF_TOKEN}"

# Request body template; the seed is filled in per request on a copy.
payload = {"inputs": "ok bip bop [MASK]", "parameters": {"seed": None}}

# Running latency extremes (seconds), updated by the worker function.
min_time = float("inf")
max_time = float("-inf")
# Serialises updates to the shared min_time / max_time globals across workers.
_timing_lock = threading.Lock()


def process_request(request_num, timeout=60.0):
    """Send one POST to ENDPOINT_URL, log the outcome, and record its latency.

    Args:
        request_num: Sequence number of the request; used only to tag log lines.
        timeout: Seconds passed to ``requests.post`` as its timeout. ``None``
            waits indefinitely, which was the previous behaviour.

    Returns:
        True if the endpoint answered with HTTP 200, otherwise False. A request
        that raises ``requests.RequestException`` (connection error, timeout,
        ...) is logged and reported as False instead of propagating.
    """
    global min_time, max_time

    # Build the body before starting the timer so only the HTTP round trip is measured.
    payload_clone = copy.deepcopy(payload)
    payload_clone["parameters"]["seed"] = FIXED_SEED

    # perf_counter is monotonic, so elapsed can never go negative on a clock adjustment.
    start = time.perf_counter()
    try:
        response = requests.post(
            ENDPOINT_URL, headers=headers, json=payload_clone, timeout=timeout
        )
    except requests.RequestException as exc:
        elapsed = time.perf_counter() - start
        print(f"[{request_num:06d}] ERROR: Request failed after {elapsed:.3f}s: {exc}")
        # No response was received, so this attempt is left out of min/max latency.
        return False
    elapsed = time.perf_counter() - start

    status = response.status_code
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
    elapsed_str = f"{elapsed:07.3f}"
    # response.content is the raw body, so its length really is a byte count
    # (len(response.text) would count decoded characters).
    size = len(response.content)
    print(
        f"[{request_num:06d}] [{timestamp}] [Status: {status}] [{elapsed_str}s] [Size: {size} bytes] [Seed: {FIXED_SEED}]"
    )
    if status != 200:
        print(f"[{request_num:06d}] ERROR: Received status {status}")

    # The read-compare-write on the globals is not atomic across threads.
    with _timing_lock:
        min_time = min(min_time, elapsed)
        max_time = max(max_time, elapsed)
    return status == 200
# Run benchmark: fan NUM_REQUESTS calls of process_request out over a thread
# pool, count the successful ones, then print a summary.
print(f"Starting {NUM_REQUESTS} requests to {ENDPOINT_URL} with {MAX_WORKERS} workers")

success_count = 0
# perf_counter is monotonic, so the total is unaffected by wall-clock adjustments.
start_time = time.perf_counter()
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(process_request, i + 1) for i in range(NUM_REQUESTS)]
    for future in concurrent.futures.as_completed(futures):
        # future.result() re-raises whatever the worker raised; count that as a
        # failed request instead of aborting the run before the summary prints.
        try:
            succeeded = future.result()
        except Exception as exc:
            print(f"ERROR: Request raised {type(exc).__name__}: {exc}")
            succeeded = False
        if succeeded:
            success_count += 1
total_time = time.perf_counter() - start_time

print("\nSummary:")
print(f"Total requests: {NUM_REQUESTS}")
print(f"Successful: {success_count}")
print(f"Failed: {NUM_REQUESTS - success_count}")
print(f"Total time: {total_time:.2f} seconds")
print(f"Min time: {min_time:.3f} seconds")
print(f"Max time: {max_time:.3f} seconds")
print(f"Requests per second: {NUM_REQUESTS / total_time:.2f}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment