@tiandiao123
Last active October 17, 2023 03:45
vllm_benchmark.py
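# Benchmark script: measures prefill latency (single output token) and decode
# throughput (128 output tokens) for a local LLaMA-7B checkpoint served by vLLM.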
from vllm import LLM, SamplingParams
import torch
from torch import distributed as dist
import time
from tqdm import tqdm
import numpy as np
# Create an LLM.
llm = LLM(
    model="/home/lclcq/share/llama-7b",
    # model="/home/lccd/share/model_data/models--bigscience--bloom-560m/snapshots/4f42c91d806a19ae1a46af6c3fb5f4990d884cd6",
    # model="facebook/opt-125m",
    tensor_parallel_size=1,
    # max_num_seqs=1,
    # max_num_batched_tokens=2048,
    gpu_memory_utilization=0.95,
    trust_remote_code=True,
)
def run_to_completion(sampling_params, dummy_prompt_token_ids, profile: bool = False):
    """Run one llm.generate() call end to end and return its wall-clock latency in seconds."""
    if profile:
        torch.cuda.cudart().cudaProfilerStart()
    torch.cuda.synchronize()
    start_time = time.time()
    llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                 sampling_params=sampling_params,
                 use_tqdm=False)
    torch.cuda.synchronize()
    end_time = time.time()
    latency = end_time - start_time
    if profile:
        torch.cuda.cudart().cudaProfilerStop()
    return latency
# Prefill benchmark: with a single output token, latency is dominated by prompt processing.
batch = 8
input_len = 1024
out_len = 1
sampling_params = SamplingParams(
    n=1,
    temperature=1.0,
    top_p=1.0,
    use_beam_search=False,
    ignore_eos=True,
    max_tokens=out_len,
)
# Build dummy prompts: each of the `batch` prompts is token ids 0..input_len-1.
# (The random tensor below is unused.)
dummy_prompt_token_ids = []
dummy_prompt_token_ids_s = torch.randint(1, 10240, (batch, input_len))
for t in range(batch):
    dummy_prompt_token_ids.append(list(range(input_len)))
# print(dummy_prompt_token_ids)
# print("Warming up...")
for i in range(2):
run_to_completion(sampling_params, dummy_prompt_token_ids, profile=False)
# Benchmark prefill.
latencies = []
for _ in range(5):  # tqdm(range(5), desc="Profiling iterations")
    latencies.append(run_to_completion(sampling_params, dummy_prompt_token_ids, profile=False))
prefill_avg_latency = np.mean(latencies)
print(f'prefill latency: {prefill_avg_latency * 1000 / out_len} ms')
# print(f'Avg throughput: {out_len / avg_latency} tokens/second')
# Decode benchmark: same prompts, but generate 128 tokens per sequence.
out_len = 128
sampling_params = SamplingParams(
    n=1,
    temperature=1.0,
    top_p=1.0,
    use_beam_search=False,
    ignore_eos=True,
    max_tokens=out_len,
)
# Rebuild the same dummy prompts (the random tensor is again unused).
dummy_prompt_token_ids = []
dummy_prompt_token_ids_s = torch.randint(1, 10240, (batch, input_len))
for t in range(batch):
    dummy_prompt_token_ids.append(list(range(input_len)))
latencies = []
for _ in range(5):  # tqdm(range(5), desc="Profiling iterations")
    latencies.append(run_to_completion(sampling_params, dummy_prompt_token_ids, profile=False))
avg_latency = np.mean(latencies)
# print(f'Avg latency: {avg_latency * 1000 / out_len} ms')
# Decode throughput subtracts the measured prefill time, so it approximates
# tokens generated per second during the decode phase alone.
print(f'Decode throughput: {batch * out_len / (avg_latency - prefill_avg_latency)} tokens/s')
print(f'Total throughput: {batch * out_len / avg_latency} tokens/s')
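# Example with hypothetical numbers (not measured results): if avg_latency = 5.0 s and
# prefill_avg_latency = 0.5 s, then decode throughput = 8 * 128 / (5.0 - 0.5) ≈ 227.6 tokens/s
# and total throughput = 8 * 128 / 5.0 ≈ 204.8 tokens/s.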