sudo apt update
sudo apt install software-properties-common -y
git clone https://github.com/vllm-project/vllm
cd vllm/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
mkdir results
python benchmark_serving.py \
    --backend vllm \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --dataset-name sharegpt \
    --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
    --port 9999
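benchmark_serving.py only drives load; a vLLM server must already be listening on the target port. A minimal way to start one in another terminal (the exact entrypoint spelling varies across vLLM versions):

vllm serve meta-llama/Meta-Llama-3-8B-Instruct --port 9999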
import os
import asyncio
import subprocess
import time
from typing import List, Dict
import torch
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm
import logging
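A minimal sketch of what these imports typically get combined into: an async client firing concurrent requests at an OpenAI-compatible endpoint (the URL, port, and model name here are assumptions, e.g. a local vLLM server):

client = AsyncOpenAI(base_url="http://localhost:9999/v1", api_key="EMPTY")  # assumed local server

async def send_one(prompt: str) -> str:
    # One chat-completion round-trip; awaitable so many can be in flight at once
    resp = await client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        messages=[{"role": "user", "content": prompt}],
    )
    return resp.choices[0].message.content

async def run_all(prompts: List[str]) -> List[str]:
    # tqdm.asyncio's gather shows a progress bar over the in-flight requests
    return await tqdm.gather(*[send_one(p) for p in prompts])

# results = asyncio.run(run_all(prompts))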
from transformers import AutoModelForCausalLM, AutoTokenizer, StaticCache
import torch
from typing import Optional

device = "cuda"

# Copied from the gpt-fast repo
def multinomial_sample_one_no_sync(probs_sort):  # Does multinomial sampling without a cuda synchronization
    q = torch.empty_like(probs_sort).exponential_(1)
    return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
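For context, gpt-fast pairs this helper with sampling code along these lines (paraphrased from the same repo; expects logits shaped [batch, seq, vocab]):

def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
    logits = logits / max(temperature, 1e-5)
    if top_k is not None:
        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
        pivot = v.select(-1, -1).unsqueeze(-1)  # smallest logit that survives top-k
        logits = torch.where(logits < pivot, -float("Inf"), logits)
    return torch.nn.functional.softmax(logits, dim=-1)

def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
    probs = logits_to_probs(logits[0, -1], temperature, top_k)
    idx_next = multinomial_sample_one_no_sync(probs)  # no device->host sync
    return idx_next, probs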
--------------------------------------------------------------------------
# ofed_info -s
--------------------------------------------------------------------------
Find Mellanox Adapter Type and Firmware/Driver version

ConnectX-3 card (example):
# lspci | grep Mellanox
0a:00.0 Network controller: Mellanox Technologies MT27500 Family [ConnectX-3]
# lspci -vv -s 0a:00.0 | grep "Part number" -A 3
# lspci | grep Mellanox | awk '{print $1}' | xargs -i -r mstvpd {}
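The commands above identify the adapter itself; for the firmware and driver versions these generally work as well (the interface name is an assumption):

# ibv_devinfo | grep -E "hca_id|fw_ver"
# ethtool -i eth0 | grep -E "driver|firmware-version"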
# Benchmark relative performance of torch.mm and torch.bmm with a single batch
import torch
import time

def benchmark_fn(fn, args, warmup=5, cycles=300, use_kineto=False) -> float:
    if use_kineto:
        with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as p:
            fn(*args)
        return sum([e.cuda_time for e in p.key_averages()])
    # Warm up, then wall-clock `cycles` calls between CUDA syncs
    for _ in range(warmup): fn(*args)
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(cycles):
        fn(*args)
    torch.cuda.synchronize()
    return (time.perf_counter() - start) * 1e6 / cycles  # usec per call
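Example usage for the mm-vs-bmm comparison named in the comment above (matrix sizes are arbitrary):

a = torch.randn(4096, 4096, device="cuda")
b = torch.randn(4096, 4096, device="cuda")
print("mm  usec/call:", benchmark_fn(torch.mm, (a, b)))
print("bmm usec/call:", benchmark_fn(torch.bmm, (a.unsqueeze(0), b.unsqueeze(0))))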
import torch
from torch.utils.flop_counter import FlopCounterMode
from triton.testing import do_bench

def get_flops_achieved(f):
    flop_counter = FlopCounterMode(display=False)
    with flop_counter:
        f()
    total_flops = flop_counter.get_total_flops()
    ms_per_iter = do_bench(f)
    iters_per_second = 1e3 / ms_per_iter
    print(f"{iters_per_second * total_flops / 1e12:.2f} TF/s")
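Usage is just wrapping the workload in a thunk, e.g. to compare eager vs torch.compile (the layer and shapes here are made up):

model = torch.nn.Linear(8192, 8192, device="cuda", dtype=torch.bfloat16)
x = torch.randn(8192, 8192, device="cuda", dtype=torch.bfloat16)
get_flops_achieved(lambda: model(x))
get_flops_achieved(torch.compile(lambda: model(x)))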
import argparse
import math

# Helper function to pretty-print FLOP counts
def convert_flops(params):
    if params == 0:
        return "0"
    size_name = ("", "KFLOPs", "MFLOPs", "GFLOPs", "TFLOPs", "PFLOPs", "EFLOPs", "ZFLOPs", "YFLOPs")
    i = int(math.floor(math.log(params, 1000)))
    p = math.pow(1000, i)
    s = round(params / p, 2)
    return f"{s} {size_name[i]}"
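Quick sanity check (the number is just an example):

print(convert_flops(312e12))  # '312.0 TFLOPs' -- e.g. A100 peak BF16 throughput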
import argparse
import math

# Helper function to pretty-print parameter counts
def convert_params(params):
    if params == 0:
        return "0"
    size_name = ("", "K", "M", "B", "T", "P", "E", "Z", "Y")
    i = int(math.floor(math.log(params, 1000)))
    p = math.pow(1000, i)
    s = round(params / p, 2)
    return f"{s} {size_name[i]}"
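And likewise for parameter counts:

print(convert_params(6.7e9))  # '6.7 B'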