sudo apt update
sudo apt install software-properties-common -y
import torch
from torch.utils.flop_counter import FlopCounterMode
from triton.testing import do_bench

def get_flops_achieved(f):
    flop_counter = FlopCounterMode(display=False)
    with flop_counter:
        f()  # run once under the FLOP counter to record the work done
    total_flops = flop_counter.get_total_flops()
    ms_per_iter = do_bench(f)  # median wall-clock time per iteration, in ms
    iters_per_second = 1e3 / ms_per_iter
    print(f"{iters_per_second * total_flops / 1e12} TF/s")
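A hypothetical usage of the helper above on a plain bf16 matmul (the shapes and dtype here are assumptions, not from the original):

a = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
b = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
get_flops_achieved(lambda: torch.mm(a, b))  # prints achieved TF/s for the matmul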
# Benchmark relative performance of torch.mm and torch.bmm with a single batch
import torch
import time

def benchmark_fn(fn, args, warmup=5, cycles=300, use_kineto=False) -> float:
    if use_kineto:
        with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as p:
            fn(*args)
        return sum(e.cuda_time for e in p.key_averages())  # total CUDA time, in us
    for _ in range(warmup):
        fn(*args)
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(cycles):
        fn(*args)
    torch.cuda.synchronize()
    return (time.time() - start) / cycles  # seconds per iteration
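A minimal sketch of how benchmark_fn might drive the mm/bmm comparison the comment describes (matrix sizes are assumptions):

a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")
mm_s = benchmark_fn(torch.mm, (a, b))
bmm_s = benchmark_fn(torch.bmm, (a.unsqueeze(0), b.unsqueeze(0)))
print(f"mm: {mm_s * 1e6:.1f} us/iter, bmm (batch=1): {bmm_s * 1e6:.1f} us/iter")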
--------------------------------------------------------------------------
# ofed_info -s
--------------------------------------------------------------------------
Find Mellanox Adapter Type and Firmware/Driver version
ConnectX-3 card:
# lspci | grep Mellanox
0a:00.0 Network controller: Mellanox Technologies MT27500 Family [ConnectX-3]
# lspci -vv -s 0a:00.0 | grep "Part number" -A 3
# lspci | grep Mellanox | awk '{print $1}' | xargs -i -r mstvpd {}
from transformers import AutoModelForCausalLM, AutoTokenizer, StaticCache
import torch
from typing import Optional

device = "cuda"

# Copied from the gpt-fast repo
def multinomial_sample_one_no_sync(probs_sort):  # Does multinomial sampling without a cuda synchronization
    q = torch.empty_like(probs_sort).exponential_(1)
    return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
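This helper uses the exponential-race (Gumbel-max style) trick: dividing the probabilities by i.i.d. Exponential(1) noise and taking the argmax selects index i with probability proportional to probs_sort[i], without the device synchronization that torch.multinomial incurs. A hypothetical usage (vocab size and temperature are assumed values):

logits = torch.randn(1, 32000, device=device)
probs = torch.softmax(logits / 0.8, dim=-1)  # temperature 0.8 is an assumption
next_token = multinomial_sample_one_no_sync(probs)  # shape (1, 1), dtype int32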
import os
import asyncio
import subprocess
import time
from typing import List, Dict

import torch
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm
import logging
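These imports are typically combined along the following lines; everything below (the helper name run_benchmark, the endpoint URL, and the model name) is an assumed sketch, not code from the original:

async def run_benchmark(prompts: List[str]) -> Dict[str, float]:
    # Assumed OpenAI-compatible endpoint, e.g. a local vLLM server
    client = AsyncOpenAI(base_url="http://localhost:9999/v1", api_key="EMPTY")

    async def one_request(prompt: str) -> float:
        start = time.perf_counter()
        await client.chat.completions.create(
            model="meta-llama/Meta-Llama-3-8B-Instruct",  # assumed model
            messages=[{"role": "user", "content": prompt}],
        )
        return time.perf_counter() - start

    # tqdm.asyncio's gather shows a progress bar over the concurrent requests
    latencies = await tqdm.gather(*(one_request(p) for p in prompts))
    return {"mean_latency_s": sum(latencies) / len(latencies)}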
git clone https://github.com/vllm-project/vllm
cd vllm/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
mkdir results
python benchmark_serving.py \
    --backend vllm \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --dataset-name sharegpt \
    --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
    --port 9999
This doc serves as a quick reference for the _scaled_mm
API and how it has changed over time across major versions of PyTorch.
NOTE The leading underscore is intentional, and we make no FC/BC (forward/backward compatibility) guarantees on this API. That said, it is currently the only op with native support for FP8 matmuls in the PyTorch library. We are planning to add an official public API for this; until then, the API is subject to change, but you can use this doc as a reference.
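For orientation, a hedged sketch of one call against the recent (PyTorch 2.4-style) signature, where _scaled_mm returns a single tensor rather than the earlier (out, amax) pair; the shapes, scales, and dtypes below are assumptions:

import torch

# FP8 matmul: matrix dims must be multiples of 16, and the second operand
# must be column-major, which is why b is created transposed
a = torch.randn(16, 32, device="cuda").to(torch.float8_e4m3fn)
b = torch.randn(16, 32, device="cuda").to(torch.float8_e4m3fn).t()
scale_a = torch.tensor(1.0, device="cuda")  # assumed per-tensor scales
scale_b = torch.tensor(1.0, device="cuda")
out = torch._scaled_mm(a, b, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)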
import torch
torch.manual_seed(42)

def torch_sdpa(query, key, value):
    out, lse, cum_seq_q, cum_seq_k, max_q, max_k, philox_seed, philox_offset, debug_attn_mask = (
        torch.ops.aten._scaled_dot_product_cudnn_attention(
            query=query,
            key=key,
            value=value,
            attn_bias=None,  # trailing args assume the recent aten op signature
            compute_log_sumexp=False,
        )
    )
    return out
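A hypothetical invocation with assumed shapes (batch=2, heads=8, seq_len=128, head_dim=64) in fp16 on CUDA, which the cuDNN attention backend expects:

q = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
k = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
v = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
out = torch_sdpa(q, k, v)
print(out.shape)  # torch.Size([2, 8, 128, 64])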