Skip to content

Instantly share code, notes, and snippets.

View stas00's full-sized avatar

Stas Bekman stas00

View GitHub Profile
@stas00
stas00 / gist:060bffc245244532231a7bb29003cd56
Created October 12, 2024 02:08
easy scalable inference benchmarking with aiohttp client (via vllm)
git clone https://github.com/vllm-project/vllm
cd vllm/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
mkdir results
python benchmark_serving.py \
--backend vllm \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name sharegpt \
--dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
--port 9999 \
import os
import asyncio
import subprocess
import time
from typing import List, Dict
import torch
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm
import logging
@stas00
stas00 / README.md
Created September 13, 2024 20:15 — forked from rutcreate/README.md
Install Python 3.10.x on Ubuntu 20.04

Prerequisite

sudo apt update
sudo apt install software-properties-common -y

Add custom APT repository

@stas00
stas00 / static_kv_cache.py
Created March 2, 2024 02:56 — forked from ArthurZucker/static_kv_cache.py
simple static kv cache script
from transformers import AutoModelForCausalLM, AutoTokenizer, StaticCache
import torch
from typing import Optional
device = "cuda"
# Copied from the gpt-fast repo
def multinomial_sample_one_no_sync(probs_sort):
    """Draw one index per row from ``probs_sort`` without a CUDA synchronization.

    Each entry is divided by independent Exponential(1) noise and the argmax
    over the last dimension is taken; the index with the largest ratio is the
    sample, so an index is chosen with probability proportional to its weight.

    Args:
        probs_sort: floating-point tensor of non-negative weights; the last
            dimension is the one sampled over (presumably probabilities that
            sum to 1 -- confirm against the caller).

    Returns:
        A ``torch.int`` tensor of sampled indices with the same leading
        dimensions as ``probs_sort`` and a last dimension of size 1.
    """
    # Fresh Exponential(1) noise with the same shape, dtype and device as the input.
    noise = torch.empty_like(probs_sort).exponential_(1)
    # keepdim=True keeps the sampled axis as size 1 so the result can be
    # concatenated/indexed like a token column.
    sampled = torch.argmax(probs_sort / noise, dim=-1, keepdim=True)
    return sampled.to(dtype=torch.int)
@stas00
stas00 / Mellanox OFED cheat sheet
Created March 1, 2024 02:40 — forked from githubfoam/Mellanox OFED cheat sheet
Mellanox OFED cheat sheet
--------------------------------------------------------------------------
# ofed_info -s
--------------------------------------------------------------------------
Find Mellanox Adapter Type and Firmware/Driver version
ConnectX-4 card
# lspci | grep Mellanox
0a:00.0 Network controller: Mellanox Technologies MT27500 Family [ConnectX-3]
# lspci -vv -s 0a:00.0 | grep "Part number" -A 3
# lspci | grep Mellanox | awk '{print $1}' | xargs -i -r mstvpd {}
@stas00
stas00 / mm_bmm-perf.py
Created February 16, 2024 00:27 — forked from malfet/mm_bmm-perf.py
Measure performance difference of `torch.mm` vs `torch.bmm`
# Benchmark relative performance of torch.mm and torch.bmm with single batch
import torch
import time
def benchmark_fn(fn, args, warmup=5, cycles=300, use_kineto=False) -> float:
if use_kineto:
with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as p:
fn(*args)
return sum([e.cuda_time for e in p.key_averages()])
@stas00
stas00 / mfu_compute.py
Created January 5, 2024 23:28 — forked from Chillee/mfu_compute.py
Compute Flop Utilization in PyTorch
import torch
from torch.utils.flop_counter import FlopCounterMode
from triton.testing import do_bench
def get_flops_achieved(f):
flop_counter = FlopCounterMode(display=False)
with flop_counter:
f()
total_flops = flop_counter.get_total_flops()
ms_per_iter = do_bench(f)
@stas00
stas00 / vram.ipynb
Created December 18, 2023 03:12
memory allocations breakdown
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@stas00
stas00 / calc_transformer_flops.py
Created November 22, 2023 01:16 — forked from Quentin-Anthony/calc_transformer_flops.py
Transformer FLOPs with Dense/MoE
import argparse
import math
# Helper function to pretty-print FLOP counts
def convert_flops(params):
if params == 0:
return "0"
size_name = ("", "KFLOPs", "MFLOPs", "GFLOPs", "TFLOPs", "PFLOPs", "EFLOPs", "ZFLOPs", "YFLOPs")
i = int(math.floor(math.log(params, 1000)))
p = math.pow(1000, i)
@stas00
stas00 / calc_transformer_params.py
Created November 22, 2023 01:15 — forked from Quentin-Anthony/calc_transformer_params.py
Transformer Parameter Count
import argparse
import math
# Helper function to pretty-print parameter counts
def convert_params(params):
if params == 0:
return "0"
size_name = ("", "K", "M", "B", "T", "P", "E", "Z", "Y")
i = int(math.floor(math.log(params, 1000)))
p = math.pow(1000, i)