Stas Bekman (stas00)

@stas00
stas00 / scaled_mm_api.md
Created January 13, 2025 01:05 — forked from drisspg/scaled_mm_api.md
Scaled MM API

Summary

This doc serves as a quick reference for the _scaled_mm API and how it has changed over time across each major version of PyTorch.


NOTE: The leading underscore is intentional, and we currently make no FC/BC (forward/backward compatibility) guarantees on this API. That said, it is currently the only op in the PyTorch library with native support for FP8 matmuls. An official public API is planned; until then this one is subject to change, but you can use this doc as a reference.
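A minimal sketch of an FP8 matmul through this API (assuming the PyTorch 2.4+ signature, where the scales are required and a plain tensor is returned; earlier versions returned an (out, amax) tuple):

import torch

device = "cuda"
# _scaled_mm wants the second operand in column-major layout, hence the .t()
a = torch.randn(16, 32, device=device).to(torch.float8_e4m3fn)
b = torch.randn(64, 32, device=device).to(torch.float8_e4m3fn).t()
scale_a = torch.tensor(1.0, device=device)  # per-tensor fp32 scales
scale_b = torch.tensor(1.0, device=device)
out = torch._scaled_mm(a, b, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
print(out.shape)  # torch.Size([16, 64])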


@stas00
stas00 / gist:060bffc245244532231a7bb29003cd56
Created October 12, 2024 02:08
easy scalable inference benchmarking with aiohttp client (via vllm)
git clone https://github.com/vllm-project/vllm
cd vllm/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
mkdir results
python benchmark_serving.py \
--backend vllm \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name sharegpt \
--dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
--port 9999
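The benchmark client expects an OpenAI-compatible server already listening on that port. A plausible way to start one (assuming a recent vLLM with the vllm CLI; older versions use python -m vllm.entrypoints.openai.api_server):

vllm serve meta-llama/Meta-Llama-3-8B-Instruct --port 9999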
# Second file in the gist: an asyncio client-side benchmark (preview shows imports only)
import os
import asyncio
import subprocess
import time
from typing import List, Dict
import torch
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm
import logging
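A hypothetical minimal request coroutine in the spirit these imports suggest (the base URL, port, and model name are assumptions, not the gist's actual code):

client = AsyncOpenAI(base_url="http://localhost:9999/v1", api_key="EMPTY")

async def one_request(prompt: str) -> str:
    resp = await client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        messages=[{"role": "user", "content": prompt}],
    )
    return resp.choices[0].message.content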
@stas00
stas00 / README.md
Created September 13, 2024 20:15 — forked from rutcreate/README.md
Install Python 3.10.x on Ubuntu 20.04

Prerequisite

sudo apt update
sudo apt install software-properties-common -y

Add custom APT repository
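The preview cuts off here; a likely continuation (the deadsnakes PPA is an assumption inferred from the gist title, not shown in the preview):

sudo add-apt-repository ppa:deadsnakes/ppa -y
sudo apt update
sudo apt install python3.10 python3.10-venv python3.10-dev -y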

@stas00
stas00 / static_kv_cache.py
Created March 2, 2024 02:56 — forked from ArthurZucker/static_kv_cache.py
simple static kv cache script
from transformers import AutoModelForCausalLM, AutoTokenizer, StaticCache
import torch
from typing import Optional
device = "cuda"
# Copied from the gpt-fast repo
def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization
    q = torch.empty_like(probs_sort).exponential_(1)
    return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
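A sketch of how the static kv cache is typically enabled (assuming transformers >= 4.38, where StaticCache landed; the model name is a placeholder):

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
model.generation_config.cache_implementation = "static"  # fixed-size kv cache
# with shapes now static, the decode step can be compiled without recompiles
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)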
@stas00
stas00 / Mellanox OFED cheat sheet
Created March 1, 2024 02:40 — forked from githubfoam/Mellanox OFED cheat sheet
Mellanox OFED cheat sheet
--------------------------------------------------------------------------
# ofed_info -s
--------------------------------------------------------------------------
Find Mellanox Adapter Type and Firmware/Driver version
ConnectX-4 card
# lspci | grep Mellanox
0a:00.0 Network controller: Mellanox Technologies MT27500 Family [ConnectX-3]
# lspci -vv -s 0a:00.0 | grep "Part number" -A 3
# lspci | grep Mellanox | awk '{print $1}' | xargs -i -r mstvpd {}
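Another way to read firmware and adapter type (assuming the InfiniBand userspace tools are installed):

# ibstat | grep -i -e 'ca type' -e firmware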
@stas00
stas00 / mm_bmm-perf.py
Created February 16, 2024 00:27 — forked from malfet/mm_bmm-perf.py
Measure performance difference of `torch.mm` vs `torch.bmm`
# Benchmark relative performance of torch.mm and torch.bmm with single batch
import torch
import time
def benchmark_fn(fn, args, warmup=5, cycles=300, use_kineto=False) -> float:
    if use_kineto:
        with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as p:
            fn(*args)
        return sum([e.cuda_time for e in p.key_averages()])
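Hypothetical usage of the (truncated) helper above, via the kineto path the preview shows, to compare a single matmul against a batch-of-one bmm:

x = torch.rand(1024, 1024, device="cuda")
y = torch.rand(1024, 1024, device="cuda")
print("mm :", benchmark_fn(torch.mm, (x, y), use_kineto=True))
print("bmm:", benchmark_fn(torch.bmm, (x.unsqueeze(0), y.unsqueeze(0)), use_kineto=True))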
@stas00
stas00 / mfu_compute.py
Created January 5, 2024 23:28 — forked from Chillee/mfu_compute.py
Compute Flop Utilization in PyTorch
import torch
from torch.utils.flop_counter import FlopCounterMode
from triton.testing import do_bench
def get_flops_achieved(f):
    flop_counter = FlopCounterMode(display=False)
    with flop_counter:
        f()
    total_flops = flop_counter.get_total_flops()
    ms_per_iter = do_bench(f)
    # preview truncated here; the obvious final step is to report throughput
    print(f"{total_flops / ms_per_iter / 1e9:.2f} TFLOP/s achieved")
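A hypothetical call, timing a bf16 linear layer (the shapes and dtype are arbitrary):

m = torch.nn.Linear(4096, 4096, device="cuda", dtype=torch.bfloat16)
x = torch.randn(64, 4096, device="cuda", dtype=torch.bfloat16)
get_flops_achieved(lambda: m(x))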
@stas00
stas00 / vram.ipynb
Created December 18, 2023 03:12
memory allocations breakdown
@stas00
stas00 / calc_transformer_flops.py
Created November 22, 2023 01:16 — forked from Quentin-Anthony/calc_transformer_flops.py
Transformer FLOPs with Dense/MoE
import argparse
import math
# Helper function to pretty-print message sizes
def convert_flops(params):
    if params == 0:
        return "0"
    size_name = ("", "KFLOPs", "MFLOPs", "GFLOPs", "TFLOPs", "PFLOPs", "EFLOPs", "ZFLOPs", "YFLOPs")
    i = int(math.floor(math.log(params, 1000)))
    p = math.pow(1000, i)
    # preview truncated here; the conventional final step:
    s = round(params / p, 2)
    return f"{s} {size_name[i]}"
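A quick sanity check of the helper, using the common 6*N*D rule of thumb for dense-model training FLOPs (the rule is an assumption here, not necessarily the exact formula this script implements):

n_params = 70e9   # model parameters
n_tokens = 2e12   # training tokens
print(convert_flops(6 * n_params * n_tokens))  # -> 840.0 ZFLOPs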