sudo apt update
sudo apt install software-properties-common -y
import argparse
import math

# Helper function to pretty-print FLOP counts
def convert_flops(params):
    if params == 0:
        return "0"
    size_name = ("", "KFLOPs", "MFLOPs", "GFLOPs", "TFLOPs", "PFLOPs", "EFLOPs", "ZFLOPs", "YFLOPs")
    i = int(math.floor(math.log(params, 1000)))
    p = math.pow(1000, i)
    s = round(params / p, 2)
    return f"{s} {size_name[i]}"
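For example, a quick sanity check of the helper (the FLOP values below are illustrative):

print(convert_flops(989e12))  # -> 989.0 TFLOPs
print(convert_flops(1.5e15))  # -> 1.5 PFLOPs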
import torch
from torch.utils.flop_counter import FlopCounterMode
from triton.testing import do_bench

def get_flops_achieved(f):
    flop_counter = FlopCounterMode(display=False)
    with flop_counter:
        f()
    total_flops = flop_counter.get_total_flops()
    ms_per_iter = do_bench(f)
    iters_per_second = 1e3 / ms_per_iter
    print(f"{iters_per_second * total_flops / 1e12:.2f} TFLOP/s achieved")
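A hedged usage sketch: the shapes and dtype below are illustrative, and do_bench assumes a CUDA device is available.

a = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
b = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
get_flops_achieved(lambda: torch.mm(a, b))  # prints achieved TFLOP/s for a bf16 matmul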
# Benchmark relative performance of torch.mm and torch.bmm with single batch
import torch
import time
def benchmark_fn(fn, args, warmup=5, cycles=300, use_kineto=False) -> float:
    if use_kineto:
        with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as p:
            fn(*args)
        return sum([e.cuda_time for e in p.key_averages()])
    for _ in range(warmup):  # warm up kernels before timing
        fn(*args)
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(cycles):
        fn(*args)
    torch.cuda.synchronize()
    return (time.perf_counter() - start) / cycles * 1e6  # average microseconds per call
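A hedged usage sketch comparing the two ops; the matrix sizes are illustrative, not from the original benchmark.

a = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
b = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
mm_us = benchmark_fn(torch.mm, (a, b))
bmm_us = benchmark_fn(torch.bmm, (a.unsqueeze(0), b.unsqueeze(0)))  # same matmul expressed as a single-batch bmm
print(f"torch.mm: {mm_us:.1f} us/iter, torch.bmm (batch=1): {bmm_us:.1f} us/iter")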
--------------------------------------------------------------------------
# ofed_info -s
--------------------------------------------------------------------------
Find Mellanox Adapter Type and Firmware/Driver version
ConnectX-3 card
# lspci | grep Mellanox
0a:00.0 Network controller: Mellanox Technologies MT27500 Family [ConnectX-3]
# lspci -vv -s 0a:00.0 | grep "Part number" -A 3
# lspci | grep Mellanox | awk '{print $1}' | xargs -i -r mstvpd {}
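If the MLNX_OFED tools are not installed, plain ethtool can also report the driver and firmware version for a given interface (the interface name below is just an example):
# ethtool -i eth2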
from transformers import AutoModelForCausalLM, AutoTokenizer, StaticCache
import torch
from typing import Optional

device = "cuda"

# Copied from the gpt-fast repo
def multinomial_sample_one_no_sync(probs_sort):  # Does multinomial sampling without a cuda synchronization
    q = torch.empty_like(probs_sort).exponential_(1)
    return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
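A hedged sketch of how this sampler is typically wired up (the temperature handling and function name here are illustrative, not copied from gpt-fast):

def sample_from_logits(logits, temperature=1.0):
    # probabilities for the last position; clamp temperature to avoid division by zero
    probs = torch.nn.functional.softmax(logits[:, -1] / max(temperature, 1e-5), dim=-1)
    return multinomial_sample_one_no_sync(probs)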
import os
import asyncio
import subprocess
import time
from typing import List, Dict
import torch
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm
import logging
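A hedged sketch of how these imports are typically combined: firing many concurrent requests at an OpenAI-compatible endpoint and timing each one. The base URL, model name, and prompts are placeholders.

async def time_request(client: AsyncOpenAI, prompt: str) -> float:
    start = time.perf_counter()
    await client.chat.completions.create(
        model="placeholder-model",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=128,
    )
    return time.perf_counter() - start

async def run_benchmark(prompts: List[str]) -> List[float]:
    client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    return await tqdm.gather(*[time_request(client, p) for p in prompts])

# latencies = asyncio.run(run_benchmark(["Hello"] * 32))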
This doc serves as a quick reference for the _scaled_mm API and how it has changed over time across major PyTorch versions.
NOTE: The leading underscore is intentional, and we currently make no forward- or backward-compatibility (FC/BC) guarantees on this API. That said, it is currently the only op with native support for FP8 matmuls in the PyTorch library. An official public API is planned; until then _scaled_mm is subject to change, but you can use this doc as a reference.
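A rough sketch of a call, assuming a recent PyTorch (roughly 2.4+) where scale_a/scale_b are required and a single tensor is returned; earlier releases returned an (output, amax) tuple, so check the signature for the version you are on. The shapes and scales below are illustrative.

import torch

device = "cuda"
a = torch.randn(32, 64, device=device).to(torch.float8_e4m3fn)
# _scaled_mm expects the second operand in column-major layout, hence the transpose
b = torch.randn(128, 64, device=device).to(torch.float8_e4m3fn).t()
scale_a = torch.tensor(1.0, device=device)
scale_b = torch.tensor(1.0, device=device)
out = torch._scaled_mm(a, b, scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16)
print(out.shape)  # torch.Size([32, 128])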