Skip to content

Instantly share code, notes, and snippets.

@Chillee
Last active February 6, 2025 10:05
Show Gist options
  • Save Chillee/2ec89696db8b7ed1c24461159e325405 to your computer and use it in GitHub Desktop.
Save Chillee/2ec89696db8b7ed1c24461159e325405 to your computer and use it in GitHub Desktop.
H100 peak matmul FLOPS
# Benchmark setup: measure peak bf16 matmul throughput (H100, per the gist title).
import torch
from triton.testing import do_bench
import torch._inductor.config as config
# Restrict inductor's max-autotune GEMM search to CUTLASS kernels only.
config.max_autotune_gemm_backends = "cutlass"
# All tensors below are created on the GPU without an explicit device= argument.
torch.set_default_device('cuda')
# Operands for a (4224 x 8192) @ (8192 x 2048) bf16 matmul.
# b is allocated (2048 x 8192) then transposed, so it is a non-contiguous view —
# presumably chosen to match the memory layout GEMM kernels expect; confirm.
a = torch.randn(4224, 8192, dtype=torch.bfloat16)
b = torch.randn(2048, 8192, dtype=torch.bfloat16).t()
def get_flops(f):
    """Benchmark callable ``f`` and print its runtime (ms) and achieved TFLOP/s.

    Reads the module-level matmul operands ``a`` (M x K) and ``b`` (K x N) to
    derive the work per call: a dense matmul costs 2 * M * K * N FLOPs
    (one multiply + one add per inner-product term).
    """
    # Paste had lost this body's indentation (IndentationError); restored here.
    # do_bench's warmup/rep arguments are in milliseconds; the long rep window
    # averages out clock/thermal variance.
    ms = do_bench(f, warmup=100, rep=10000)
    print(ms)
    # (1e3 / ms) = calls per second; times FLOPs per call, scaled to teraFLOPs.
    flops_per_call = a.shape[0] * a.shape[1] * b.shape[1] * 2
    print((1e3 / ms) * flops_per_call / 1e12, 'TF')
# Wrap the matmul in a named function and compile it with inductor's
# max-autotune mode (CUDA graphs disabled).
def _mm():
    return torch.mm(a, b)

f = torch.compile(_mm, mode="max-autotune-no-cudagraphs")
# Also set `sudo nvidia-smi boost-slider --vboost 1`, which shifts more power from l2 cache to tensor cores
get_flops(f)  # 780.1689058368037 TF
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment