Skip to content

Instantly share code, notes, and snippets.

@Chillee
Last active February 6, 2025 10:05
Show Gist options
  • Save Chillee/2ec89696db8b7ed1c24461159e325405 to your computer and use it in GitHub Desktop.
Save Chillee/2ec89696db8b7ed1c24461159e325405 to your computer and use it in GitHub Desktop.
H100 peak matmul FLOPS
# Benchmark setup: measure peak bf16 matmul throughput (H100, per the gist title).
import torch
from triton.testing import do_bench
import torch._inductor.config as config
# Restrict inductor's max-autotune GEMM search to CUTLASS kernels only.
config.max_autotune_gemm_backends = "cutlass"
# All tensors below are created on the GPU without an explicit device= argument.
torch.set_default_device('cuda')
# Operands for a (4224 x 8192) @ (8192 x 2048) bf16 matmul.
# b is allocated (2048 x 8192) then transposed, so it is a non-contiguous view —
# presumably chosen to match the memory layout GEMM kernels expect; confirm.
a = torch.randn(4224, 8192, dtype=torch.bfloat16)
b = torch.randn(2048, 8192, dtype=torch.bfloat16).t()
def get_flops(f):
    """Benchmark callable ``f`` and print its runtime (ms) and achieved TFLOP/s.

    Reads the module-level matmul operands ``a`` (M x K) and ``b`` (K x N) to
    derive the work per call: a dense matmul costs 2 * M * K * N FLOPs
    (one multiply + one add per inner-product term).
    """
    # Paste had lost this body's indentation (IndentationError); restored here.
    # do_bench's warmup/rep arguments are in milliseconds; the long rep window
    # averages out clock/thermal variance.
    ms = do_bench(f, warmup=100, rep=10000)
    print(ms)
    # (1e3 / ms) = calls per second; times FLOPs per call, scaled to teraFLOPs.
    flops_per_call = a.shape[0] * a.shape[1] * b.shape[1] * 2
    print((1e3 / ms) * flops_per_call / 1e12, 'TF')
# Wrap the matmul in a named function and compile it with inductor's
# max-autotune mode (CUDA graphs disabled).
def _mm():
    return torch.mm(a, b)

f = torch.compile(_mm, mode="max-autotune-no-cudagraphs")
# Also set `sudo nvidia-smi boost-slider --vboost 1`, which shifts more power from l2 cache to tensor cores
get_flops(f)  # 780.1689058368037 TF
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment