@cloneofsimo
Created October 6, 2024 10:59
wtf man
import time

import torch

# TF32 matmuls are allowed throughout; the bf16 reduced-precision
# reduction flag is what the benchmark toggles per run.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True


@torch.no_grad()
def benchmark_gemm(m, k, n, dtype=torch.bfloat16, allow_bf16_reduce=True):
    # Toggle whether cuBLAS may accumulate bf16 GEMMs in reduced precision
    # instead of fp32.
    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = allow_bf16_reduce
    A = torch.randn(m, k, device='cuda', dtype=dtype)
    B = torch.randn(k, n, device='cuda', dtype=dtype)
    # Warmup: trigger kernel selection and lazy initialization before timing.
    torch.cuda.synchronize()
    for _ in range(10):
        torch.matmul(A, B)
    torch.cuda.synchronize()
    # Timed region: 200 iterations, averaged.
    start_time = time.time()
    for _ in range(200):
        torch.matmul(A, B)
    torch.cuda.synchronize()
    elapsed_time = (time.time() - start_time) / 200
    return elapsed_time * 1e6  # microseconds per matmul


test_cases = [
    (4096, 4096, 4096),
    (4096, 5120, 4096),
    (4096, 6144, 4096),
    (4096, 8192, 4096),
    (4096, 10240, 4096),
    (4096, 16384, 4096),
]

results = []
for allow_bf16_reduce in [True, False]:
    result = []
    for m, k, n in test_cases:
        time_taken = benchmark_gemm(m, k, n, allow_bf16_reduce=allow_bf16_reduce)
        result.append((m, k, n, time_taken))
    results.append(result)

for i, allow_bf16_reduce in enumerate([True, False]):
    print(f"Results with allow_bf16_reduced_precision_reduction = {allow_bf16_reduce}")
    for m, k, n, time_taken in results[i]:
        print(f"({m}, {k}, {n}) - {time_taken:.2f} µs")
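The flag being toggled controls whether bf16 matmuls may use reduced-precision accumulation rather than fp32 accumulation, so speed is only half the picture. Below is a minimal sketch (not part of the original gist) that compares the numerical error of both settings against an fp32 reference matmul; the shapes reuse one of the test cases above, and the error threshold printed is purely illustrative.

import torch

m, k, n = 4096, 16384, 4096
A = torch.randn(m, k, device='cuda', dtype=torch.bfloat16)
B = torch.randn(k, n, device='cuda', dtype=torch.bfloat16)
# fp32 reference: upcast inputs and matmul in full precision.
ref = A.float() @ B.float()

for allow in (True, False):
    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = allow
    C = torch.matmul(A, B)
    max_err = (C.float() - ref).abs().max().item()
    print(f"allow_bf16_reduced_precision_reduction={allow}: max abs error {max_err:.4f}")

If reduced-precision reduction is actually being exercised on your hardware, the allow=True run would typically show a larger max error; if both runs print identical errors, cuBLAS likely picked the same kernel regardless of the flag, which would also explain identical timings in the benchmark above.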