@cloneofsimo
Created October 6, 2024 10:59
wtf man
import time

import torch

# TF32 matmuls are allowed throughout; the bf16 reduced-precision
# reduction flag is what the benchmark toggles per run.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True


@torch.no_grad()
def benchmark_gemm(m, k, n, dtype=torch.bfloat16, allow_bf16_reduce=True):
    # Toggle whether cuBLAS may accumulate bf16 GEMMs in reduced precision
    # instead of fp32.
    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = allow_bf16_reduce
    A = torch.randn(m, k, device='cuda', dtype=dtype)
    B = torch.randn(k, n, device='cuda', dtype=dtype)
    # Warmup: trigger kernel selection and lazy initialization before timing.
    torch.cuda.synchronize()
    for _ in range(10):
        torch.matmul(A, B)
    torch.cuda.synchronize()
    # Timed region: 200 iterations, averaged.
    start_time = time.time()
    for _ in range(200):
        torch.matmul(A, B)
    torch.cuda.synchronize()
    elapsed_time = (time.time() - start_time) / 200
    return elapsed_time * 1e6  # microseconds per matmul


test_cases = [
    (4096, 4096, 4096),
    (4096, 5120, 4096),
    (4096, 6144, 4096),
    (4096, 8192, 4096),
    (4096, 10240, 4096),
    (4096, 16384, 4096),
]

results = []
for allow_bf16_reduce in [True, False]:
    result = []
    for m, k, n in test_cases:
        time_taken = benchmark_gemm(m, k, n, allow_bf16_reduce=allow_bf16_reduce)
        result.append((m, k, n, time_taken))
    results.append(result)

for i, allow_bf16_reduce in enumerate([True, False]):
    print(f"Results with allow_bf16_reduced_precision_reduction = {allow_bf16_reduce}")
    for m, k, n, time_taken in results[i]:
        print(f"({m}, {k}, {n}) - {time_taken:.2f} µs")
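The flag being toggled controls whether bf16 matmuls may use reduced-precision accumulation rather than fp32 accumulation, so speed is only half the picture. Below is a minimal sketch (not part of the original gist) that compares the numerical error of both settings against an fp32 reference matmul; the shapes reuse one of the test cases above, and the error threshold printed is purely illustrative.

import torch

m, k, n = 4096, 16384, 4096
A = torch.randn(m, k, device='cuda', dtype=torch.bfloat16)
B = torch.randn(k, n, device='cuda', dtype=torch.bfloat16)
# fp32 reference: upcast inputs and matmul in full precision.
ref = A.float() @ B.float()

for allow in (True, False):
    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = allow
    C = torch.matmul(A, B)
    max_err = (C.float() - ref).abs().max().item()
    print(f"allow_bf16_reduced_precision_reduction={allow}: max abs error {max_err:.4f}")

If reduced-precision reduction is actually being exercised on your hardware, the allow=True run would typically show a larger max error; if both runs print identical errors, cuBLAS likely picked the same kernel regardless of the flag, which would also explain identical timings in the benchmark above.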