@chn-lee-yumi
Last active May 27, 2025 04:15
Torch Performance Test
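A small script that benchmarks torch.matmul throughput (reported in milliseconds per matmul and GFLOPS) for each dtype on every backend it can reach: CPU, CUDA, MPS, XPU, and DirectML. Results collected on several machines are kept in the docstring at the end of the file.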
import importlib
import time

import torch


def benchmark_dtype(dtype, device, size, repeat):
    try:
        # Create two square matrices to multiply.
        a = torch.randn((size, size), dtype=dtype, device=device)
        b = torch.randn((size, size), dtype=dtype, device=device)
        if device.type == 'cuda':
            # On CUDA, time with events so we measure device-side execution.
            torch.cuda.synchronize()
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            for _ in range(repeat):
                c = torch.matmul(a, b)
            end.record()
            torch.cuda.synchronize()  # wait for the end event before reading it
            elapsed_time = start.elapsed_time(end) / repeat  # milliseconds
        else:
            if device.type == 'xpu':
                torch.xpu.synchronize()
            # Elsewhere, fall back to wall-clock timing with time.perf_counter().
            start_time = time.perf_counter()
            for _ in range(repeat):
                c = torch.matmul(a, b)
            if device.type == 'xpu':
                torch.xpu.synchronize()
            end_time = time.perf_counter()
            elapsed_time = (end_time - start_time) * 1000 / repeat  # milliseconds
        return elapsed_time
    except Exception:
        return None  # dtype not supported on this device
def test_all_dtypes(device, size=1024, repeat=100):
    dtypes = [
        torch.float16,
        # torch.bfloat16,
        torch.float32,
        # torch.float64,
        # torch.complex64,
        # torch.complex128,
    ]
    results = {}
    for dtype in dtypes:
        time_ms = benchmark_dtype(dtype, device, size, repeat)
        if time_ms is not None:
            # An n x n matmul performs n^3 multiplies and n^3 adds.
            flops = 2 * size * size * size
            gflops = flops / (time_ms / 1000) / 1e9
            results[str(dtype)] = f"{time_ms:.2f} ms, {gflops:.2f} GFlops"
        else:
            results[str(dtype)] = "Not supported"
    return results
device_list = ["cpu", "cuda", "mps", "xpu"]  # candidate devices: cpu, cuda, mps, xpu
if importlib.util.find_spec("torch_directml") is not None:  # add the DirectML device if available
    import torch_directml
    if torch_directml.device_count() > 0:
        device_list.append(torch_directml.device())

for device in device_list:
    # print(f"Testing device: {device}")
    try:
        device = torch.device(device)
        t = torch.tensor([1], device=device)  # probe: raises if the device is unusable
        if device.type == "cpu":
            _ = test_all_dtypes(device, size=512, repeat=2)  # first run is a warm-up
            results = test_all_dtypes(device, size=512, repeat=100)
        else:  # size may need tuning: too small and the device never reaches full throughput
            _ = test_all_dtypes(device, size=2048, repeat=10)  # first run is a warm-up
            results = test_all_dtypes(device, size=2048, repeat=1000)
        print(f"Computation time per matmul on {device}:")
        for dtype, time_result in results.items():
            print(f"{dtype:>16}: {time_result}")
    except (AssertionError, RuntimeError):
        # print(f"Not supported device: {device}")
        continue
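
# Note: on ROCm builds of PyTorch, AMD GPUs are exposed through the 'cuda'
# device type, which is why the MI250X results below are reported "on cuda"
# and take the CUDA event-timing path.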
"""
===== 2018 MacBook Pro 15 (i7-8850H + Radeon Pro 555X) =====
Testing device: cpu
Computation time per matmul on cpu:
torch.float16: 2319.52 ms, 0.93 GFlops
torch.bfloat16: 1845.87 ms, 1.16 GFlops
torch.float32: 5.80 ms, 370.48 GFlops
torch.float64: 11.85 ms, 181.22 GFlops
torch.complex64: 19.99 ms, 107.45 GFlops
torch.complex128: 40.49 ms, 53.03 GFlops
Testing device: mps
Computation time per matmul on mps:
torch.float16: 20.04 ms, 857.37 GFlops
torch.bfloat16: Not supported
torch.float32: 27.42 ms, 626.51 GFlops
torch.float64: Not supported
torch.complex64: Not supported
torch.complex128: Not supported
===== X5650 + 2080Ti =====
Testing device: cpu
Computation time per matmul on cpu:
torch.float16: 9978.52 ms, 0.22 GFlops
torch.bfloat16: 2999.50 ms, 0.72 GFlops
torch.float32: 35.96 ms, 59.71 GFlops
torch.float64: 73.87 ms, 29.07 GFlops
torch.complex64: 136.10 ms, 15.78 GFlops
torch.complex128: 220.86 ms, 9.72 GFlops
Testing device: cuda
Computation time per matmul on cuda:
torch.float16: 0.40 ms, 42776.64 GFlops
torch.bfloat16: 2.91 ms, 5894.89 GFlops
torch.float32: 1.57 ms, 10915.13 GFlops
torch.float64: 49.27 ms, 348.65 GFlops
torch.complex64: 6.06 ms, 2836.23 GFlops
torch.complex128: 188.32 ms, 91.22 GFlops
===== Setonix (AMD EPYC 7A53 + MI250X) =====
Computation time per matmul on cpu:
torch.float16: 2222.40 ms, 0.97 GFlops
torch.bfloat16: 2162.24 ms, 0.99 GFlops
torch.float32: 3.88 ms, 552.93 GFlops
torch.float64: 10.92 ms, 196.70 GFlops
torch.complex64: 11.84 ms, 181.42 GFlops
torch.complex128: 25.14 ms, 85.43 GFlops
Computation time per matmul on cuda:
torch.float16: 1.28 ms, 107018.83 GFlops
torch.bfloat16: 1.23 ms, 111472.19 GFlops
torch.float32: 4.65 ms, 29552.90 GFlops
torch.float64: 5.94 ms, 23139.49 GFlops
torch.complex64: 15.90 ms, 8644.17 GFlops
torch.complex128: 22.84 ms, 6018.62 GFlops
===== E5-2660v2 + K80 =====
Testing device: cpu
Computation time per matmul on cpu:
torch.float16: 939.43 ms, 0.29 GFlops
torch.bfloat16: 325.04 ms, 0.83 GFlops
torch.float32: 3.40 ms, 79.01 GFlops
torch.float64: 4.45 ms, 60.27 GFlops
torch.complex64: 15.32 ms, 17.52 GFlops
torch.complex128: 25.45 ms, 10.55 GFlops
Testing device: cuda
Computation time per matmul on cuda:
torch.float16: 13.34 ms, 1288.07 GFlops
torch.bfloat16: Not supported
torch.float32: 7.30 ms, 2353.09 GFlops
torch.float64: 17.22 ms, 997.54 GFlops
torch.complex64: 28.79 ms, 596.69 GFlops
torch.complex128: 75.09 ms, 228.78 GFlops
===== Google Colab + T4 =====
Computation time per matmul on cpu:
torch.float16: 3980.01 ms, 0.54 GFlops
torch.bfloat16: 87.38 ms, 24.58 GFlops
torch.float32: 29.34 ms, 73.20 GFlops
Computation time per matmul on cuda:
torch.float16: 0.80 ms, 21609.90 GFlops
torch.bfloat16: 6.77 ms, 2536.29 GFlops
torch.float32: 4.46 ms, 3851.52 GFlops
===== i5-14600 + AMD Radeon RX 6300 =====
Computation time per matmul on cpu:
torch.float16: 105.81 ms, 2.54 GFlops
torch.float32: 0.61 ms, 441.64 GFlops
Computation time per matmul on privateuseone:0: (directml, RX 6300)
torch.float16: 11.54 ms, 1488.34 GFlops
torch.float32: 18.34 ms, 936.55 GFlops
Computation time per matmul on privateuseone:1: (directml, UHD 770)
torch.float16: 4.19 ms, 4100.44 GFlops
torch.float32: 86.90 ms, 197.70 GFlops
Computation time per matmul on xpu: (xpu, UHD 770)
torch.float16: 24.10 ms, 712.84 GFlops
torch.float32: 24.53 ms, 700.45 GFlops
"""