Torch Performance Test
import importlib.util
import time

import torch


def benchmark_dtype(dtype, device, size, repeat):
    try:
        # Create two random square matrices on the target device.
        a = torch.randn((size, size), dtype=dtype, device=device)
        b = torch.randn((size, size), dtype=dtype, device=device)
        if device.type == 'cuda':
            # CUDA kernels launch asynchronously, so time them with CUDA events.
            torch.cuda.synchronize()
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            for _ in range(repeat):
                c = torch.matmul(a, b)
            end.record()
            torch.cuda.synchronize()  # wait for the end event before reading it
            elapsed_time = start.elapsed_time(end) / repeat  # milliseconds
        else:
            # XPU and MPS also execute asynchronously; synchronize so the
            # host-side timer measures actual kernel time, not launch time.
            if device.type == 'xpu':
                torch.xpu.synchronize()
            elif device.type == 'mps':
                torch.mps.synchronize()
            # All non-CUDA devices are timed on the host with time.perf_counter().
            start_time = time.perf_counter()
            for _ in range(repeat):
                c = torch.matmul(a, b)
            if device.type == 'xpu':
                torch.xpu.synchronize()
            elif device.type == 'mps':
                torch.mps.synchronize()
            end_time = time.perf_counter()
            elapsed_time = (end_time - start_time) * 1000 / repeat  # milliseconds
        return elapsed_time
    except Exception:
        # Unsupported dtype/device combinations raise; report as unsupported.
        return None
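
# Quick usage sketch (illustrative, not part of the benchmark run below):
# time fp32 1024x1024 matmuls on the CPU for 10 iterations. The call returns
# None when the dtype/device combination is unsupported.
#
#   ms = benchmark_dtype(torch.float32, torch.device("cpu"), size=1024, repeat=10)
#   print(f"{ms:.2f} ms per matmul" if ms is not None else "Not supported")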


def test_all_dtypes(device, size=1024, repeat=100):
    dtypes = [
        torch.float16,
        # torch.bfloat16,
        torch.float32,
        # torch.float64,
        # torch.complex64,
        # torch.complex128,
    ]
    results = {}
    for dtype in dtypes:
        time_ms = benchmark_dtype(dtype, device, size, repeat)
        if time_ms is not None:
            # An n x n matmul performs n^3 multiply-adds, i.e. 2 * n^3 FLOPs.
            flops = 2 * size * size * size
            gflops = flops / (time_ms / 1000) / 1e9
            results[str(dtype)] = f"{time_ms:.2f} ms, {gflops:.2f} GFlops"
        else:
            results[str(dtype)] = "Not supported"
    return results
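
# Worked example of the GFLOPS arithmetic above (illustrative values only):
# for size=1024 and a measured 1.00 ms per matmul,
#   flops  = 2 * 1024**3                  # 2,147,483,648 operations
#   gflops = flops / (1.00 / 1000) / 1e9  # ~2147.48 GFLOPS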
| device_list = ["cpu", "cuda", "mps", "xpu"] # 推理设备,可选cpu、cuda、mps、xpu | |
| if importlib.util.find_spec("torch_directml") is not None: # 如果支持DirectML,则加入DirectML设备 | |
| import torch_directml | |
| if torch_directml.device_count() > 0: | |
| device_list.append(torch_directml.device()) | |
| for device in device_list: | |
| # print(f"Testing device: {device}") | |
| try: | |
| device = torch.device(device) | |
| t = torch.tensor([1], device=device) | |
| if device.type == "cpu": | |
| _ = test_all_dtypes(device, size=512, repeat=2) # 第一次是预热 | |
| results = test_all_dtypes(device, size=512, repeat=100) | |
| else: # size 可能需要调整到合适的大小,太小测不出满血性能 | |
| _ = test_all_dtypes(device, size=2048, repeat=10) # 第一次是预热 | |
| results = test_all_dtypes(device, size=2048, repeat=1000) | |
| print(f"Computation time per matmul on {device}:") | |
| for dtype, time_result in results.items(): | |
| print(f"{dtype:>16}: {time_result}") | |
| except (AssertionError, RuntimeError): | |
| # print(f"Not supported device: {device}") | |
| continue | |
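
# Optional sketch (not part of the original gist): compare a measured rate
# against the device's theoretical peak. `peak_gflops` is a user-supplied
# assumption for the specific hardware; PyTorch does not report it.
def efficiency(measured_gflops: float, peak_gflops: float) -> float:
    """Return the fraction of theoretical peak throughput achieved."""
    return measured_gflops / peak_gflops

# Usage (values are assumptions for illustration; the measured figure is the
# 2080 Ti fp16 result from the log below, the peak an assumed ~107 TFLOPS):
#   frac = efficiency(measured_gflops=42776.64, peak_gflops=107000.0)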
| """ | |
| ===== 2018 MacBook Pro 15 (i7-8850H + Radeon Pro 555X) ===== | |
| Testing device: cpu | |
| Computation time per matmul on cpu: | |
| torch.float16: 2319.52 ms, 0.93 GFlops | |
| torch.bfloat16: 1845.87 ms, 1.16 GFlops | |
| torch.float32: 5.80 ms, 370.48 GFlops | |
| torch.float64: 11.85 ms, 181.22 GFlops | |
| torch.complex64: 19.99 ms, 107.45 GFlops | |
| torch.complex128: 40.49 ms, 53.03 GFlops | |
| Testing device: mps | |
| Computation time per matmul on mps: | |
| torch.float16: 20.04 ms, 857.37 GFlops | |
| torch.bfloat16: Not supported | |
| torch.float32: 27.42 ms, 626.51 GFlops | |
| torch.float64: Not supported | |
| torch.complex64: Not supported | |
| torch.complex128: Not supported | |
| ===== X5650 + 2080Ti ===== | |
| Testing device: cpu | |
| Computation time per matmul on cpu: | |
| torch.float16: 9978.52 ms, 0.22 GFlops | |
| torch.bfloat16: 2999.50 ms, 0.72 GFlops | |
| torch.float32: 35.96 ms, 59.71 GFlops | |
| torch.float64: 73.87 ms, 29.07 GFlops | |
| torch.complex64: 136.10 ms, 15.78 GFlops | |
| torch.complex128: 220.86 ms, 9.72 GFlops | |
| Testing device: cuda | |
| Computation time per matmul on cuda: | |
| torch.float16: 0.40 ms, 42776.64 GFlops | |
| torch.bfloat16: 2.91 ms, 5894.89 GFlops | |
| torch.float32: 1.57 ms, 10915.13 GFlops | |
| torch.float64: 49.27 ms, 348.65 GFlops | |
| torch.complex64: 6.06 ms, 2836.23 GFlops | |
| torch.complex128: 188.32 ms, 91.22 GFlops | |
| ===== Setonix (AMD EPYC 7A53 + MI250X) ===== | |
| Computation time per matmul on cpu: | |
| torch.float16: 2222.40 ms, 0.97 GFlops | |
| torch.bfloat16: 2162.24 ms, 0.99 GFlops | |
| torch.float32: 3.88 ms, 552.93 GFlops | |
| torch.float64: 10.92 ms, 196.70 GFlops | |
| torch.complex64: 11.84 ms, 181.42 GFlops | |
| torch.complex128: 25.14 ms, 85.43 GFlops | |
| Computation time per matmul on cuda: | |
| torch.float16: 1.28 ms, 107018.83 GFlops | |
| torch.bfloat16: 1.23 ms, 111472.19 GFlops | |
| torch.float32: 4.65 ms, 29552.90 GFlops | |
| torch.float64: 5.94 ms, 23139.49 GFlops | |
| torch.complex64: 15.90 ms, 8644.17 GFlops | |
| torch.complex128: 22.84 ms, 6018.62 GFlops | |
| ===== E5-2660v2 + K80 ===== | |
| Testing device: cpu | |
| Computation time per matmul on cpu: | |
| torch.float16: 939.43 ms, 0.29 GFlops | |
| torch.bfloat16: 325.04 ms, 0.83 GFlops | |
| torch.float32: 3.40 ms, 79.01 GFlops | |
| torch.float64: 4.45 ms, 60.27 GFlops | |
| torch.complex64: 15.32 ms, 17.52 GFlops | |
| torch.complex128: 25.45 ms, 10.55 GFlops | |
| Testing device: cuda | |
| Computation time per matmul on cuda: | |
| torch.float16: 13.34 ms, 1288.07 GFlops | |
| torch.bfloat16: Not supported | |
| torch.float32: 7.30 ms, 2353.09 GFlops | |
| torch.float64: 17.22 ms, 997.54 GFlops | |
| torch.complex64: 28.79 ms, 596.69 GFlops | |
| torch.complex128: 75.09 ms, 228.78 GFlops | |
| ===== Google Colab + T4 ===== | |
| Computation time per matmul on cpu: | |
| torch.float16: 3980.01 ms, 0.54 GFlops | |
| torch.bfloat16: 87.38 ms, 24.58 GFlops | |
| torch.float32: 29.34 ms, 73.20 GFlops | |
| Computation time per matmul on cuda: | |
| torch.float16: 0.80 ms, 21609.90 GFlops | |
| torch.bfloat16: 6.77 ms, 2536.29 GFlops | |
| torch.float32: 4.46 ms, 3851.52 GFlops | |
| ===== i5-14600 + AMD Radeon RX 6300 ===== | |
| Computation time per matmul on cpu: | |
| torch.float16: 105.81 ms, 2.54 GFlops | |
| torch.float32: 0.61 ms, 441.64 GFlops | |
| Computation time per matmul on privateuseone:0: (directml, RX 6300) | |
| torch.float16: 11.54 ms, 1488.34 GFlops | |
| torch.float32: 18.34 ms, 936.55 GFlops | |
| Computation time per matmul on privateuseone:1: (directml, UHD 770) | |
| torch.float16: 4.19 ms, 4100.44 GFlops | |
| torch.float32: 86.90 ms, 197.70 GFlops | |
| Computation time per matmul on xpu: (xpu, UHD 770) | |
| torch.float16: 24.10 ms, 712.84 GFlops | |
| torch.float32: 24.53 ms, 700.45 GFlops | |
| """ |