@chn-lee-yumi
Last active May 27, 2025 04:15
Torch Performance Test
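A small script that benchmarks torch.matmul throughput (reported in milliseconds per matmul and GFLOPS) for each dtype on every backend it can reach: CPU, CUDA, MPS, XPU, and DirectML. Results collected on several machines are kept in the docstring at the end of the file.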
import importlib
import time

import torch


def benchmark_dtype(dtype, device, size, repeat):
    try:
        # Create two square matrices to multiply.
        a = torch.randn((size, size), dtype=dtype, device=device)
        b = torch.randn((size, size), dtype=dtype, device=device)
        if device.type == 'cuda':
            # On CUDA, time with events so we measure device-side execution.
            torch.cuda.synchronize()
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            for _ in range(repeat):
                c = torch.matmul(a, b)
            end.record()
            torch.cuda.synchronize()  # wait for the end event before reading it
            elapsed_time = start.elapsed_time(end) / repeat  # milliseconds
        else:
            if device.type == 'xpu':
                torch.xpu.synchronize()
            # Elsewhere, fall back to wall-clock timing with time.perf_counter().
            start_time = time.perf_counter()
            for _ in range(repeat):
                c = torch.matmul(a, b)
            if device.type == 'xpu':
                torch.xpu.synchronize()
            end_time = time.perf_counter()
            elapsed_time = (end_time - start_time) * 1000 / repeat  # milliseconds
        return elapsed_time
    except Exception:
        return None  # dtype not supported on this device
def test_all_dtypes(device, size=1024, repeat=100):
    dtypes = [
        torch.float16,
        # torch.bfloat16,
        torch.float32,
        # torch.float64,
        # torch.complex64,
        # torch.complex128,
    ]
    results = {}
    for dtype in dtypes:
        time_ms = benchmark_dtype(dtype, device, size, repeat)
        if time_ms is not None:
            # An n x n matmul performs n^3 multiplies and n^3 adds.
            flops = 2 * size * size * size
            gflops = flops / (time_ms / 1000) / 1e9
            results[str(dtype)] = f"{time_ms:.2f} ms, {gflops:.2f} GFlops"
        else:
            results[str(dtype)] = "Not supported"
    return results
device_list = ["cpu", "cuda", "mps", "xpu"]  # candidate devices: cpu, cuda, mps, xpu
if importlib.util.find_spec("torch_directml") is not None:  # add the DirectML device if available
    import torch_directml
    if torch_directml.device_count() > 0:
        device_list.append(torch_directml.device())

for device in device_list:
    # print(f"Testing device: {device}")
    try:
        device = torch.device(device)
        t = torch.tensor([1], device=device)  # probe: raises if the device is unusable
        if device.type == "cpu":
            _ = test_all_dtypes(device, size=512, repeat=2)  # first run is a warm-up
            results = test_all_dtypes(device, size=512, repeat=100)
        else:  # size may need tuning: too small and the device never reaches full throughput
            _ = test_all_dtypes(device, size=2048, repeat=10)  # first run is a warm-up
            results = test_all_dtypes(device, size=2048, repeat=1000)
        print(f"Computation time per matmul on {device}:")
        for dtype, time_result in results.items():
            print(f"{dtype:>16}: {time_result}")
    except (AssertionError, RuntimeError):
        # print(f"Not supported device: {device}")
        continue
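
# Note: on ROCm builds of PyTorch, AMD GPUs are exposed through the 'cuda'
# device type, which is why the MI250X results below are reported "on cuda"
# and take the CUDA event-timing path.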
"""
===== 2018 MacBook Pro 15 (i7-8850H + Radeon Pro 555X) =====
Testing device: cpu
Computation time per matmul on cpu:
torch.float16: 2319.52 ms, 0.93 GFlops
torch.bfloat16: 1845.87 ms, 1.16 GFlops
torch.float32: 5.80 ms, 370.48 GFlops
torch.float64: 11.85 ms, 181.22 GFlops
torch.complex64: 19.99 ms, 107.45 GFlops
torch.complex128: 40.49 ms, 53.03 GFlops
Testing device: mps
Computation time per matmul on mps:
torch.float16: 20.04 ms, 857.37 GFlops
torch.bfloat16: Not supported
torch.float32: 27.42 ms, 626.51 GFlops
torch.float64: Not supported
torch.complex64: Not supported
torch.complex128: Not supported
===== X5650 + 2080Ti =====
Testing device: cpu
Computation time per matmul on cpu:
torch.float16: 9978.52 ms, 0.22 GFlops
torch.bfloat16: 2999.50 ms, 0.72 GFlops
torch.float32: 35.96 ms, 59.71 GFlops
torch.float64: 73.87 ms, 29.07 GFlops
torch.complex64: 136.10 ms, 15.78 GFlops
torch.complex128: 220.86 ms, 9.72 GFlops
Testing device: cuda
Computation time per matmul on cuda:
torch.float16: 0.40 ms, 42776.64 GFlops
torch.bfloat16: 2.91 ms, 5894.89 GFlops
torch.float32: 1.57 ms, 10915.13 GFlops
torch.float64: 49.27 ms, 348.65 GFlops
torch.complex64: 6.06 ms, 2836.23 GFlops
torch.complex128: 188.32 ms, 91.22 GFlops
===== Setonix (AMD EPYC 7A53 + MI250X) =====
Computation time per matmul on cpu:
torch.float16: 2222.40 ms, 0.97 GFlops
torch.bfloat16: 2162.24 ms, 0.99 GFlops
torch.float32: 3.88 ms, 552.93 GFlops
torch.float64: 10.92 ms, 196.70 GFlops
torch.complex64: 11.84 ms, 181.42 GFlops
torch.complex128: 25.14 ms, 85.43 GFlops
Computation time per matmul on cuda:
torch.float16: 1.28 ms, 107018.83 GFlops
torch.bfloat16: 1.23 ms, 111472.19 GFlops
torch.float32: 4.65 ms, 29552.90 GFlops
torch.float64: 5.94 ms, 23139.49 GFlops
torch.complex64: 15.90 ms, 8644.17 GFlops
torch.complex128: 22.84 ms, 6018.62 GFlops
===== E5-2660v2 + K80 =====
Testing device: cpu
Computation time per matmul on cpu:
torch.float16: 939.43 ms, 0.29 GFlops
torch.bfloat16: 325.04 ms, 0.83 GFlops
torch.float32: 3.40 ms, 79.01 GFlops
torch.float64: 4.45 ms, 60.27 GFlops
torch.complex64: 15.32 ms, 17.52 GFlops
torch.complex128: 25.45 ms, 10.55 GFlops
Testing device: cuda
Computation time per matmul on cuda:
torch.float16: 13.34 ms, 1288.07 GFlops
torch.bfloat16: Not supported
torch.float32: 7.30 ms, 2353.09 GFlops
torch.float64: 17.22 ms, 997.54 GFlops
torch.complex64: 28.79 ms, 596.69 GFlops
torch.complex128: 75.09 ms, 228.78 GFlops
===== Google Colab + T4 =====
Computation time per matmul on cpu:
torch.float16: 3980.01 ms, 0.54 GFlops
torch.bfloat16: 87.38 ms, 24.58 GFlops
torch.float32: 29.34 ms, 73.20 GFlops
Computation time per matmul on cuda:
torch.float16: 0.80 ms, 21609.90 GFlops
torch.bfloat16: 6.77 ms, 2536.29 GFlops
torch.float32: 4.46 ms, 3851.52 GFlops
===== i5-14600 + AMD Radeon RX 6300 =====
Computation time per matmul on cpu:
torch.float16: 105.81 ms, 2.54 GFlops
torch.float32: 0.61 ms, 441.64 GFlops
Computation time per matmul on privateuseone:0: (directml, RX 6300)
torch.float16: 11.54 ms, 1488.34 GFlops
torch.float32: 18.34 ms, 936.55 GFlops
Computation time per matmul on privateuseone:1: (directml, UHD 770)
torch.float16: 4.19 ms, 4100.44 GFlops
torch.float32: 86.90 ms, 197.70 GFlops
Computation time per matmul on xpu: (xpu, UHD 770)
torch.float16: 24.10 ms, 712.84 GFlops
torch.float32: 24.53 ms, 700.45 GFlops
"""