ptrblck · August 7, 2019 10:45
diff --git a/FP16_perf_check b/FP16_perf_check
 import time

 import torch

 # Kept from the original script.
 torch.backends.cudnn.benchmark = True

 # (label, I, J, K) for each benchmark: A has shape (I, J), B has shape (J, K).
 # The "1" cases use dimensions that are multiples of 8, the "2" cases use
 # 63/1023, which are not; the "b" variants use a single-row A (I = 1).
 # NOTE(review): presumably this contrasts Tensor Core friendly shapes with
 # misaligned ones -- confirm against the surrounding discussion.
 CASES = [
     ('1a', 64, 1024, 1024),
     ('1b', 1, 1024, 1024),
     ('2a', 63, 1023, 1023),
     ('2b', 1, 1023, 1023),
 ]


 def time_matmul(I, J, K, nb_warmup=50, nb_iters=1000):
     """Return the mean time in microseconds of one FP16 ``torch.matmul``.

     Multiplies a random (I, J) matrix by a random (J, K) matrix, both
     created on the CUDA device with dtype ``torch.half``.

     Args:
         I: number of rows of the left operand.
         J: shared inner dimension.
         K: number of columns of the right operand.
         nb_warmup: untimed iterations executed before measuring.
         nb_iters: timed iterations the result is averaged over.

     Returns:
         Mean wall-clock time per iteration, in microseconds.
     """
     A = torch.randn(I, J, device='cuda', dtype=torch.half)
     B = torch.randn(J, K, device='cuda', dtype=torch.half)

     # Warm-up iterations are excluded from the measurement.
     for _ in range(nb_warmup):
         C = torch.matmul(A, B)
     # Wait for the queued GPU work so the timer starts on an idle device.
     torch.cuda.synchronize()

     # perf_counter is the clock intended for measuring short intervals.
     t0 = time.perf_counter()
     for _ in range(nb_iters):
         C = torch.matmul(A, B)
     # Wait for the timed kernels to finish before reading the clock.
     torch.cuda.synchronize()
     t1 = time.perf_counter()

     return (t1 - t0) / nb_iters * 1e6


 for label, I, J, K in CASES:
     print('{}) {:.3f}us per iteration'.format(label, time_matmul(I, J, K)))
	import time

	import torch

	# Kept from the original script.
	torch.backends.cudnn.benchmark = True

	# (label, I, J, K) for each benchmark: A has shape (I, J), B has shape (J, K).
	# The "1" cases use dimensions that are multiples of 8, the "2" cases use
	# 63/1023, which are not; the "b" variants use a single-row A (I = 1).
	# NOTE(review): presumably this contrasts Tensor Core friendly shapes with
	# misaligned ones -- confirm against the surrounding discussion.
	CASES = [
	    ('1a', 64, 1024, 1024),
	    ('1b', 1, 1024, 1024),
	    ('2a', 63, 1023, 1023),
	    ('2b', 1, 1023, 1023),
	]


	def time_matmul(I, J, K, nb_warmup=50, nb_iters=1000):
	    """Return the mean time in microseconds of one FP16 ``torch.matmul``.

	    Multiplies a random (I, J) matrix by a random (J, K) matrix, both
	    created on the CUDA device with dtype ``torch.half``.

	    Args:
	        I: number of rows of the left operand.
	        J: shared inner dimension.
	        K: number of columns of the right operand.
	        nb_warmup: untimed iterations executed before measuring.
	        nb_iters: timed iterations the result is averaged over.

	    Returns:
	        Mean wall-clock time per iteration, in microseconds.
	    """
	    A = torch.randn(I, J, device='cuda', dtype=torch.half)
	    B = torch.randn(J, K, device='cuda', dtype=torch.half)

	    # Warm-up iterations are excluded from the measurement.
	    for _ in range(nb_warmup):
	        C = torch.matmul(A, B)
	    # Wait for the queued GPU work so the timer starts on an idle device.
	    torch.cuda.synchronize()

	    # perf_counter is the clock intended for measuring short intervals.
	    t0 = time.perf_counter()
	    for _ in range(nb_iters):
	        C = torch.matmul(A, B)
	    # Wait for the timed kernels to finish before reading the clock.
	    torch.cuda.synchronize()
	    t1 = time.perf_counter()

	    return (t1 - t0) / nb_iters * 1e6


	for label, I, J, K in CASES:
	    print('{}) {:.3f}us per iteration'.format(label, time_matmul(I, J, K)))