Multi-GPU GEMM comparison between PyTorch & CuPy

Snippets for multi-GPU GEMM with PyTorch & CuPy

Steps to visualize kernel execution details

  1. Generate .nvprof files

    nvprof -o pytorch.nvprof -f python3 test_pytorch.py
    nvprof -o pytorch_mp.nvprof -f python3 test_pytorch_mp.py
    nvprof -o cupy.nvprof -f python3 test_cupy.py
    nvprof -o cupy_mp.nvprof -f python3 test_cupy_mp.py
  2. Open the NVIDIA Visual Profiler (NVVP) and load the .nvprof files
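
NVTX ranges can make the phase of interest easy to locate in the NVVP timeline. A minimal sketch, not part of the original scripts, assuming the installed PyTorch build exposes torch.cuda.nvtx (CuPy has a similar cupy.cuda.nvtx.RangePush/RangePop pair):

    import torch

    # Hypothetical addition (not in the gist): open a named NVTX range around
    # the phase of interest so it appears as a labeled band in NVVP.
    x = torch.randn(1024, 512).cuda(0)
    torch.cuda.nvtx.range_push('query')      # start of the named range
    y = torch.matmul(x, x.transpose(0, 1))   # kernels launched inside the range
    torch.cuda.nvtx.range_pop()              # end of the named range
    torch.cuda.synchronize()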

Runtime Comparison

                      PyTorch_for   PyTorch_mp   CuPy_for   CuPy_mp
Initialization time   41.74s        42.86s       24.69s     23.27s
Query time            2.94s         3.15s        4.01s      3.92s

Observations

  1. The multi-threaded PyTorch version is slower than the for-loop version.
  2. The multi-threaded CuPy version is faster than the for-loop version.
  3. In PyTorch, the kernels on different GPUs are executed on the default streams.
  4. In CuPy, the kernels on different GPUs are executed on separate streams.
  5. Kernel execution is asynchronous in PyTorch, while in CuPy there are some gaps between kernel executions.

Questions

  1. Are there better ways to utilize multiple GPUs in PyTorch or CuPy?
  2. What is the role of streams in multi-GPU execution? (One experiment is sketched below.)
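
On question 2: streams order work within a device, and one way to experiment with the stream behavior observed above is to issue each device's GEMM on an explicit non-default stream. A minimal sketch, not code from the gist, assuming a CuPy version in which cupy.cuda.Stream works as a context manager:

    import cupy as cp
    import numpy as np

    n_gpu = 2  # hypothetical two-GPU setup
    A_gpu, B_gpu, streams, result = [], [], [], [None] * n_gpu
    for i in range(n_gpu):
        with cp.cuda.Device(i):
            A_gpu.append(cp.asarray(np.random.rand(4096, 512)))
            B_gpu.append(cp.asarray(np.random.rand(512, 256)))
            streams.append(cp.cuda.Stream(non_blocking=True))

    for i in range(n_gpu):
        with cp.cuda.Device(i), streams[i]:   # make stream i current on device i
            result[i] = cp.dot(A_gpu[i], B_gpu[i])

    for i in range(n_gpu):
        with cp.cuda.Device(i):
            streams[i].synchronize()          # wait for device i's GEMM
            print('output-%d:' % i, result[i].get().shape)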

test_cupy.py

#!/usr/local/python-3.6.5/bin/python3
# -*- coding: utf-8 -*-
import cupy as cp
import numpy as np
import time
import os
# Simple Snippet for using Multi-GPU with Cupy
# Implementation: for loop
# Cupy: 4.3.0
# GPU: GeForce GTX 1080 Ti
# Initialization time: 24.69s
# Query time: 4.01s
use_gpu = [4,5,6] # use GPU 4,5,6
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join([str(t) for t in use_gpu])
n_gallery = 3000000
n_probe = 256
feat_dim = 512
n_gpu = len(use_gpu)
A_gpu = [None] * n_gpu
B_gpu = [None] * n_gpu
result = {}
with cp.cuda.profile():
    # initialization
    t1 = time.time()
    for i in range(n_gpu):
        with cp.cuda.Device(i):
            A_gpu[i] = cp.asarray(np.random.rand(n_gallery // n_gpu, feat_dim))  # num_A x feat_dim
            cp.cuda.Stream.null.synchronize()  # wait for the host->device copy
    # query begin
    t2 = time.time()
    for i in range(n_gpu):
        with cp.cuda.Device(i):
            B_gpu[i] = cp.asarray(np.random.rand(feat_dim, n_probe))  # feat_dim x num_B
    for i in range(n_gpu):
        with cp.cuda.Device(i):
            result[i] = cp.dot(A_gpu[i], B_gpu[i])  # GEMM on device i
    for i in range(n_gpu):
        output = result[i]
        if isinstance(output, Exception):
            raise output
        output_cpu = output.get()  # device->host copy; blocks until the GEMM is done
        print('output-%d:' % i, output_cpu.shape)
    t3 = time.time()
    print('initialize gallery: %.2fs' % (t2 - t1))
    print('query time: %.2fs' % (t3 - t2))
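
Much of the 24.69 s initialization goes into generating the gallery on the host and copying it over PCIe. A possible variant, sketched here as an untested assumption rather than measured code, is to generate the gallery directly on each device with cupy.random:

    import cupy as cp

    # Hypothetical variant (not in the gist): build the gallery shard on the
    # GPU itself, skipping np.random.rand and the host->device copy.
    with cp.cuda.Device(0):
        A = cp.random.rand(1000000, 512)   # ~4.1 GB of float64, created on GPU 0
        cp.cuda.Stream.null.synchronize()  # wait for generation to finish
    print('gallery shard:', A.shape)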

test_cupy_mp.py

#!/usr/local/python-3.6.5/bin/python3
# -*- coding: utf-8 -*-
import cupy as cp
import numpy as np
import threading
import time
import os
# Simple Snippet for using Multi-GPU with Cupy
# Implementation: multi-threading
# Cupy: 4.3.0
# GPU: GeForce GTX 1080 Ti
# Initialization time: 23.27s
# Query time: 3.92s
use_gpu = [4,5,6] # use GPU 4,5,6
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join([str(t) for t in use_gpu])
n_gallery = 3000000
n_probe = 256
feat_dim = 512
n_gpu = len(use_gpu)
A_gpu = [None] * n_gpu
B_gpu = [None] * n_gpu
result = {}
lock = threading.Lock()

def _worker(i, A, B):
    try:
        with cp.cuda.Device(i):
            output = cp.dot(A, B)  # GEMM on device i, issued from this thread
        with lock:
            result[i] = output
    except Exception as e:
        with lock:
            result[i] = e

with cp.cuda.profile():
    # initialization
    t1 = time.time()
    for i in range(n_gpu):
        with cp.cuda.Device(i):
            A_gpu[i] = cp.asarray(np.random.rand(n_gallery // n_gpu, feat_dim))  # num_A x feat_dim
            cp.cuda.Stream.null.synchronize()  # wait for the host->device copy
    # query begin
    t2 = time.time()
    for i in range(n_gpu):
        with cp.cuda.Device(i):
            B_gpu[i] = cp.asarray(np.random.rand(feat_dim, n_probe))  # feat_dim x num_B
    threads = [threading.Thread(target=_worker, args=(i, A_gpu[i], B_gpu[i]))
               for i in range(n_gpu)]
    for thread in threads: thread.start()
    for thread in threads: thread.join()
    for i in range(n_gpu):
        output = result[i]
        if isinstance(output, Exception):
            raise output
        output_cpu = output.get()
        print('output-%d:' % i, output_cpu.shape)
    t3 = time.time()
    print('initialize gallery: %.2fs' % (t2 - t1))
    print('query time: %.2fs' % (t3 - t2))

test_pytorch.py

#!/usr/local/python-3.6.5/bin/python3
# -*- coding: utf-8 -*-
# Simple Snippet for using Multi-GPU with PyTorch
# Implementation: for loop
# GPU: GeForce GTX 1080 Ti
# PyTorch: 0.4.0
# Initialization time: 41.74s
# Query time: 2.94s
import torch
import numpy as np
import os
import time
use_gpu = [4,5,6] # use GPU 4,5,6
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join([str(t) for t in use_gpu])
n_gallery = 3000000
n_probe = 256
feat_dim = 512
n_gpu = len(use_gpu)
A_gpu = [None] * n_gpu # gallery data
B_gpu = [None] * n_gpu # query data
result = {}
with torch.cuda.profiler.profile():
    # initialization
    t1 = time.time()
    for i in range(n_gpu):
        A_gpu[i] = torch.tensor(np.random.rand(n_gallery // n_gpu, feat_dim)).cuda(i)  # blocking host->device copy
    torch.cuda.synchronize()
    # query begin
    t2 = time.time()
    for i in range(n_gpu):
        B_gpu[i] = torch.tensor(np.random.rand(n_probe, feat_dim)).cuda(i)
    for i in range(n_gpu):
        result[i] = torch.matmul(A_gpu[i], torch.transpose(B_gpu[i], 0, 1))  # async launch on device i
    for i in range(n_gpu):
        output = result[i]
        if isinstance(output, Exception):
            raise output
        output_cpu = output.cpu()  # device->host copy; blocks until the matmul is done
        print('output-%d:' % i, output_cpu.shape)
    t3 = time.time()
    print('initialize gallery: %.2fs' % (t2 - t1))
    print('query time: %.2fs' % (t3 - t2))
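
A detail that affects these timings: np.random.rand returns float64, so torch.tensor(...) keeps double precision and the GEMMs run in FP64, which GeForce cards execute at a small fraction of their FP32 rate. A float32 variant, sketched as an assumption rather than a measured alternative:

    import numpy as np
    import torch

    # Hypothetical float32 variant (not in the gist): cast on the host before
    # the device copy so the GEMM runs in single precision.
    A = torch.tensor(np.random.rand(4096, 512).astype(np.float32)).cuda(0)
    B = torch.tensor(np.random.rand(256, 512).astype(np.float32)).cuda(0)
    out = torch.matmul(A, torch.transpose(B, 0, 1))
    print(out.dtype, out.shape)  # torch.float32, (4096, 256)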

test_pytorch_mp.py

#!/usr/local/python-3.6.5/bin/python3
# -*- coding: utf-8 -*-
import torch
import numpy as np
import os
import threading
import time
# Simple Snippet for using Multi-GPU with PyTorch
# Implementation: multi-threading
# PyTorch: 0.4.0
# GPU: GeForce GTX 1080 Ti
# Initialization time: 42.86s
# Query time: 3.15s
use_gpu = [4,5,6] # use GPU 4,5,6
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join([str(t) for t in use_gpu])
n_gallery = 3000000
n_probe = 256
feat_dim = 512
n_gpu = len(use_gpu)
A_gpu = [None] * n_gpu
B_gpu = [None] * n_gpu
result = {}
lock = threading.Lock()

def _worker(i, A, B):
    try:
        output = torch.matmul(A, torch.transpose(B, 0, 1))  # runs on A's device
        with lock:
            result[i] = output
    except Exception as e:
        with lock:
            result[i] = e

with torch.cuda.profiler.profile():
    # initialization
    t1 = time.time()
    for i in range(n_gpu):
        A_gpu[i] = torch.tensor(np.random.rand(n_gallery // n_gpu, feat_dim)).cuda(i)
    torch.cuda.synchronize()
    # query begin
    t2 = time.time()
    for i in range(n_gpu):
        B_gpu[i] = torch.tensor(np.random.rand(n_probe, feat_dim)).cuda(i)
    threads = [threading.Thread(target=_worker, args=(i, A_gpu[i], B_gpu[i]))
               for i in range(n_gpu)]
    for thread in threads: thread.start()
    for thread in threads: thread.join()
    for i in range(n_gpu):
        output = result[i]
        if isinstance(output, Exception):
            raise output
        output_cpu = output.cpu()
        print('output-%d:' % i, output_cpu.shape)
    t3 = time.time()
    print('initialize gallery: %.2fs' % (t2 - t1))
    print('query time: %.2fs' % (t3 - t2))
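
Observation 3 noted that PyTorch issues these matmuls on each device's default stream. A minimal sketch of giving each device an explicit stream instead; this is not code from the gist, and it assumes the torch.cuda.Stream / torch.cuda.stream API:

    import torch

    # Hypothetical sketch (not in the gist): one explicit stream per device so
    # the matmuls are issued outside the default streams.
    n_gpu = 2  # hypothetical two-GPU setup
    A = [torch.randn(4096, 512).cuda(i) for i in range(n_gpu)]
    B = [torch.randn(256, 512).cuda(i) for i in range(n_gpu)]
    streams = [torch.cuda.Stream(device=i) for i in range(n_gpu)]
    result = [None] * n_gpu
    for i in range(n_gpu):
        with torch.cuda.device(i), torch.cuda.stream(streams[i]):
            result[i] = torch.matmul(A[i], B[i].transpose(0, 1))
    for i in range(n_gpu):
        streams[i].synchronize()   # wait for device i's matmul
        print('output-%d:' % i, result[i].shape)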