Created
August 7, 2020 11:16
-
-
Save gmarkall/161aa1fe708fe06e43432b6a4f36e7e2 to your computer and use it in GitHub Desktop.
Numba CUDA kernel compile and launch traces
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ NUMBA_CUDA_LOG_LEVEL=DEBUG python repro.py | |
== CUDA [182] DEBUG -- call runtime api: cudaRuntimeGetVersion | |
Define a kernel | |
Copy to device | |
== CUDA [200] INFO -- init | |
== CUDA [200] DEBUG -- call driver api: cuInit | |
== CUDA [200] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [200] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [200] DEBUG -- call driver api: cuDeviceGetCount | |
== CUDA [200] DEBUG -- call driver api: cuDeviceGet | |
== CUDA [200] DEBUG -- call driver api: cuDeviceComputeCapability | |
== CUDA [200] DEBUG -- call driver api: cuDeviceGetName | |
== CUDA [200] DEBUG -- call driver api: cuDevicePrimaryCtxRetain | |
== CUDA [299] DEBUG -- call driver api: cuCtxPushCurrent_v2 | |
== CUDA [299] DEBUG -- call driver api: cuMemGetInfo_v2 | |
== CUDA [299] DEBUG -- call driver api: cuMemAlloc_v2 | |
== CUDA [300] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [300] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [300] DEBUG -- call driver api: cuMemcpyHtoD_v2 | |
== CUDA [300] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [300] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [300] DEBUG -- call driver api: cuMemAlloc_v2 | |
== CUDA [300] DEBUG -- call driver api: cuMemcpyHtoD_v2 | |
== CUDA [300] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [301] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [301] DEBUG -- call driver api: cuMemAlloc_v2 | |
Start first run | |
== CUDA [308] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [308] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [365] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [365] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [365] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [365] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [400] DEBUG -- call driver api: cuLinkCreate_v2 | |
== CUDA [401] DEBUG -- call driver api: cuLinkAddData_v2 | |
== CUDA [401] DEBUG -- call driver api: cuLinkComplete | |
== CUDA [401] DEBUG -- call driver api: cuModuleLoadDataEx | |
== CUDA [402] DEBUG -- call driver api: cuModuleGetFunction | |
== CUDA [402] DEBUG -- call driver api: cuFuncGetAttribute | |
== CUDA [402] DEBUG -- call driver api: cuFuncGetAttribute | |
== CUDA [402] DEBUG -- call driver api: cuFuncGetAttribute | |
== CUDA [402] DEBUG -- call driver api: cuFuncGetAttribute | |
== CUDA [402] DEBUG -- call driver api: cuFuncGetAttribute | |
== CUDA [402] DEBUG -- call driver api: cuLinkDestroy | |
== CUDA [402] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [402] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [402] DEBUG -- call driver api: cuLaunchKernel | |
== CUDA [402] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [402] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [402] DEBUG -- call driver api: cuCtxSynchronize | |
First run: 0.10183905500161927 | |
== CUDA [403] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [403] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [403] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [403] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [403] DEBUG -- call driver api: cuLaunchKernel | |
== CUDA [403] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [403] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [403] DEBUG -- call driver api: cuCtxSynchronize | |
Second run: 0.0004136649949941784 | |
Define another kernel | |
Start first run | |
== CUDA [403] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [403] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [424] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [424] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [424] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [424] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [455] DEBUG -- call driver api: cuLinkCreate_v2 | |
== CUDA [455] DEBUG -- call driver api: cuLinkAddData_v2 | |
== CUDA [455] DEBUG -- call driver api: cuLinkComplete | |
== CUDA [455] DEBUG -- call driver api: cuModuleLoadDataEx | |
== CUDA [455] DEBUG -- call driver api: cuModuleGetFunction | |
== CUDA [455] DEBUG -- call driver api: cuFuncGetAttribute | |
== CUDA [455] DEBUG -- call driver api: cuFuncGetAttribute | |
== CUDA [456] DEBUG -- call driver api: cuFuncGetAttribute | |
== CUDA [456] DEBUG -- call driver api: cuFuncGetAttribute | |
== CUDA [456] DEBUG -- call driver api: cuFuncGetAttribute | |
== CUDA [456] DEBUG -- call driver api: cuLinkDestroy | |
== CUDA [456] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [456] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [456] DEBUG -- call driver api: cuLaunchKernel | |
== CUDA [456] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [456] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [456] DEBUG -- call driver api: cuCtxSynchronize | |
First run: 0.05303482800081838 | |
== CUDA [456] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [456] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [456] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [456] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [456] DEBUG -- call driver api: cuLaunchKernel | |
== CUDA [456] DEBUG -- call driver api: cuCtxGetCurrent | |
== CUDA [456] DEBUG -- call driver api: cuCtxGetDevice | |
== CUDA [456] DEBUG -- call driver api: cuCtxSynchronize | |
Second run: 0.0004158869996899739 | |
== CUDA [456] INFO -- add pending dealloc: module_unload ? bytes | |
== CUDA [456] INFO -- add pending dealloc: module_unload ? bytes | |
== CUDA [457] INFO -- add pending dealloc: cuMemFree_v2 262144 bytes | |
== CUDA [457] INFO -- add pending dealloc: cuMemFree_v2 262144 bytes | |
== CUDA [457] INFO -- add pending dealloc: cuMemFree_v2 262144 bytes |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from numba import cuda | |
import numpy as np | |
import time | |
print("Define a kernel")


@cuda.jit
def add(r, x, y):
    """CUDA kernel: write the element-wise sum ``x + y`` into ``r``."""
    # cuda.grid(1) gives this thread's absolute position in the 1-D launch grid.
    idx = cuda.grid(1)
    # Threads positioned past the end of the output have nothing to do.
    if idx >= len(r):
        return
    r[idx] = x[idx] + y[idx]
# Problem size and host-side inputs.
N = 32768
x = np.ones(N)
y = np.ones(N)

print("Copy to device")
# Stage both inputs on the GPU and allocate an output buffer shaped like x.
d_x = cuda.to_device(x)
d_y = cuda.to_device(y)
d_r = cuda.device_array_like(x)

# Launch configuration: threads per block, and enough blocks to cover all N
# elements. Ceiling division keeps the tail covered when N is not a multiple
# of blockdim; the kernels bounds-check their index, so surplus threads in the
# last block are harmless. For N = 32768 this is the same value as N // blockdim.
blockdim = 128
griddim = (N + blockdim - 1) // blockdim
def run_add():
    """Launch the ``add`` kernel once and return the elapsed seconds.

    The measured interval spans the launch call and the following
    ``cuda.synchronize()``.
    """
    t0 = time.perf_counter()
    add[griddim, blockdim](d_r, d_x, d_y)
    # Block until the device is idle so the timing covers the kernel itself.
    cuda.synchronize()
    return time.perf_counter() - t0
print("Start first run")

# First call to the add kernel; report how long it took.
first_time = run_add()
print(f'First run: {first_time}')

# Same launch again, for comparison with the first call.
second_time = run_add()
print(f'Second run: {second_time}')
print("Define another kernel")


@cuda.jit
def sub(r, x, y):
    """CUDA kernel: write the element-wise difference ``x - y`` into ``r``."""
    # cuda.grid(1) gives this thread's absolute position in the 1-D launch grid.
    idx = cuda.grid(1)
    # Threads positioned past the end of the output have nothing to do.
    if idx >= len(r):
        return
    r[idx] = x[idx] - y[idx]
def run_sub():
    """Launch the ``sub`` kernel once and return the elapsed seconds.

    The measured interval spans the launch call and the following
    ``cuda.synchronize()``.
    """
    t0 = time.perf_counter()
    sub[griddim, blockdim](d_r, d_x, d_y)
    # Block until the device is idle so the timing covers the kernel itself.
    cuda.synchronize()
    return time.perf_counter() - t0
print("Start first run")

# First call to the sub kernel; report how long it took.
first_time = run_sub()
print(f'First run: {first_time}')

# Same launch again, for comparison with the first call.
second_time = run_sub()
print(f'Second run: {second_time}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment