Created
March 31, 2020 19:40
-
-
Save leofang/b466291ea822dcb6eafbf85512315a06 to your computer and use it in GitHub Desktop.
test CUB kernels
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Benchmark CuPy reduction kernels: legacy vs. CUB device-wide vs. CUB block.

For every reduction routine and axis combination, this script times three
code paths (legacy reduction kernel, CUB device-wide reduction where CuPy
supports it, and CUB block reduction), then checks all three GPU results
against a NumPy reference computed on the host.
"""
import sys

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import numpy as np
import cupy as cp
from cupyx.time import repeat

shape = (256, 512, 512)
a = cp.random.random(shape)
a_np = cp.asnumpy(a)  # host copy used as the ground-truth input

# Reductions that have a CUB device-wide implementation in CuPy.
CUB_supported = ['sum', 'prod', 'min', 'max', 'argmin', 'argmax']
# Reductions that only have the legacy and CUB-block paths.
REST = ['amin', 'amax', 'nanmin', 'nanmax', 'nanargmin', 'nanargmax',
        'mean', 'nanmean', 'var', 'nanvar', 'nansum', 'nanprod',
        'all', 'any', 'count_nonzero']

for reduce_func in CUB_supported + REST:
    for axis in [(2,), (1, 2), (0, 1, 2)]:
        print("testing", reduce_func, "with axis = ", axis, '...')
        func = getattr(cp, reduce_func)

        # NumPy reference answer.  The arg* reductions only accept a scalar
        # axis, so no reference exists for multi-axis runs (ans stays None
        # and the comparison below is skipped).
        if reduce_func not in ('argmin', 'argmax', 'nanargmin', 'nanargmax'):
            ans = getattr(np, reduce_func)(a_np, axis)
        elif len(axis) == 1:
            ans = getattr(np, reduce_func)(a_np, axis[0])
        else:
            ans = None

        # --- 1) Legacy (non-CUB) reduction kernel ---
        cp.cuda.cub_enabled = False
        cp.core.cub_block_reduction_enabled = False
        data = repeat(func, (a, axis), n=100)
        # NOTE: _to_str_per_item is a private helper of the repeat() result;
        # it formats the per-call GPU timings.
        results = [data._to_str_per_item('GPU', data.gpu_times)]
        print('{:<10s} (old kernel):{}'.format(reduce_func, ' '.join(results)))
        b = func(a, axis)

        # --- 2) CUB device-wide reduction (supported routines only) ---
        if reduce_func in CUB_supported:
            cp.cuda.cub_enabled = True
            cp.core.cub_block_reduction_enabled = False
            data = repeat(func, (a, axis), n=100)
            results = [data._to_str_per_item('GPU', data.gpu_times)]
            print('{:<10s} (CUB device):{}'.format(
                reduce_func, ' '.join(results)))
            c = func(a, axis)
        else:
            print('{:<10s} (CUB device):{}'.format(
                reduce_func, ' (CUB device-wide reduction not available)'))
            c = None

        # --- 3) CUB block reduction ---
        cp.cuda.cub_enabled = False
        cp.core.cub_block_reduction_enabled = True
        data = repeat(func, (a, axis), n=100)
        results = [data._to_str_per_item('GPU', data.gpu_times)]
        print('{:<10s} (CUB blocks):{}'.format(reduce_func, ' '.join(results)))
        d = func(a, axis)

        try:
            # Restore defaults before comparing.
            cp.cuda.cub_enabled = False
            cp.core.cub_block_reduction_enabled = False
            # Bug fix: the original compared `c` and `d` against `ans` even
            # when no NumPy reference exists (ans is None, the multi-axis
            # arg* case), which would raise a TypeError inside cp.allclose
            # instead of cleanly skipping the check.
            if ans is not None:
                assert cp.allclose(ans, b)
                if c is not None:
                    assert cp.allclose(ans, c)
                assert cp.allclose(ans, d)
        except AssertionError:
            print("Result not match! (function: {}, axis: {})".format(
                reduce_func, axis), file=sys.stderr)
            raise
        finally:
            print()  # blank line between test cases
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
CUDA 10.0 + RTX 2080 Ti: