Created
March 31, 2020 19:40
-
-
Save leofang/b466291ea822dcb6eafbf85512315a06 to your computer and use it in GitHub Desktop.
test CUB kernels
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Benchmark CuPy reduction routines under three kernel configurations.

For every reduction name and every axis tuple, the script times the CuPy
function with

* both CUB flags off                              ("old kernel"),
* ``cp.cuda.cub_enabled`` on                      ("CUB device"; only for
  the names listed in ``CUB_supported``),
* ``cp.core.cub_block_reduction_enabled`` on      ("CUB blocks"),

and then compares each GPU result with the NumPy result computed on a host
copy of the same data.
"""
import sys
import warnings

# Installed before numpy/cupy are imported, as in the original script.
warnings.filterwarnings("ignore", category=FutureWarning)

import numpy as np
import cupy as cp
from cupyx.time import repeat

# NOTE(review): `cupyx.time.repeat(..., n=...)` and
# `cp.core.cub_block_reduction_enabled` are used exactly as the original
# script used them; they may have been renamed in later CuPy releases --
# confirm against the installed version.

shape = (256, 512, 512)
a = cp.random.random(shape)
a_np = cp.asnumpy(a)  # host copy used to compute the reference answers

CUB_supported = ['sum', 'prod', 'min', 'max', 'argmin', 'argmax']
REST = ['amin', 'amax', 'nanmin', 'nanmax', 'nanargmin', 'nanargmax',
        'mean', 'nanmean', 'var', 'nanvar', 'nansum', 'nanprod',
        'all', 'any', 'count_nonzero']

# Reductions for which the NumPy reference is only computed for a single
# axis (passed as an int rather than a tuple).
_ARG_REDUCTIONS = ('argmin', 'argmax', 'nanargmin', 'nanargmax')


def _set_cub_flags(device_wide, block):
    """Set the two CuPy flags that select the reduction implementation."""
    cp.cuda.cub_enabled = device_wide
    cp.core.cub_block_reduction_enabled = block


def _time_and_run(label, reduce_func, func, axis):
    """Time ``func(a, axis)``, print one report line, and return the result.

    Args:
        label: Text placed in parentheses in the report line
            (e.g. ``'old kernel'``).
        reduce_func: Name of the reduction, printed left-aligned.
        func: The CuPy callable to time.
        axis: Axis tuple forwarded to ``func``.

    Returns:
        The value of one additional ``func(a, axis)`` call made after timing,
        so that the caller can check it against the reference.
    """
    data = repeat(func, (a, axis), n=100)
    results = [data._to_str_per_item('GPU', data.gpu_times)]
    print('{:<10s} ({}):{}'.format(reduce_func, label, ' '.join(results)))
    return func(a, axis)


def _require_close(expected, actual):
    """Raise ``AssertionError`` unless ``cp.allclose(expected, actual)``.

    An explicit raise is used instead of an ``assert`` statement so that the
    check is not removed when Python runs with ``-O``.
    """
    if not cp.allclose(expected, actual):
        raise AssertionError


for reduce_func in CUB_supported + REST:
    # Replace the list with [(0, 1, 2)] to benchmark only the full reduction.
    for axis in [(2,), (1, 2), (0, 1, 2)]:
        print("testing", reduce_func, "with axis = ", axis, '...')
        func = getattr(cp, reduce_func)
        np_func = getattr(np, reduce_func)

        # get numpy answer for comparison
        if reduce_func not in _ARG_REDUCTIONS:
            ans = np_func(a_np, axis)
        elif len(axis) == 1:
            ans = np_func(a_np, axis[0])
        else:
            # No NumPy reference is computed for arg-reductions over several
            # axes; the results are timed but not verified.
            ans = None

        # 1) Both CUB flags off.
        _set_cub_flags(False, False)
        b = _time_and_run('old kernel', reduce_func, func, axis)

        # 2) CUB device-wide flag on, only for the names in CUB_supported.
        if reduce_func in CUB_supported:
            _set_cub_flags(True, False)
            c = _time_and_run('CUB device', reduce_func, func, axis)
        else:
            print('{:<10s} (CUB device):{}'.format(
                reduce_func, ' (CUB device-wide reduction not available)'))
            c = None

        # 3) CUB block-reduction flag on.
        _set_cub_flags(False, True)
        d = _time_and_run('CUB blocks', reduce_func, func, axis)

        try:
            # Switch both flags off before running the comparisons.
            _set_cub_flags(False, False)
            # NOTE(review): the original indentation was lost; all three
            # comparisons are guarded by `ans is not None` because comparing
            # against a missing reference cannot succeed -- confirm against
            # the original script.
            if ans is not None:
                _require_close(ans, b)
                if c is not None:
                    _require_close(ans, c)
                _require_close(ans, d)
        except AssertionError:
            print("Result not match! (function: {}, axis: {})".format(
                reduce_func, axis), file=sys.stderr)
            raise
        finally:
            # Blank line separating the report of each (function, axis) pair.
            print()
CUDA 10.0 + RTX 2080 Ti:
testing sum with axis = (2,) ...
sum (old kernel): GPU: 4159.497 us +/-405.757 (min: 3860.896 / max: 5381.728) us
sum (CUB device): GPU: 1142.829 us +/-35.089 (min: 986.304 / max: 1169.760) us
sum (CUB blocks): GPU: 1138.303 us +/-11.385 (min: 1131.520 / max: 1217.984) us
testing sum with axis = (1, 2) ...
sum (old kernel): GPU: 1083.180 us +/-53.013 (min: 938.624 / max: 1113.952) us
sum (CUB device): GPU: 1117.975 us +/-27.751 (min: 957.120 / max: 1132.416) us
sum (CUB blocks): GPU: 1130.267 us +/-27.995 (min: 971.200 / max: 1150.368) us
testing sum with axis = (0, 1, 2) ...
sum (old kernel): GPU:38371.774 us +/-32.665 (min:38208.576 / max:38410.976) us
sum (CUB device): GPU: 1097.675 us +/-27.765 (min: 938.336 / max: 1113.280) us
sum (CUB blocks): GPU: 1167.509 us +/-26.820 (min: 1044.896 / max: 1189.824) us
testing prod with axis = (2,) ...
prod (old kernel): GPU: 4030.226 us +/-18.521 (min: 3873.664 / max: 4079.168) us
prod (CUB device): GPU: 1152.048 us +/- 2.670 (min: 1146.112 / max: 1164.064) us
prod (CUB blocks): GPU: 1122.420 us +/-56.175 (min: 972.736 / max: 1201.024) us
testing prod with axis = (1, 2) ...
prod (old kernel): GPU: 1077.918 us +/-57.911 (min: 937.536 / max: 1113.440) us
prod (CUB device): GPU: 1120.289 us +/-23.133 (min: 958.336 / max: 1138.656) us
prod (CUB blocks): GPU: 1130.728 us +/-27.959 (min: 971.008 / max: 1151.136) us
testing prod with axis = (0, 1, 2) ...
prod (old kernel): GPU:38373.287 us +/-19.281 (min:38206.017 / max:38432.896) us
prod (CUB device): GPU: 1099.334 us +/-22.900 (min: 938.496 / max: 1108.512) us
prod (CUB blocks): GPU: 1167.291 us +/-21.194 (min: 1040.672 / max: 1183.616) us
testing min with axis = (2,) ...
min (old kernel): GPU: 4084.754 us +/-34.300 (min: 3910.240 / max: 4121.600) us
min (CUB device): GPU: 2366.779 us +/-28.071 (min: 2207.456 / max: 2381.120) us
min (CUB blocks): GPU: 2287.402 us +/-27.909 (min: 2128.704 / max: 2296.224) us
testing min with axis = (1, 2) ...
min (old kernel): GPU: 1209.743 us +/-45.193 (min: 1058.816 / max: 1281.920) us
min (CUB device): GPU: 1188.406 us +/-71.327 (min: 1067.072 / max: 1244.864) us
min (CUB blocks): GPU: 3079.536 us +/-16.804 (min: 2913.856 / max: 3088.384) us
testing min with axis = (0, 1, 2) ...
min (old kernel): GPU:65523.585 us +/-24.032 (min:65359.619 / max:65550.018) us
min (CUB device): GPU: 1172.083 us +/-15.943 (min: 1015.936 / max: 1188.672) us
min (CUB blocks): GPU: 2754.994 us +/-12.025 (min: 2717.440 / max: 2769.952) us
testing max with axis = (2,) ...
max (old kernel): GPU: 4098.848 us +/-23.508 (min: 3934.336 / max: 4111.936) us
max (CUB device): GPU: 2370.094 us +/-22.693 (min: 2205.504 / max: 2406.944) us
max (CUB blocks): GPU: 2281.976 us +/-41.136 (min: 2128.896 / max: 2323.712) us
testing max with axis = (1, 2) ...
max (old kernel): GPU: 1218.720 us +/-24.551 (min: 1061.344 / max: 1231.136) us
max (CUB device): GPU: 1230.850 us +/-16.701 (min: 1069.664 / max: 1261.152) us
max (CUB blocks): GPU: 3072.603 us +/-36.640 (min: 2914.592 / max: 3090.624) us
testing max with axis = (0, 1, 2) ...
max (old kernel): GPU:65528.301 us +/-23.639 (min:65365.822 / max:65551.453) us
max (CUB device): GPU: 1174.942 us +/- 7.697 (min: 1161.280 / max: 1241.696) us
max (CUB blocks): GPU: 2757.204 us +/- 9.050 (min: 2717.664 / max: 2778.400) us
testing argmin with axis = (2,) ...
argmin (old kernel): GPU: 4461.026 us +/-29.950 (min: 4299.520 / max: 4502.752) us
argmin (CUB device): GPU: 4471.823 us +/-31.472 (min: 4313.024 / max: 4491.456) us
argmin (CUB blocks): GPU: 3019.353 us +/-23.323 (min: 2856.768 / max: 3037.984) us
testing argmin with axis = (1, 2) ...
argmin (old kernel): GPU: 1560.079 us +/-22.999 (min: 1400.320 / max: 1577.056) us
argmin (CUB device): GPU: 1560.169 us +/-31.487 (min: 1402.240 / max: 1579.872) us
argmin (CUB blocks): GPU: 3961.720 us +/-28.149 (min: 3802.336 / max: 3984.064) us
testing argmin with axis = (0, 1, 2) ...
argmin (old kernel): GPU:88031.716 us +/-26.109 (min:87861.633 / max:88106.689) us
argmin (CUB device): GPU: 1506.377 us +/-46.813 (min: 1371.392 / max: 1550.944) us
argmin (CUB blocks): GPU: 3474.096 us +/- 7.337 (min: 3459.936 / max: 3495.424) us
testing argmax with axis = (2,) ...
argmax (old kernel): GPU: 4473.267 us +/-29.312 (min: 4310.208 / max: 4507.296) us
argmax (CUB device): GPU: 4477.163 us +/-25.553 (min: 4312.800 / max: 4507.744) us
argmax (CUB blocks): GPU: 3023.725 us +/-18.631 (min: 2858.144 / max: 3046.112) us
testing argmax with axis = (1, 2) ...
argmax (old kernel): GPU: 1562.218 us +/-32.919 (min: 1400.192 / max: 1611.584) us
argmax (CUB device): GPU: 1569.789 us +/-40.921 (min: 1402.784 / max: 1608.992) us
argmax (CUB blocks): GPU: 4000.629 us +/-26.367 (min: 3835.328 / max: 4026.048) us
testing argmax with axis = (0, 1, 2) ...
argmax (old kernel): GPU:88749.531 us +/-31.298 (min:88575.394 / max:88814.140) us
argmax (CUB device): GPU: 1527.943 us +/-21.475 (min: 1386.144 / max: 1559.424) us
argmax (CUB blocks): GPU: 3495.164 us +/- 4.540 (min: 3487.296 / max: 3511.808) us
testing amin with axis = (2,) ...
amin (old kernel): GPU: 4158.543 us +/-38.735 (min: 3998.624 / max: 4192.576) us
amin (CUB device): (CUB device-wide reduction not available)
amin (CUB blocks): GPU: 2330.791 us +/-16.167 (min: 2172.736 / max: 2341.696) us
testing amin with axis = (1, 2) ...
amin (old kernel): GPU: 1240.774 us +/-11.165 (min: 1133.600 / max: 1249.952) us
amin (CUB device): (CUB device-wide reduction not available)
amin (CUB blocks): GPU: 3121.667 us +/-33.249 (min: 2961.120 / max: 3164.640) us
testing amin with axis = (0, 1, 2) ...
amin (old kernel): GPU:66612.955 us +/-20.001 (min:66442.848 / max:66666.557) us
amin (CUB device): (CUB device-wide reduction not available)
amin (CUB blocks): GPU: 2798.982 us +/- 4.875 (min: 2760.896 / max: 2812.384) us
testing amax with axis = (2,) ...
amax (old kernel): GPU: 4153.098 us +/-38.910 (min: 3998.016 / max: 4178.048) us
amax (CUB device): (CUB device-wide reduction not available)
amax (CUB blocks): GPU: 2327.452 us +/-16.746 (min: 2163.904 / max: 2341.216) us
testing amax with axis = (1, 2) ...
amax (old kernel): GPU: 1239.937 us +/-16.702 (min: 1077.504 / max: 1247.616) us
amax (CUB device): (CUB device-wide reduction not available)
amax (CUB blocks): GPU: 3122.654 us +/-18.702 (min: 2962.080 / max: 3141.216) us
testing amax with axis = (0, 1, 2) ...
amax (old kernel): GPU:66607.313 us +/-29.667 (min:66440.926 / max:66660.637) us
amax (CUB device): (CUB device-wide reduction not available)
amax (CUB blocks): GPU: 2799.492 us +/- 6.595 (min: 2760.896 / max: 2813.920) us
testing nanmin with axis = (2,) ...
nanmin (old kernel): GPU: 3748.271 us +/-53.535 (min: 3613.216 / max: 3859.488) us
nanmin (CUB device): (CUB device-wide reduction not available)
nanmin (CUB blocks): GPU: 1126.258 us +/-27.870 (min: 965.664 / max: 1138.272) us
testing nanmin with axis = (1, 2) ...
nanmin (old kernel): GPU: 1120.401 us +/-28.170 (min: 959.648 / max: 1134.272) us
nanmin (CUB device): (CUB device-wide reduction not available)
nanmin (CUB blocks): GPU: 1415.120 us +/-31.366 (min: 1260.992 / max: 1431.392) us
testing nanmin with axis = (0, 1, 2) ...
nanmin (old kernel): GPU:44062.278 us +/-35.477 (min:43886.337 / max:44105.282) us
nanmin (CUB device): (CUB device-wide reduction not available)
nanmin (CUB blocks): GPU: 1334.702 us +/-10.854 (min: 1244.608 / max: 1351.104) us
testing nanmax with axis = (2,) ...
nanmax (old kernel): GPU: 3744.612 us +/-44.138 (min: 3632.800 / max: 3846.304) us
nanmax (CUB device): (CUB device-wide reduction not available)
nanmax (CUB blocks): GPU: 1128.282 us +/-23.219 (min: 966.560 / max: 1150.560) us
testing nanmax with axis = (1, 2) ...
nanmax (old kernel): GPU: 1115.195 us +/-41.428 (min: 957.184 / max: 1139.648) us
nanmax (CUB device): (CUB device-wide reduction not available)
nanmax (CUB blocks): GPU: 1421.446 us +/-18.968 (min: 1260.480 / max: 1475.328) us
testing nanmax with axis = (0, 1, 2) ...
nanmax (old kernel): GPU:44058.180 us +/-41.610 (min:43879.841 / max:44128.609) us
nanmax (CUB device): (CUB device-wide reduction not available)
nanmax (CUB blocks): GPU: 1325.475 us +/-30.487 (min: 1235.840 / max: 1356.640) us
testing nanargmin with axis = (2,) ...
nanargmin (old kernel): GPU: 4764.826 us +/-32.746 (min: 4604.000 / max: 4803.488) us
nanargmin (CUB device): (CUB device-wide reduction not available)
nanargmin (CUB blocks): GPU: 3365.695 us +/-33.226 (min: 3203.872 / max: 3419.648) us
testing nanargmin with axis = (1, 2) ...
nanargmin (old kernel): GPU: 1920.273 us +/-23.212 (min: 1758.720 / max: 1936.128) us
nanargmin (CUB device): (CUB device-wide reduction not available)
nanargmin (CUB blocks): GPU: 4288.908 us +/-27.625 (min: 4127.520 / max: 4306.304) us
testing nanargmin with axis = (0, 1, 2) ...
nanargmin (old kernel): GPU:110877.606 us +/-23.823 (min:110721.474 / max:110942.268) us
nanargmin (CUB device): (CUB device-wide reduction not available)
nanargmin (CUB blocks): GPU: 3784.081 us +/- 4.173 (min: 3775.072 / max: 3804.800) us
testing nanargmax with axis = (2,) ...
nanargmax (old kernel): GPU: 4781.100 us +/-43.501 (min: 4605.216 / max: 4820.672) us
nanargmax (CUB device): (CUB device-wide reduction not available)
nanargmax (CUB blocks): GPU: 3389.230 us +/-23.426 (min: 3228.288 / max: 3422.912) us
testing nanargmax with axis = (1, 2) ...
nanargmax (old kernel): GPU: 1932.439 us +/-19.733 (min: 1774.432 / max: 1941.728) us
nanargmax (CUB device): (CUB device-wide reduction not available)
nanargmax (CUB blocks): GPU: 4313.878 us +/-35.263 (min: 4156.384 / max: 4336.704) us
testing nanargmax with axis = (0, 1, 2) ...
nanargmax (old kernel): GPU:111807.415 us +/-25.919 (min:111639.137 / max:111855.804) us
nanargmax (CUB device): (CUB device-wide reduction not available)
nanargmax (CUB blocks): GPU: 3808.783 us +/- 3.996 (min: 3799.424 / max: 3821.568) us
testing mean with axis = (2,) ...
mean (old kernel): GPU: 4608.910 us +/-23.902 (min: 4448.192 / max: 4629.056) us
mean (CUB device): (CUB device-wide reduction not available)
mean (CUB blocks): GPU: 1297.813 us +/-27.920 (min: 1138.912 / max: 1319.680) us
testing mean with axis = (1, 2) ...
mean (old kernel): GPU: 1099.979 us +/-22.353 (min: 941.952 / max: 1110.656) us
mean (CUB device): (CUB device-wide reduction not available)
mean (CUB blocks): GPU: 1151.320 us +/-11.933 (min: 1036.576 / max: 1167.872) us
testing mean with axis = (0, 1, 2) ...
mean (old kernel): GPU:39350.566 us +/-23.792 (min:39193.569 / max:39391.457) us
mean (CUB device): (CUB device-wide reduction not available)
mean (CUB blocks): GPU: 1199.286 us +/-19.951 (min: 1077.056 / max: 1218.720) us
testing nanmean with axis = (2,) ...
nanmean (old kernel): GPU: 3170.737 us +/-21.619 (min: 3008.896 / max: 3204.000) us
nanmean (CUB device): (CUB device-wide reduction not available)
nanmean (CUB blocks): GPU: 1530.691 us +/-37.290 (min: 1380.032 / max: 1548.960) us
testing nanmean with axis = (1, 2) ...
nanmean (old kernel): GPU: 1105.564 us +/-35.401 (min: 948.768 / max: 1127.968) us
nanmean (CUB device): (CUB device-wide reduction not available)
nanmean (CUB blocks): GPU: 1490.137 us +/-16.748 (min: 1326.528 / max: 1507.136) us
testing nanmean with axis = (0, 1, 2) ...
nanmean (old kernel): GPU:48621.619 us +/-33.912 (min:48461.346 / max:48662.529) us
nanmean (CUB device): (CUB device-wide reduction not available)
nanmean (CUB blocks): GPU: 1518.977 us +/-21.871 (min: 1395.968 / max: 1544.160) us
testing var with axis = (2,) ...
var (old kernel): GPU: 6004.076 us +/-69.531 (min: 5687.840 / max: 6040.224) us
var (CUB device): (CUB device-wide reduction not available)
var (CUB blocks): GPU: 2748.602 us +/-60.475 (min: 2411.712 / max: 2832.192) us
testing var with axis = (1, 2) ...
var (old kernel): GPU: 9496.720 us +/-39.205 (min: 9334.592 / max: 9566.240) us
var (CUB device): (CUB device-wide reduction not available)
var (CUB blocks): GPU: 9543.002 us +/-43.328 (min: 9379.264 / max: 9595.968) us
testing var with axis = (0, 1, 2) ...
var (old kernel): GPU:1277100.681 us +/-248.080 (min:1276486.694 / max:1277801.025) us
var (CUB device): (CUB device-wide reduction not available)
var (CUB blocks): GPU:46306.821 us +/-39.049 (min:46137.119 / max:46408.001) us
testing nanvar with axis = (2,) ...
nanvar (old kernel): GPU:12259.616 us +/-31.414 (min:12096.192 / max:12284.320) us
nanvar (CUB device): (CUB device-wide reduction not available)
nanvar (CUB blocks): GPU: 8146.314 us +/-23.980 (min: 7981.248 / max: 8182.304) us
testing nanvar with axis = (1, 2) ...
nanvar (old kernel): GPU:49811.653 us +/-144.943 (min:49449.310 / max:50002.239) us
nanvar (CUB device): (CUB device-wide reduction not available)
nanvar (CUB blocks): GPU:50196.915 us +/-145.196 (min:49802.654 / max:50374.561) us
testing nanvar with axis = (0, 1, 2) ...
nanvar (old kernel): GPU:3455061.255 us +/-166.683 (min:3453450.195 / max:3455130.615) us
nanvar (CUB device): (CUB device-wide reduction not available)
nanvar (CUB blocks): GPU:384908.101 us +/-497.698 (min:381440.948 / max:385063.110) us
testing nansum with axis = (2,) ...
nansum (old kernel): GPU: 4583.659 us +/-32.734 (min: 4422.624 / max: 4637.920) us
nansum (CUB device): (CUB device-wide reduction not available)
nansum (CUB blocks): GPU: 1493.893 us +/-19.721 (min: 1333.056 / max: 1509.568) us
testing nansum with axis = (1, 2) ...
nansum (old kernel): GPU: 1109.703 us +/-22.828 (min: 950.496 / max: 1132.064) us
nansum (CUB device): (CUB device-wide reduction not available)
nansum (CUB blocks): GPU: 1471.623 us +/-17.535 (min: 1309.056 / max: 1526.176) us
testing nansum with axis = (0, 1, 2) ...
nansum (old kernel): GPU:45635.562 us +/-28.171 (min:45462.914 / max:45686.687) us
nansum (CUB device): (CUB device-wide reduction not available)
nansum (CUB blocks): GPU: 1498.623 us +/-18.228 (min: 1372.832 / max: 1516.384) us
testing nanprod with axis = (2,) ...
nanprod (old kernel): GPU: 4546.808 us +/-28.821 (min: 4381.056 / max: 4594.432) us
nanprod (CUB device): (CUB device-wide reduction not available)
nanprod (CUB blocks): GPU: 1495.981 us +/-16.283 (min: 1340.576 / max: 1527.840) us
testing nanprod with axis = (1, 2) ...
nanprod (old kernel): GPU: 1100.172 us +/-45.119 (min: 947.264 / max: 1179.648) us
nanprod (CUB device): (CUB device-wide reduction not available)
nanprod (CUB blocks): GPU: 1477.601 us +/-28.550 (min: 1315.200 / max: 1511.072) us
testing nanprod with axis = (0, 1, 2) ...
nanprod (old kernel): GPU:45632.190 us +/-30.316 (min:45465.408 / max:45693.760) us
nanprod (CUB device): (CUB device-wide reduction not available)
nanprod (CUB blocks): GPU: 1496.186 us +/-22.149 (min: 1370.848 / max: 1511.776) us
testing all with axis = (2,) ...
all (old kernel): GPU: 2050.136 us +/- 9.295 (min: 2035.232 / max: 2063.520) us
all (CUB device): (CUB device-wide reduction not available)
all (CUB blocks): GPU: 1095.139 us +/-28.510 (min: 934.304 / max: 1108.512) us
testing all with axis = (1, 2) ...
all (old kernel): GPU: 1098.593 us +/-21.025 (min: 938.016 / max: 1108.864) us
all (CUB device): (CUB device-wide reduction not available)
all (CUB blocks): GPU: 1085.781 us +/-54.378 (min: 939.424 / max: 1133.504) us
testing all with axis = (0, 1, 2) ...
all (old kernel): GPU:42157.684 us +/-42.135 (min:41983.265 / max:42210.655) us
all (CUB device): (CUB device-wide reduction not available)
all (CUB blocks): GPU: 1104.489 us +/-19.050 (min: 948.352 / max: 1146.080) us
testing any with axis = (2,) ...
any (old kernel): GPU: 2059.591 us +/-19.744 (min: 1916.896 / max: 2086.016) us
any (CUB device): (CUB device-wide reduction not available)
any (CUB blocks): GPU: 1092.373 us +/-36.118 (min: 934.976 / max: 1103.616) us
testing any with axis = (1, 2) ...
any (old kernel): GPU: 1100.307 us +/-16.240 (min: 940.064 / max: 1105.600) us
any (CUB device): (CUB device-wide reduction not available)
any (CUB blocks): GPU: 1103.470 us +/-30.003 (min: 944.768 / max: 1122.496) us
testing any with axis = (0, 1, 2) ...
any (old kernel): GPU:42582.782 us +/-35.315 (min:42407.135 / max:42617.569) us
any (CUB device): (CUB device-wide reduction not available)
any (CUB blocks): GPU: 1099.377 us +/-21.385 (min: 950.144 / max: 1117.376) us
testing count_nonzero with axis = (2,) ...
count_nonzero (old kernel): GPU: 2139.850 us +/-10.695 (min: 2125.664 / max: 2181.952) us
count_nonzero (CUB device): (CUB device-wide reduction not available)
count_nonzero (CUB blocks): GPU: 1098.308 us +/-24.435 (min: 939.808 / max: 1112.288) us
testing count_nonzero with axis = (1, 2) ...
count_nonzero (old kernel): GPU: 1097.295 us +/-28.293 (min: 936.000 / max: 1110.208) us
count_nonzero (CUB device): (CUB device-wide reduction not available)
count_nonzero (CUB blocks): GPU: 1104.680 us +/-17.343 (min: 943.072 / max: 1132.128) us
testing count_nonzero with axis = (0, 1, 2) ...
count_nonzero (old kernel): GPU:43069.612 us +/-28.566 (min:42907.009 / max:43110.111) us
count_nonzero (CUB device): (CUB device-wide reduction not available)
count_nonzero (CUB blocks): GPU: 1101.698 us +/-15.388 (min: 958.176 / max: 1151.520) us
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
CUDA 9.2 + P100:
("old kernel" is CuPy's original implementation, "CUB device" uses cupy.cuda.cub if available, and "CUB blocks" refers to this PR.)