Created November 8, 2012 03:57
Compare performance of using host-registered pinned and unpinned host memory, version 2
#!/usr/bin/env python
"""
Compare performance of using host-registered pinned and unpinned host memory,
with more than one block for larger arrays, and with unpinned tried first.
"""
import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
from time import time

increment_mod = SourceModule("""
__global__ void increment(double *a, int N)
{
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if (idx < N)
        a[idx] = a[idx]+1;
}
""")
increment = increment_mod.get_function("increment")

N = 23  # breaks. Works if <= 22
M = 3

# Time use of pageable host memory:
x = np.empty((N, N), np.float64)
times = np.empty(M)
for i in xrange(M):
    x[:, :] = np.random.rand(N, N)
    x_orig = x.copy()
    start = time()
    increment(drv.InOut(x), np.uint32(x.size), block=(512, 1, 1),
              grid=(int(np.ceil(N*N/float(512))), 1, 1))
    times[i] = time()-start
    assert np.allclose(x_orig + 1, x)
print "Average kernel execution time with pageable memory: %3.7f" % np.mean(times)

# Time use of pinned host memory:
x = drv.aligned_empty((N, N), dtype=np.float64, order='C')
x = drv.register_host_memory(x, flags=drv.mem_host_register_flags.DEVICEMAP)
x_gpu_ptr = np.intp(x.base.get_device_pointer())
times = np.empty(M)
for i in xrange(M):
    x[:, :] = np.random.rand(N, N)
    x_orig = x.copy()
    start = time()
    increment(x_gpu_ptr, np.uint32(x.size), block=(512, 1, 1),
              grid=(int(np.ceil(N*N/float(512))), 1, 1))
    times[i] = time()-start
    assert np.allclose(x_orig + 1, x)
print "Average kernel execution time with pinned memory: %3.7f" % np.mean(times)
I think messing with the "alignment" keyword to aligned_empty() should make this problem go away, per Lev's comment on the PyCUDA mailing list, but I couldn't get that to work. In the end I just used pre-allocated memory per https://gist.github.com/4036297. Thanks Lev.
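For reference, a minimal sketch of the pre-allocated route, assuming it follows the usual PyCUDA pattern (I'm not reproducing the linked gist verbatim): allocate device-mapped, page-locked host memory up front with pagelocked_empty instead of registering an existing numpy array with register_host_memory, then hand the mapped device pointer to the kernel as before.

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv

N = 23
# Allocate page-locked, device-mapped host memory directly (cuMemHostAlloc)
# instead of registering an already-allocated numpy array (cuMemHostRegister).
x = drv.pagelocked_empty((N, N), np.float64,
                         mem_flags=drv.host_alloc_flags.DEVICEMAP)
# The mapped device pointer comes from the underlying allocation; it can be
# passed to increment() exactly as x_gpu_ptr is in the script above.
x_gpu_ptr = np.intp(x.base.get_device_pointer())

The allocation-time approach sidesteps whatever alignment requirement cuMemHostRegister imposes on an arbitrary numpy buffer, which seems to be where the N = 23 case falls over.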