Created
April 21, 2019 21:29
-
-
Save maedoc/2414fb534fa7ce7561071ce87113987e to your computer and use it in GitHub Desktop.
recurrence analysis with Numba & CUDA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Implements recurrence analysis with CUDA. | |
| """ | |
| import os, glob | |
| # not needed if using conda | |
| os.environ['NUMBAPRO_NVVM'], = glob.glob('c:/program files/nvidia*/cuda/*/nvvm/bin/nvvm64*.dll') | |
| os.environ['NUMBAPRO_LIBDEVICE'], = glob.glob('c:/program files/nvidia*/cuda/*/nvvm/libdevice') | |
| import math | |
| import numpy as np | |
| from numba import cuda | |
| import numba as nb | |
| cu32 = lambda a: np.ascontiguousarray(a.astype(np.uint32)) | |
| @cuda.jit | |
| def _sseij(Y, I, J, O): | |
| # strides | |
| sty = cuda.blockDim.x | |
| sbx = sty * cuda.blockDim.y | |
| sby = sbx * cuda.gridDim.x | |
| sbz = sby * cuda.gridDim.y | |
| # this thread's index | |
| t = (cuda.threadIdx.x | |
| + cuda.threadIdx.y * sty | |
| + cuda.blockIdx.x * sbx | |
| + cuda.blockIdx.y * sby | |
| + cuda.blockIdx.z * sbz) | |
| if t < I.size: | |
| i = I[t] | |
| j = J[t] | |
| x = nb.float32(0.0) | |
| for k in range(Y.shape[0]): | |
| x += (Y[k, i] - Y[k, j])**2 | |
| O[t] = math.sqrt(x) | |
| def ra(Y): | |
| """Perform recurrence analysis for series of matrices | |
| Y size (n_mat, m, n). | |
| """ | |
| Y = Y.astype('f') | |
| nw, ns, nf = Y.shape | |
| # simd friendly form | |
| Y_ = np.ascontiguousarray( | |
| Y.reshape((nw, -1)).T) | |
| assert Y_.strides == (nw*4, 4) # (ns*nf, nw) | |
| i, j = map(cu32, np.tril_indices(nw)) | |
| o = np.zeros((i.size, ), 'f') | |
| ntt = o.size | |
| ngz = (ntt // (32**4)) + 1 | |
| grid, block = (32, 32, ngz), (32, 32) | |
| assert ntt < (np.prod(grid)*np.prod(block)) | |
| print('hold on, launching...') | |
| _sseij[grid, block](Y_, i, j, o) | |
| print('kernel done') | |
| # compute dense output | |
| O = np.zeros((nw, nw), 'f') | |
| O[i, j] = o | |
| O += O.T | |
| return O |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment