maedoc · April 21, 2019 21:29
diff --git a/cudara.py b/cudara.py
 """Implements recurrence analysis with CUDA.
 """

 import os, glob
 # not needed if using conda
 os.environ['NUMBAPRO_NVVM'], = glob.glob('c:/program files/nvidia*/cuda/*/nvvm/bin/nvvm64*.dll')
 os.environ['NUMBAPRO_LIBDEVICE'], = glob.glob('c:/program files/nvidia*/cuda/*/nvvm/libdevice')
 import math
 import numpy as np
 from numba import cuda
 import numba as nb

 cu32 = lambda a: np.ascontiguousarray(a.astype(np.uint32))

 @cuda.jit
 def _sseij(Y, I, J, O):
    # strides
    sty = cuda.blockDim.x
    sbx = sty * cuda.blockDim.y
    sby = sbx * cuda.gridDim.x
    sbz = sby * cuda.gridDim.y
    # this thread's index
    t = (cuda.threadIdx.x
       + cuda.threadIdx.y * sty
       + cuda.blockIdx.x * sbx
       + cuda.blockIdx.y * sby
       + cuda.blockIdx.z * sbz)
    if t < I.size:
        i = I[t]
        j = J[t]
        x = nb.float32(0.0)
        for k in range(Y.shape[0]):
            x += (Y[k, i] - Y[k, j])**2
        O[t] = math.sqrt(x)


 def ra(Y):
    """Perform recurrence analysis for series of matrices 
    Y size (n_mat, m, n).
    """
    Y = Y.astype('f')
    nw, ns, nf = Y.shape
    # simd friendly form
    Y_ = np.ascontiguousarray(
        Y.reshape((nw, -1)).T)
    assert Y_.strides == (nw*4, 4)  # (ns*nf, nw)
    i, j = map(cu32, np.tril_indices(nw))
    o = np.zeros((i.size, ), 'f')
    ntt = o.size
    ngz = (ntt // (32**4)) + 1
    grid, block = (32, 32, ngz), (32, 32)
    assert ntt < (np.prod(grid)*np.prod(block))
    print('hold on, launching...')
    _sseij[grid, block](Y_, i, j, o)
    print('kernel done')
    # compute dense output
    O = np.zeros((nw, nw), 'f')
    O[i, j] = o
    O += O.T
    return O
	"""Implements recurrence analysis with CUDA.
	"""

	import os, glob
	# not needed if using conda
	os.environ['NUMBAPRO_NVVM'], = glob.glob('c:/program files/nvidia/cuda//nvvm/bin/nvvm64*.dll')
	os.environ['NUMBAPRO_LIBDEVICE'], = glob.glob('c:/program files/nvidia/cuda//nvvm/libdevice')
	import math
	import numpy as np
	from numba import cuda
	import numba as nb

	cu32 = lambda a: np.ascontiguousarray(a.astype(np.uint32))

	@cuda.jit
	def _sseij(Y, I, J, O):
	# strides
	sty = cuda.blockDim.x
	sbx = sty * cuda.blockDim.y
	sby = sbx * cuda.gridDim.x
	sbz = sby * cuda.gridDim.y
	# this thread's index
	t = (cuda.threadIdx.x
	+ cuda.threadIdx.y * sty
	+ cuda.blockIdx.x * sbx
	+ cuda.blockIdx.y * sby
	+ cuda.blockIdx.z * sbz)
	if t < I.size:
	i = I[t]
	j = J[t]
	x = nb.float32(0.0)
	for k in range(Y.shape[0]):
	x += (Y[k, i] - Y[k, j])**2
	O[t] = math.sqrt(x)


	def ra(Y):
	"""Perform recurrence analysis for series of matrices
	Y size (n_mat, m, n).
	"""
	Y = Y.astype('f')
	nw, ns, nf = Y.shape
	# simd friendly form
	Y_ = np.ascontiguousarray(
	Y.reshape((nw, -1)).T)
	assert Y_.strides == (nw4, 4) # (nsnf, nw)
	i, j = map(cu32, np.tril_indices(nw))
	o = np.zeros((i.size, ), 'f')
	ntt = o.size
	ngz = (ntt // (32**4)) + 1
	grid, block = (32, 32, ngz), (32, 32)
	assert ntt < (np.prod(grid)*np.prod(block))
	print('hold on, launching...')
	_sseij[grid, block](Y_, i, j, o)
	print('kernel done')
	# compute dense output
	O = np.zeros((nw, nw), 'f')
	O[i, j] = o
	O += O.T
	return O
No results found