garanews · July 1, 2019 10:53
diff --git a/dask_cupy_kern.py b/dask_cupy_kern.py
 import time
 import dask.array as da
 import dask.dataframe as dd
 import numpy as np
 from dask.distributed import Client
 from dask_cuda import LocalCUDACluster
 import cupy as cp

 # CUDA kernel computing one weighted Ruzicka (min/max) score per row of ``x1``
 # against the single reference vector ``x2``.
 #
 # Each thread handles one row: column ``x`` of the row is scaled by
 # ``(1024 - x)``, then the element-wise minima and maxima against ``x2[x]``
 # are summed and ``y[row] = sum(min) / sum(max)`` is stored.
 # The row length (1024) is hard-coded in the kernel source.
 #
 # NOTE(review): the ``__main__`` section builds the reference vector with
 # weights 1023..0 (``cp.arange(1023, -1, -1)``) while this kernel weights the
 # rows with 1024..1 -- confirm which of the two is intended.
 ruzicka_kernel = cp.RawKernel(r'''
 extern "C" __global__
 void my_ruzicka(const unsigned short* x1, const unsigned short* x2, float* y, int nrow) {
    // 64-bit row index: "row * 1024" no longer fits a 32-bit int once a
    // single launch covers more than 2^21 rows.
    const long long row = (long long)blockDim.x * blockIdx.x + threadIdx.x;
    if (row >= nrow) {
        return;
    }

    const unsigned short* sample = x1 + row * 1024;
    float max_ab = 0.0f;
    float min_ab = 0.0f;

    for (int x = 0; x < 1024; x++) {
        const int tmp = sample[x] * (1024 - x);
        if (x2[x] > tmp) {
            max_ab += x2[x];
            min_ab += tmp;
        } else {
            min_ab += x2[x];
            max_ab += tmp;
        }
    }
    // When both the row and x2 are all zeros this is 0/0, i.e. NaN.
    y[row] = min_ab / max_ab;
 }
 ''', 'my_ruzicka')


 def ruzicka_retval(a, b):
    """Run the ``my_ruzicka`` kernel on one block of samples.

    Args:
        a: Array of samples, one sample per row. The kernel reads 1024
            values per row, so ``a`` is assumed to be ``(rows, 1024)``.
        b: Reference vector; the kernel reads its first 1024 values.

    Returns:
        A ``cupy`` float32 array of shape ``(1, CHUNKSIZE)``. The first
        ``len(a)`` entries hold the scores; the remainder stay zero.

    Raises:
        ValueError: If ``a`` has more than ``CHUNKSIZE`` rows, because the
            kernel would then write past the end of the output buffer.
    """
    pd_size = len(a)
    if pd_size > CHUNKSIZE:
        raise ValueError(
            f"block has {pd_size} rows but the output holds only {CHUNKSIZE}"
        )

    y = cp.zeros((1, CHUNKSIZE), dtype=cp.float32)
    if pd_size == 0:
        # A zero-sized grid is not a valid launch configuration.
        return y

    # RawKernel passes raw device pointers and performs no dtype conversion,
    # so the buffers must really be contiguous unsigned 16-bit data to match
    # the ``const unsigned short*`` parameters.
    a = cp.ascontiguousarray(cp.asarray(a, dtype=cp.uint16))
    b = cp.ascontiguousarray(cp.asarray(b, dtype=cp.uint16))

    # One thread per row: enough 1024-thread blocks to cover ``pd_size`` rows.
    threads = 1024
    blocks = (pd_size + threads - 1) // threads
    # The row count is passed as an explicit 32-bit value to match ``int nrow``.
    ruzicka_kernel((blocks,), (threads,), (a, b, y, cp.int32(pd_size)))
    return y


 # Number of rows per dask block; also the length of the per-block result
 # buffer allocated by ruzicka_retval.
 CHUNKSIZE = 1024


 if __name__ == "__main__":
    # Context managers shut the workers and the client down even when the
    # computation raises.
    with LocalCUDACluster() as cluster, Client(cluster) as client:
        # Generate one random sample to identify.
        vector_new = cp.array(np.random.choice([0, 1], 1024), dtype=cp.uint16) * cp.arange(1023, -1, -1, dtype=cp.uint16)
        start = time.time()

        # Simulate the matrix of samples to compare against (10k rows for now,
        # the goal is around 100M). The per-column weighting of these rows is
        # done inside the CUDA kernel for speed.
        # The kernel reads ``unsigned short`` data, so the matrix is stored as
        # uint16 instead of numpy's default integer type.
        a = np.random.choice(a=[1, 1], size=(10240, 1024)).astype(np.uint16)
        d_da = da.from_array(a, chunks=(CHUNKSIZE, 1024))

        # ruzicka_retval turns every (CHUNKSIZE, 1024) block into a
        # (1, CHUNKSIZE) block, so the new chunk shape is declared explicitly.
        res = d_da.map_blocks(
            lambda df: ruzicka_retval(cp.array(df), vector_new),
            dtype=cp.float32,
            chunks=(1, CHUNKSIZE),
        ).compute()

        # WORKING without DASK
        #    y = cp.zeros(len(d_da), dtype=cp.float32)
        #    ruzicka_kernel((len(d_da), ), (1024, ), (cp.array(d_da, dtype=cp.float32), vector_new, y, len(d_da)))

        print("END:", time.time()-start)
        print(res)
	import time
	import dask.array as da
	import dask.dataframe as dd
	import numpy as np
	from dask.distributed import Client
	from dask_cuda import LocalCUDACluster
	import cupy as cp

	# CUDA kernel computing one weighted Ruzicka (min/max) score per row of
	# ``x1`` against the single reference vector ``x2``.
	#
	# Each thread handles one row: column ``x`` of the row is scaled by
	# ``(1024 - x)``, then the element-wise minima and maxima against ``x2[x]``
	# are summed and ``y[row] = sum(min) / sum(max)`` is stored.
	# The row length (1024) is hard-coded in the kernel source.
	ruzicka_kernel = cp.RawKernel(r'''
	extern "C" __global__
	void my_ruzicka(const unsigned short* x1, const unsigned short* x2, float* y, int nrow) {
	    // 64-bit row index: "row * 1024" no longer fits a 32-bit int once a
	    // single launch covers more than 2^21 rows.
	    const long long row = (long long)blockDim.x * blockIdx.x + threadIdx.x;
	    if (row >= nrow) {
	        return;
	    }

	    const unsigned short* sample = x1 + row * 1024;
	    float max_ab = 0.0f;
	    float min_ab = 0.0f;

	    for (int x = 0; x < 1024; x++) {
	        const int tmp = sample[x] * (1024 - x);
	        if (x2[x] > tmp) {
	            max_ab += x2[x];
	            min_ab += tmp;
	        } else {
	            min_ab += x2[x];
	            max_ab += tmp;
	        }
	    }
	    // When both the row and x2 are all zeros this is 0/0, i.e. NaN.
	    y[row] = min_ab / max_ab;
	}
	''', 'my_ruzicka')


	def ruzicka_retval(a, b):
	    """Run the ``my_ruzicka`` kernel on one block of samples.

	    Args:
	        a: Array of samples, one sample per row. The kernel reads 1024
	            values per row, so ``a`` is assumed to be ``(rows, 1024)``.
	        b: Reference vector; the kernel reads its first 1024 values.

	    Returns:
	        A ``cupy`` float32 array of shape ``(1, CHUNKSIZE)``. The first
	        ``len(a)`` entries hold the scores; the remainder stay zero.

	    Raises:
	        ValueError: If ``a`` has more than ``CHUNKSIZE`` rows, because the
	            kernel would then write past the end of the output buffer.
	    """
	    pd_size = len(a)
	    if pd_size > CHUNKSIZE:
	        raise ValueError(
	            f"block has {pd_size} rows but the output holds only {CHUNKSIZE}"
	        )

	    y = cp.zeros((1, CHUNKSIZE), dtype=cp.float32)
	    if pd_size == 0:
	        # A zero-sized grid is not a valid launch configuration.
	        return y

	    # RawKernel passes raw device pointers and performs no dtype conversion,
	    # so the buffers must really be contiguous unsigned 16-bit data to match
	    # the ``const unsigned short*`` parameters.
	    a = cp.ascontiguousarray(cp.asarray(a, dtype=cp.uint16))
	    b = cp.ascontiguousarray(cp.asarray(b, dtype=cp.uint16))

	    # One thread per row: enough 1024-thread blocks to cover ``pd_size`` rows.
	    threads = 1024
	    blocks = (pd_size + threads - 1) // threads
	    # The row count is passed as an explicit 32-bit value to match ``int nrow``.
	    ruzicka_kernel((blocks,), (threads,), (a, b, y, cp.int32(pd_size)))
	    return y


	# Number of rows per dask block; also the length of the per-block result
	# buffer allocated by ruzicka_retval.
	CHUNKSIZE = 1024


	if __name__ == "__main__":
	    # Context managers shut the workers and the client down even when the
	    # computation raises.
	    with LocalCUDACluster() as cluster, Client(cluster) as client:
	        # Generate one random sample to identify.
	        vector_new = cp.array(np.random.choice([0, 1], 1024), dtype=cp.uint16) * cp.arange(1023, -1, -1, dtype=cp.uint16)
	        start = time.time()

	        # Simulate the matrix of samples to compare against (10k rows for
	        # now, the goal is around 100M). The per-column weighting of these
	        # rows is done inside the CUDA kernel for speed.
	        # The kernel reads ``unsigned short`` data, so the matrix is stored
	        # as uint16 instead of numpy's default integer type.
	        a = np.random.choice(a=[1, 1], size=(10240, 1024)).astype(np.uint16)
	        d_da = da.from_array(a, chunks=(CHUNKSIZE, 1024))

	        # ruzicka_retval turns every (CHUNKSIZE, 1024) block into a
	        # (1, CHUNKSIZE) block, so the new chunk shape is declared explicitly.
	        res = d_da.map_blocks(
	            lambda df: ruzicka_retval(cp.array(df), vector_new),
	            dtype=cp.float32,
	            chunks=(1, CHUNKSIZE),
	        ).compute()

	        # WORKING without DASK
	        # y = cp.zeros(len(d_da), dtype=cp.float32)
	        # ruzicka_kernel((len(d_da), ), (1024, ), (cp.array(d_da, dtype=cp.float32), vector_new, y, len(d_da)))

	        print("END:", time.time()-start)
	        print(res)