Last active
August 29, 2015 14:01
-
-
Save kohnakagawa/cba94a8127202ad7c209 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <iostream> | |
| #include <cstdlib> | |
| #include <sys/time.h> | |
| #define BLOCK_SIZE 256 | |
| #define GRID_SIZE 500 | |
| #define SYS_SIZE (BLOCK_SIZE * GRID_SIZE) | |
| double Uniform(){ | |
| return ((double)rand()+1.0)/((double)RAND_MAX+2.0); | |
| } | |
| //no unroll | |
| __global__ void test_kernel1(const float* __restrict__ x,const float* __restrict__ y,const float* __restrict__ z,float3* F){ | |
| const int tid = threadIdx.x + blockIdx.x * blockDim.x; | |
| const float3 ri = {x[tid],y[tid],z[tid]}; | |
| float3 sumF = {0.0, 0.0, 0.0}; | |
| for(int i=0; i<SYS_SIZE; i++){ | |
| const float3 rj = {x[i],y[i],z[i]}; | |
| const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z}; | |
| const float dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z; | |
| const int crit = (dr2 < 1.0); | |
| const float inv_dr = rsqrt(dr2); | |
| const float dist_m_one = inv_dr - 1.0; | |
| const float cf = crit * dist_m_one; | |
| const float3 dF = {cf * dr.x, | |
| cf * dr.y, | |
| cf * dr.z}; | |
| sumF.x += dF.x; | |
| sumF.y += dF.y; | |
| sumF.z += dF.z; | |
| } | |
| F[tid].x = sumF.x; | |
| F[tid].y = sumF.y; | |
| F[tid].z = sumF.z; | |
| } | |
| //2 unroll | |
| __global__ void test_kernel2(const float* __restrict__ x,const float* __restrict__ y,const float* __restrict__ z,float3* F){ | |
| const int tid = threadIdx.x + blockIdx.x * blockDim.x; | |
| const float3 ri = {x[tid],y[tid],z[tid]}; | |
| float3 sumF[2] = {{0.0, 0.0, 0.0},{0.0, 0.0, 0.0}}; | |
| for(int i=0; i<SYS_SIZE; i+=2){ | |
| const float3 rj[2] = {{x[i],y[i],z[i]},{x[i+1],y[i+1],z[i+1]}}; | |
| const float3 dr[2] = {{ri.x - rj[0].x, ri.y - rj[0].y, ri.z - rj[0].z}, | |
| {ri.x - rj[1].x, ri.y - rj[1].y, ri.z - rj[1].z}}; | |
| const float dr2[2] = {dr[0].x*dr[0].x + dr[0].y*dr[0].y + dr[0].z*dr[0].z, | |
| dr[1].x*dr[1].x + dr[1].y*dr[1].y + dr[1].z*dr[1].z}; | |
| const int crit[2] = {(dr2[0] < 1.0),(dr2[1] < 1.0)}; | |
| const float inv_dr[2] = {rsqrt(dr2[0]),rsqrt(dr2[1])}; | |
| const float dist_m_one[2] = {inv_dr[0] - 1.0,inv_dr[1] - 1.0}; | |
| const float cf[2] = {crit[0] * dist_m_one[0],crit[1] * dist_m_one[1]}; | |
| const float3 dF[2] = {{cf[0]*dr[0].x, cf[0]*dr[0].y, cf[0]*dr[0].z}, | |
| {cf[1]*dr[1].x, cf[1]*dr[1].y, cf[1]*dr[1].z}}; | |
| sumF[0].x += dF[0].x;sumF[0].y += dF[0].y;sumF[0].z += dF[0].z; | |
| sumF[1].x += dF[1].x;sumF[1].y += dF[1].y;sumF[1].z += dF[1].z; | |
| } | |
| F[tid].x = sumF[0].x + sumF[1].x; | |
| F[tid].y = sumF[0].y + sumF[1].y; | |
| F[tid].z = sumF[0].z + sumF[1].z; | |
| } | |
| //4 unroll | |
| __global__ void test_kernel3(const float* __restrict__ x,const float* __restrict__ y,const float* __restrict__ z,float3* F){ | |
| const int tid = threadIdx.x + blockIdx.x * blockDim.x; | |
| const float3 ri = {x[tid],y[tid],z[tid]}; | |
| float3 sumF[4] = {{0.0, 0.0, 0.0},{0.0, 0.0, 0.0},{0.0, 0.0, 0.0},{0.0, 0.0, 0.0}}; | |
| for(int i=0; i<SYS_SIZE; i+=4){ | |
| const float3 rj[4] = {{x[i],y[i],z[i]},{x[i+1],y[i+1],z[i+1]},{x[i+2],y[i+2],z[i+2]},{x[i+3],y[i+3],z[i+3]}}; | |
| const float3 dr[4] = {{ri.x - rj[0].x, ri.y - rj[0].y, ri.z - rj[0].z}, | |
| {ri.x - rj[1].x, ri.y - rj[1].y, ri.z - rj[1].z}, | |
| {ri.x - rj[2].x, ri.y - rj[2].y, ri.z - rj[2].z}, | |
| {ri.x - rj[3].x, ri.y - rj[3].y, ri.z - rj[3].z}}; | |
| const float dr2[4] = {dr[0].x*dr[0].x + dr[0].y*dr[0].y + dr[0].z*dr[0].z, | |
| dr[1].x*dr[1].x + dr[1].y*dr[1].y + dr[1].z*dr[1].z, | |
| dr[2].x*dr[2].x + dr[2].y*dr[2].y + dr[2].z*dr[2].z, | |
| dr[3].x*dr[3].x + dr[3].y*dr[3].y + dr[3].z*dr[3].z}; | |
| const int crit[4] = {(dr2[0] < 1.0),(dr2[1] < 1.0),(dr2[2] < 1.0),(dr2[3] < 1.0)}; | |
| const float inv_dr[4] = {rsqrt(dr2[0]),rsqrt(dr2[1]),rsqrt(dr2[2]),rsqrt(dr2[3])}; | |
| const float dist_m_one[4] = {inv_dr[0] - 1.0,inv_dr[1] - 1.0,inv_dr[2] - 1.0,inv_dr[3] - 1.0}; | |
| const float cf[4] = {crit[0] * dist_m_one[0],crit[1] * dist_m_one[1],crit[2] * dist_m_one[2],crit[3] * dist_m_one[3]}; | |
| const float3 dF[4] = {{cf[0]*dr[0].x, cf[0]*dr[0].y, cf[0]*dr[0].z}, | |
| {cf[1]*dr[1].x, cf[1]*dr[1].y, cf[1]*dr[1].z}, | |
| {cf[2]*dr[2].x, cf[2]*dr[2].y, cf[2]*dr[2].z}, | |
| {cf[3]*dr[3].x, cf[3]*dr[3].y, cf[3]*dr[3].z}}; | |
| sumF[0].x += dF[0].x;sumF[0].y += dF[0].y;sumF[0].z += dF[0].z; | |
| sumF[1].x += dF[1].x;sumF[1].y += dF[1].y;sumF[1].z += dF[1].z; | |
| sumF[2].x += dF[2].x;sumF[2].y += dF[2].y;sumF[2].z += dF[2].z; | |
| sumF[3].x += dF[3].x;sumF[3].y += dF[3].y;sumF[3].z += dF[3].z; | |
| } | |
| F[tid].x = sumF[0].x + sumF[1].x + sumF[2].x + sumF[3].x; | |
| F[tid].y = sumF[0].y + sumF[1].y + sumF[2].y + sumF[3].y; | |
| F[tid].z = sumF[0].z + sumF[1].z + sumF[2].z + sumF[3].z; | |
| } | |
| __global__ void test_kernel4(const float4* __restrict__ r,float3* __restrict__ F){ | |
| const int tid = threadIdx.x + blockIdx.x * blockDim.x; | |
| const float4 ri = r[tid]; | |
| float3 sumF = {0.0, 0.0, 0.0}; | |
| for(int i=0; i<SYS_SIZE; i++){ | |
| const float4 rj = r[i]; | |
| const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z}; | |
| const float dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z; | |
| const int crit = (dr2 < 1.0); | |
| const float inv_dr = rsqrt(dr2); | |
| const float dist_m_one = inv_dr - 1.0; | |
| const float cf = crit * dist_m_one; | |
| const float3 dF = {cf * dr.x, | |
| cf * dr.y, | |
| cf * dr.z}; | |
| sumF.x += dF.x; | |
| sumF.y += dF.y; | |
| sumF.z += dF.z; | |
| } | |
| F[tid].x = sumF.x; | |
| F[tid].y = sumF.y; | |
| F[tid].z = sumF.z; | |
| } | |
| __global__ void test_kernel5(const float3* __restrict__ r,float3* __restrict__ F){ | |
| const int tid = threadIdx.x + blockIdx.x*blockDim.x; | |
| const float3 ri = r[tid]; | |
| float3 sumF = {0.0, 0.0, 0.0}; | |
| for(int i=0; i<SYS_SIZE; i++){ | |
| const float3 rj = r[i]; | |
| const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z}; | |
| const float dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z; | |
| const int crit = (dr2 < 1.0); | |
| const float inv_dr = rsqrt(dr2); | |
| const float dist_m_one = inv_dr - 1.0; | |
| const float cf = crit * dist_m_one; | |
| const float3 dF = {cf * dr.x, | |
| cf * dr.y, | |
| cf * dr.z}; | |
| sumF.x += dF.x; | |
| sumF.y += dF.y; | |
| sumF.z += dF.z; | |
| } | |
| F[tid].x = sumF.x; | |
| F[tid].y = sumF.y; | |
| F[tid].z = sumF.z; | |
| } | |
| __global__ void test_kernel6(const float4* __restrict__ r,float4* __restrict__ F){ | |
| const int tid = threadIdx.x + blockIdx.x * blockDim.x; | |
| const float4 ri = r[tid]; | |
| float4 sumF = {0.0, 0.0, 0.0}; | |
| for(int i=0; i<SYS_SIZE; i++){ | |
| const float4 rj = r[i]; | |
| const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z}; | |
| const float dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z; | |
| const int crit = (dr2 < 1.0); | |
| const float inv_dr = rsqrt(dr2); | |
| const float dist_m_one = inv_dr - 1.0; | |
| const float cf = crit * dist_m_one; | |
| const float3 dF = {cf * dr.x, | |
| cf * dr.y, | |
| cf * dr.z}; | |
| sumF.x += dF.x; | |
| sumF.y += dF.y; | |
| sumF.z += dF.z; | |
| } | |
| F[tid] = sumF; | |
| } | |
| double time_dif(struct timeval &tv1, struct timeval &tv2) | |
| { | |
| double delta_t; | |
| delta_t = tv2.tv_sec - tv1.tv_sec + (tv2.tv_usec - tv1.tv_usec) * 1.0e-6; | |
| return delta_t; | |
| } | |
| int main(void){ | |
| struct timeval tv1,tv2; | |
| using namespace std; | |
| cudaSetDevice(0); | |
| float *x_d,*x_h; | |
| float *y_d,*y_h; | |
| float *z_d,*z_h; | |
| float4 *r4_d,*r4_h; | |
| float3 *r3_d,*r3_h; | |
| float3 *F3_d; | |
| float4 *F4_d; | |
| x_h = new float [SYS_SIZE]; | |
| y_h = new float [SYS_SIZE]; | |
| z_h = new float [SYS_SIZE]; | |
| r4_h = new float4 [SYS_SIZE]; | |
| r3_h = new float3 [SYS_SIZE]; | |
| cudaMalloc(&x_d,SYS_SIZE*sizeof(float)); | |
| cudaMalloc(&y_d,SYS_SIZE*sizeof(float)); | |
| cudaMalloc(&z_d,SYS_SIZE*sizeof(float)); | |
| cudaMalloc(&r4_d,SYS_SIZE*sizeof(float4)); | |
| cudaMalloc(&r3_d,SYS_SIZE*sizeof(float3)); | |
| cudaMalloc(&F3_d,SYS_SIZE*sizeof(float3)); | |
| cudaMalloc(&F4_d,SYS_SIZE*sizeof(float4)); | |
| for(int i=0; i<SYS_SIZE; i++){ | |
| x_h[i] = Uniform(); | |
| y_h[i] = Uniform(); | |
| z_h[i] = Uniform(); | |
| r4_h[i].x = Uniform(); | |
| r4_h[i].y = Uniform(); | |
| r4_h[i].z = Uniform(); | |
| r4_h[i].w = 0.0; | |
| r3_h[i].x = Uniform(); | |
| r3_h[i].y = Uniform(); | |
| r3_h[i].z = Uniform(); | |
| } | |
| cudaMemcpy(x_d,x_h,SYS_SIZE*sizeof(float),cudaMemcpyHostToDevice); | |
| cudaMemcpy(y_d,y_h,SYS_SIZE*sizeof(float),cudaMemcpyHostToDevice); | |
| cudaMemcpy(z_d,z_h,SYS_SIZE*sizeof(float),cudaMemcpyHostToDevice); | |
| cudaMemcpy(r4_d,r4_h,SYS_SIZE*sizeof(float4),cudaMemcpyHostToDevice); | |
| cudaMemcpy(r3_d,r3_h,SYS_SIZE*sizeof(float3),cudaMemcpyHostToDevice); | |
| gettimeofday(&tv1,NULL); | |
| test_kernel1<<<GRID_SIZE,BLOCK_SIZE>>>(x_d,y_d,z_d,F3_d); | |
| cudaDeviceSynchronize(); | |
| gettimeofday(&tv2,NULL); | |
| cout << time_dif(tv1,tv2) << endl; | |
| gettimeofday(&tv1,NULL); | |
| test_kernel2<<<GRID_SIZE,BLOCK_SIZE>>>(x_d,y_d,z_d,F3_d); | |
| cudaDeviceSynchronize(); | |
| gettimeofday(&tv2,NULL); | |
| cout << time_dif(tv1,tv2) << endl; | |
| gettimeofday(&tv1,NULL); | |
| test_kernel3<<<GRID_SIZE,BLOCK_SIZE>>>(x_d,y_d,z_d,F3_d); | |
| cudaDeviceSynchronize(); | |
| gettimeofday(&tv2,NULL); | |
| cout << time_dif(tv1,tv2) << endl; | |
| gettimeofday(&tv1,NULL); | |
| test_kernel4<<<GRID_SIZE,BLOCK_SIZE>>>(r4_d,F3_d); | |
| cudaDeviceSynchronize(); | |
| gettimeofday(&tv2,NULL); | |
| cout << time_dif(tv1,tv2) << endl; | |
| gettimeofday(&tv1,NULL); | |
| test_kernel5<<<GRID_SIZE,BLOCK_SIZE>>>(r3_d,F3_d); | |
| cudaDeviceSynchronize(); | |
| gettimeofday(&tv2,NULL); | |
| cout << time_dif(tv1,tv2) << endl; | |
| gettimeofday(&tv1,NULL); | |
| test_kernel6<<<GRID_SIZE,BLOCK_SIZE>>>(r4_d,F4_d); | |
| cudaDeviceSynchronize(); | |
| gettimeofday(&tv2,NULL); | |
| cout << time_dif(tv1,tv2) << endl; | |
| delete [] x_h; | |
| delete [] y_h; | |
| delete [] z_h; | |
| delete [] r4_h; | |
| delete [] r3_h; | |
| cudaFree(x_d); | |
| cudaFree(y_d); | |
| cudaFree(z_d); | |
| cudaFree(r4_d); | |
| cudaFree(r3_d); | |
| cudaFree(F3_d); | |
| cudaFree(F4_d); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment