Skip to content

Instantly share code, notes, and snippets.

@kohnakagawa
Last active August 29, 2015 14:01
Show Gist options
  • Save kohnakagawa/cba94a8127202ad7c209 to your computer and use it in GitHub Desktop.
Save kohnakagawa/cba94a8127202ad7c209 to your computer and use it in GitHub Desktop.
#include <iostream>
#include <cstdlib>
#include <sys/time.h>
#define BLOCK_SIZE 256
#define GRID_SIZE 500
#define SYS_SIZE (BLOCK_SIZE * GRID_SIZE)
double Uniform(){
return ((double)rand()+1.0)/((double)RAND_MAX+2.0);
}
//no unroll
__global__ void test_kernel1(const float* __restrict__ x,const float* __restrict__ y,const float* __restrict__ z,float3* F){
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const float3 ri = {x[tid],y[tid],z[tid]};
float3 sumF = {0.0, 0.0, 0.0};
for(int i=0; i<SYS_SIZE; i++){
const float3 rj = {x[i],y[i],z[i]};
const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z};
const float dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
const int crit = (dr2 < 1.0);
const float inv_dr = rsqrt(dr2);
const float dist_m_one = inv_dr - 1.0;
const float cf = crit * dist_m_one;
const float3 dF = {cf * dr.x,
cf * dr.y,
cf * dr.z};
sumF.x += dF.x;
sumF.y += dF.y;
sumF.z += dF.z;
}
F[tid].x = sumF.x;
F[tid].y = sumF.y;
F[tid].z = sumF.z;
}
//2 unroll
__global__ void test_kernel2(const float* __restrict__ x,const float* __restrict__ y,const float* __restrict__ z,float3* F){
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const float3 ri = {x[tid],y[tid],z[tid]};
float3 sumF[2] = {{0.0, 0.0, 0.0},{0.0, 0.0, 0.0}};
for(int i=0; i<SYS_SIZE; i+=2){
const float3 rj[2] = {{x[i],y[i],z[i]},{x[i+1],y[i+1],z[i+1]}};
const float3 dr[2] = {{ri.x - rj[0].x, ri.y - rj[0].y, ri.z - rj[0].z},
{ri.x - rj[1].x, ri.y - rj[1].y, ri.z - rj[1].z}};
const float dr2[2] = {dr[0].x*dr[0].x + dr[0].y*dr[0].y + dr[0].z*dr[0].z,
dr[1].x*dr[1].x + dr[1].y*dr[1].y + dr[1].z*dr[1].z};
const int crit[2] = {(dr2[0] < 1.0),(dr2[1] < 1.0)};
const float inv_dr[2] = {rsqrt(dr2[0]),rsqrt(dr2[1])};
const float dist_m_one[2] = {inv_dr[0] - 1.0,inv_dr[1] - 1.0};
const float cf[2] = {crit[0] * dist_m_one[0],crit[1] * dist_m_one[1]};
const float3 dF[2] = {{cf[0]*dr[0].x, cf[0]*dr[0].y, cf[0]*dr[0].z},
{cf[1]*dr[1].x, cf[1]*dr[1].y, cf[1]*dr[1].z}};
sumF[0].x += dF[0].x;sumF[0].y += dF[0].y;sumF[0].z += dF[0].z;
sumF[1].x += dF[1].x;sumF[1].y += dF[1].y;sumF[1].z += dF[1].z;
}
F[tid].x = sumF[0].x + sumF[1].x;
F[tid].y = sumF[0].y + sumF[1].y;
F[tid].z = sumF[0].z + sumF[1].z;
}
//4 unroll
__global__ void test_kernel3(const float* __restrict__ x,const float* __restrict__ y,const float* __restrict__ z,float3* F){
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const float3 ri = {x[tid],y[tid],z[tid]};
float3 sumF[4] = {{0.0, 0.0, 0.0},{0.0, 0.0, 0.0},{0.0, 0.0, 0.0},{0.0, 0.0, 0.0}};
for(int i=0; i<SYS_SIZE; i+=4){
const float3 rj[4] = {{x[i],y[i],z[i]},{x[i+1],y[i+1],z[i+1]},{x[i+2],y[i+2],z[i+2]},{x[i+3],y[i+3],z[i+3]}};
const float3 dr[4] = {{ri.x - rj[0].x, ri.y - rj[0].y, ri.z - rj[0].z},
{ri.x - rj[1].x, ri.y - rj[1].y, ri.z - rj[1].z},
{ri.x - rj[2].x, ri.y - rj[2].y, ri.z - rj[2].z},
{ri.x - rj[3].x, ri.y - rj[3].y, ri.z - rj[3].z}};
const float dr2[4] = {dr[0].x*dr[0].x + dr[0].y*dr[0].y + dr[0].z*dr[0].z,
dr[1].x*dr[1].x + dr[1].y*dr[1].y + dr[1].z*dr[1].z,
dr[2].x*dr[2].x + dr[2].y*dr[2].y + dr[2].z*dr[2].z,
dr[3].x*dr[3].x + dr[3].y*dr[3].y + dr[3].z*dr[3].z};
const int crit[4] = {(dr2[0] < 1.0),(dr2[1] < 1.0),(dr2[2] < 1.0),(dr2[3] < 1.0)};
const float inv_dr[4] = {rsqrt(dr2[0]),rsqrt(dr2[1]),rsqrt(dr2[2]),rsqrt(dr2[3])};
const float dist_m_one[4] = {inv_dr[0] - 1.0,inv_dr[1] - 1.0,inv_dr[2] - 1.0,inv_dr[3] - 1.0};
const float cf[4] = {crit[0] * dist_m_one[0],crit[1] * dist_m_one[1],crit[2] * dist_m_one[2],crit[3] * dist_m_one[3]};
const float3 dF[4] = {{cf[0]*dr[0].x, cf[0]*dr[0].y, cf[0]*dr[0].z},
{cf[1]*dr[1].x, cf[1]*dr[1].y, cf[1]*dr[1].z},
{cf[2]*dr[2].x, cf[2]*dr[2].y, cf[2]*dr[2].z},
{cf[3]*dr[3].x, cf[3]*dr[3].y, cf[3]*dr[3].z}};
sumF[0].x += dF[0].x;sumF[0].y += dF[0].y;sumF[0].z += dF[0].z;
sumF[1].x += dF[1].x;sumF[1].y += dF[1].y;sumF[1].z += dF[1].z;
sumF[2].x += dF[2].x;sumF[2].y += dF[2].y;sumF[2].z += dF[2].z;
sumF[3].x += dF[3].x;sumF[3].y += dF[3].y;sumF[3].z += dF[3].z;
}
F[tid].x = sumF[0].x + sumF[1].x + sumF[2].x + sumF[3].x;
F[tid].y = sumF[0].y + sumF[1].y + sumF[2].y + sumF[3].y;
F[tid].z = sumF[0].z + sumF[1].z + sumF[2].z + sumF[3].z;
}
__global__ void test_kernel4(const float4* __restrict__ r,float3* __restrict__ F){
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const float4 ri = r[tid];
float3 sumF = {0.0, 0.0, 0.0};
for(int i=0; i<SYS_SIZE; i++){
const float4 rj = r[i];
const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z};
const float dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
const int crit = (dr2 < 1.0);
const float inv_dr = rsqrt(dr2);
const float dist_m_one = inv_dr - 1.0;
const float cf = crit * dist_m_one;
const float3 dF = {cf * dr.x,
cf * dr.y,
cf * dr.z};
sumF.x += dF.x;
sumF.y += dF.y;
sumF.z += dF.z;
}
F[tid].x = sumF.x;
F[tid].y = sumF.y;
F[tid].z = sumF.z;
}
__global__ void test_kernel5(const float3* __restrict__ r,float3* __restrict__ F){
const int tid = threadIdx.x + blockIdx.x*blockDim.x;
const float3 ri = r[tid];
float3 sumF = {0.0, 0.0, 0.0};
for(int i=0; i<SYS_SIZE; i++){
const float3 rj = r[i];
const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z};
const float dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
const int crit = (dr2 < 1.0);
const float inv_dr = rsqrt(dr2);
const float dist_m_one = inv_dr - 1.0;
const float cf = crit * dist_m_one;
const float3 dF = {cf * dr.x,
cf * dr.y,
cf * dr.z};
sumF.x += dF.x;
sumF.y += dF.y;
sumF.z += dF.z;
}
F[tid].x = sumF.x;
F[tid].y = sumF.y;
F[tid].z = sumF.z;
}
__global__ void test_kernel6(const float4* __restrict__ r,float4* __restrict__ F){
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const float4 ri = r[tid];
float4 sumF = {0.0, 0.0, 0.0};
for(int i=0; i<SYS_SIZE; i++){
const float4 rj = r[i];
const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z};
const float dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
const int crit = (dr2 < 1.0);
const float inv_dr = rsqrt(dr2);
const float dist_m_one = inv_dr - 1.0;
const float cf = crit * dist_m_one;
const float3 dF = {cf * dr.x,
cf * dr.y,
cf * dr.z};
sumF.x += dF.x;
sumF.y += dF.y;
sumF.z += dF.z;
}
F[tid] = sumF;
}
double time_dif(struct timeval &tv1, struct timeval &tv2)
{
double delta_t;
delta_t = tv2.tv_sec - tv1.tv_sec + (tv2.tv_usec - tv1.tv_usec) * 1.0e-6;
return delta_t;
}
int main(void){
struct timeval tv1,tv2;
using namespace std;
cudaSetDevice(0);
float *x_d,*x_h;
float *y_d,*y_h;
float *z_d,*z_h;
float4 *r4_d,*r4_h;
float3 *r3_d,*r3_h;
float3 *F3_d;
float4 *F4_d;
x_h = new float [SYS_SIZE];
y_h = new float [SYS_SIZE];
z_h = new float [SYS_SIZE];
r4_h = new float4 [SYS_SIZE];
r3_h = new float3 [SYS_SIZE];
cudaMalloc(&x_d,SYS_SIZE*sizeof(float));
cudaMalloc(&y_d,SYS_SIZE*sizeof(float));
cudaMalloc(&z_d,SYS_SIZE*sizeof(float));
cudaMalloc(&r4_d,SYS_SIZE*sizeof(float4));
cudaMalloc(&r3_d,SYS_SIZE*sizeof(float3));
cudaMalloc(&F3_d,SYS_SIZE*sizeof(float3));
cudaMalloc(&F4_d,SYS_SIZE*sizeof(float4));
for(int i=0; i<SYS_SIZE; i++){
x_h[i] = Uniform();
y_h[i] = Uniform();
z_h[i] = Uniform();
r4_h[i].x = Uniform();
r4_h[i].y = Uniform();
r4_h[i].z = Uniform();
r4_h[i].w = 0.0;
r3_h[i].x = Uniform();
r3_h[i].y = Uniform();
r3_h[i].z = Uniform();
}
cudaMemcpy(x_d,x_h,SYS_SIZE*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(y_d,y_h,SYS_SIZE*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(z_d,z_h,SYS_SIZE*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(r4_d,r4_h,SYS_SIZE*sizeof(float4),cudaMemcpyHostToDevice);
cudaMemcpy(r3_d,r3_h,SYS_SIZE*sizeof(float3),cudaMemcpyHostToDevice);
gettimeofday(&tv1,NULL);
test_kernel1<<<GRID_SIZE,BLOCK_SIZE>>>(x_d,y_d,z_d,F3_d);
cudaDeviceSynchronize();
gettimeofday(&tv2,NULL);
cout << time_dif(tv1,tv2) << endl;
gettimeofday(&tv1,NULL);
test_kernel2<<<GRID_SIZE,BLOCK_SIZE>>>(x_d,y_d,z_d,F3_d);
cudaDeviceSynchronize();
gettimeofday(&tv2,NULL);
cout << time_dif(tv1,tv2) << endl;
gettimeofday(&tv1,NULL);
test_kernel3<<<GRID_SIZE,BLOCK_SIZE>>>(x_d,y_d,z_d,F3_d);
cudaDeviceSynchronize();
gettimeofday(&tv2,NULL);
cout << time_dif(tv1,tv2) << endl;
gettimeofday(&tv1,NULL);
test_kernel4<<<GRID_SIZE,BLOCK_SIZE>>>(r4_d,F3_d);
cudaDeviceSynchronize();
gettimeofday(&tv2,NULL);
cout << time_dif(tv1,tv2) << endl;
gettimeofday(&tv1,NULL);
test_kernel5<<<GRID_SIZE,BLOCK_SIZE>>>(r3_d,F3_d);
cudaDeviceSynchronize();
gettimeofday(&tv2,NULL);
cout << time_dif(tv1,tv2) << endl;
gettimeofday(&tv1,NULL);
test_kernel6<<<GRID_SIZE,BLOCK_SIZE>>>(r4_d,F4_d);
cudaDeviceSynchronize();
gettimeofday(&tv2,NULL);
cout << time_dif(tv1,tv2) << endl;
delete [] x_h;
delete [] y_h;
delete [] z_h;
delete [] r4_h;
delete [] r3_h;
cudaFree(x_d);
cudaFree(y_d);
cudaFree(z_d);
cudaFree(r4_d);
cudaFree(r3_d);
cudaFree(F3_d);
cudaFree(F4_d);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment