kohnakagawa · August 29, 2015 14:01
diff --git a/test_ilp.cu b/test_ilp.cu
 #include <iostream>
 #include <cstdlib>
 #include <sys/time.h>

 #define BLOCK_SIZE 256
 #define GRID_SIZE 500
 #define SYS_SIZE (BLOCK_SIZE * GRID_SIZE)

 double Uniform(){
  return ((double)rand()+1.0)/((double)RAND_MAX+2.0);
 }

 //no unroll
 __global__ void test_kernel1(const float* __restrict__ x,const float* __restrict__ y,const float* __restrict__ z,float3* F){
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  const float3 ri = {x[tid],y[tid],z[tid]};
  float3 sumF = {0.0, 0.0, 0.0};
  for(int i=0; i<SYS_SIZE; i++){
    const float3 rj = {x[i],y[i],z[i]};
    const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z};
    const float  dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
    const int crit = (dr2 < 1.0);
    const float inv_dr = rsqrt(dr2);
    const float dist_m_one = inv_dr - 1.0;
    const float cf = crit * dist_m_one;
    const float3 dF = {cf * dr.x,
 		       cf * dr.y,
 		       cf * dr.z};
    sumF.x += dF.x;
    sumF.y += dF.y;
    sumF.z += dF.z;
  }
  F[tid].x = sumF.x;
  F[tid].y = sumF.y;
  F[tid].z = sumF.z;
 }

 //2 unroll
 __global__ void test_kernel2(const float* __restrict__ x,const float* __restrict__ y,const float* __restrict__ z,float3* F){
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  const float3 ri = {x[tid],y[tid],z[tid]};
  float3 sumF[2] =  {{0.0, 0.0, 0.0},{0.0, 0.0, 0.0}};
  for(int i=0; i<SYS_SIZE; i+=2){
    const float3 rj[2] = {{x[i],y[i],z[i]},{x[i+1],y[i+1],z[i+1]}};
    const float3 dr[2] = {{ri.x - rj[0].x, ri.y - rj[0].y, ri.z - rj[0].z},
 			  {ri.x - rj[1].x, ri.y - rj[1].y, ri.z - rj[1].z}};
    const float  dr2[2] = {dr[0].x*dr[0].x + dr[0].y*dr[0].y + dr[0].z*dr[0].z,
 			   dr[1].x*dr[1].x + dr[1].y*dr[1].y + dr[1].z*dr[1].z};
    const int crit[2] = {(dr2[0] < 1.0),(dr2[1] < 1.0)};
    const float inv_dr[2] = {rsqrt(dr2[0]),rsqrt(dr2[1])};
    const float dist_m_one[2] = {inv_dr[0] - 1.0,inv_dr[1] - 1.0};
    const float cf[2] = {crit[0] * dist_m_one[0],crit[1] * dist_m_one[1]};
    const float3 dF[2] = {{cf[0]*dr[0].x, cf[0]*dr[0].y, cf[0]*dr[0].z},
 			  {cf[1]*dr[1].x, cf[1]*dr[1].y, cf[1]*dr[1].z}};
    sumF[0].x += dF[0].x;sumF[0].y += dF[0].y;sumF[0].z += dF[0].z;
    sumF[1].x += dF[1].x;sumF[1].y += dF[1].y;sumF[1].z += dF[1].z;
  }
  F[tid].x = sumF[0].x + sumF[1].x;
  F[tid].y = sumF[0].y + sumF[1].y;
  F[tid].z = sumF[0].z + sumF[1].z;
 }

 //4 unroll
 __global__ void test_kernel3(const float* __restrict__ x,const float* __restrict__ y,const float* __restrict__ z,float3* F){
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  const float3 ri = {x[tid],y[tid],z[tid]};
  float3 sumF[4] =  {{0.0, 0.0, 0.0},{0.0, 0.0, 0.0},{0.0, 0.0, 0.0},{0.0, 0.0, 0.0}};
  for(int i=0; i<SYS_SIZE; i+=4){
    const float3 rj[4] = {{x[i],y[i],z[i]},{x[i+1],y[i+1],z[i+1]},{x[i+2],y[i+2],z[i+2]},{x[i+3],y[i+3],z[i+3]}};
    const float3 dr[4] = {{ri.x - rj[0].x, ri.y - rj[0].y, ri.z - rj[0].z},
 			  {ri.x - rj[1].x, ri.y - rj[1].y, ri.z - rj[1].z},
 			  {ri.x - rj[2].x, ri.y - rj[2].y, ri.z - rj[2].z},
 			  {ri.x - rj[3].x, ri.y - rj[3].y, ri.z - rj[3].z}};
    const float  dr2[4] = {dr[0].x*dr[0].x + dr[0].y*dr[0].y + dr[0].z*dr[0].z,
 			   dr[1].x*dr[1].x + dr[1].y*dr[1].y + dr[1].z*dr[1].z,
 			   dr[2].x*dr[2].x + dr[2].y*dr[2].y + dr[2].z*dr[2].z,
 			   dr[3].x*dr[3].x + dr[3].y*dr[3].y + dr[3].z*dr[3].z};
    const int crit[4] = {(dr2[0] < 1.0),(dr2[1] < 1.0),(dr2[2] < 1.0),(dr2[3] < 1.0)};
    const float inv_dr[4] = {rsqrt(dr2[0]),rsqrt(dr2[1]),rsqrt(dr2[2]),rsqrt(dr2[3])};
    const float dist_m_one[4] = {inv_dr[0] - 1.0,inv_dr[1] - 1.0,inv_dr[2] - 1.0,inv_dr[3] - 1.0};
    const float cf[4] = {crit[0] * dist_m_one[0],crit[1] * dist_m_one[1],crit[2] * dist_m_one[2],crit[3] * dist_m_one[3]};
    const float3 dF[4] = {{cf[0]*dr[0].x, cf[0]*dr[0].y, cf[0]*dr[0].z},
 			  {cf[1]*dr[1].x, cf[1]*dr[1].y, cf[1]*dr[1].z},
 			  {cf[2]*dr[2].x, cf[2]*dr[2].y, cf[2]*dr[2].z},
 			  {cf[3]*dr[3].x, cf[3]*dr[3].y, cf[3]*dr[3].z}};
    sumF[0].x += dF[0].x;sumF[0].y += dF[0].y;sumF[0].z += dF[0].z;
    sumF[1].x += dF[1].x;sumF[1].y += dF[1].y;sumF[1].z += dF[1].z;
    sumF[2].x += dF[2].x;sumF[2].y += dF[2].y;sumF[2].z += dF[2].z;
    sumF[3].x += dF[3].x;sumF[3].y += dF[3].y;sumF[3].z += dF[3].z;
  }
  F[tid].x = sumF[0].x + sumF[1].x + sumF[2].x + sumF[3].x;
  F[tid].y = sumF[0].y + sumF[1].y + sumF[2].y + sumF[3].y;
  F[tid].z = sumF[0].z + sumF[1].z + sumF[2].z + sumF[3].z;
 }

 __global__ void test_kernel4(const float4* __restrict__ r,float3* __restrict__ F){
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  const float4 ri = r[tid];
  float3 sumF = {0.0, 0.0, 0.0};
  for(int i=0; i<SYS_SIZE; i++){
    const float4 rj = r[i];
    const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z};
    const float  dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
    const int  crit = (dr2 < 1.0);
    const float inv_dr = rsqrt(dr2);
    const float dist_m_one = inv_dr - 1.0;
    const float cf = crit * dist_m_one;
    const float3 dF = {cf * dr.x,
 		       cf * dr.y,
 		       cf * dr.z};
    sumF.x += dF.x;
    sumF.y += dF.y;
    sumF.z += dF.z;
  }
  F[tid].x = sumF.x;
  F[tid].y = sumF.y;
  F[tid].z = sumF.z;
 }

 __global__ void test_kernel5(const float3* __restrict__ r,float3* __restrict__ F){
  const int tid = threadIdx.x + blockIdx.x*blockDim.x;
  const float3 ri = r[tid];
  float3 sumF = {0.0, 0.0, 0.0};
  for(int i=0; i<SYS_SIZE; i++){
    const float3 rj = r[i];
    const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z};
    const float  dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
    const int  crit = (dr2 < 1.0);
    const float inv_dr = rsqrt(dr2);
    const float dist_m_one = inv_dr - 1.0;
    const float cf = crit * dist_m_one;
    const float3 dF = {cf * dr.x,
 		       cf * dr.y,
 		       cf * dr.z};
    sumF.x += dF.x;
    sumF.y += dF.y;
    sumF.z += dF.z;
  }
  F[tid].x = sumF.x;
  F[tid].y = sumF.y;
  F[tid].z = sumF.z;
 }

 __global__ void test_kernel6(const float4* __restrict__ r,float4* __restrict__ F){
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  const float4 ri = r[tid];
  float4 sumF = {0.0, 0.0, 0.0};
  for(int i=0; i<SYS_SIZE; i++){
    const float4 rj = r[i];
    const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z};
    const float  dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
    const int  crit = (dr2 < 1.0);
    const float inv_dr = rsqrt(dr2);
    const float dist_m_one = inv_dr - 1.0;
    const float cf = crit * dist_m_one;
    const float3 dF = {cf * dr.x,
 		       cf * dr.y,
 		       cf * dr.z};
    sumF.x += dF.x;
    sumF.y += dF.y;
    sumF.z += dF.z;
  }
  F[tid] = sumF;
 }

 double time_dif(struct timeval &tv1, struct timeval &tv2)
 {
  double delta_t;
  delta_t = tv2.tv_sec - tv1.tv_sec + (tv2.tv_usec - tv1.tv_usec) * 1.0e-6;
  return delta_t;
 }

 int main(void){
  struct timeval tv1,tv2;
  using namespace std;
  cudaSetDevice(0);

  float *x_d,*x_h;
  float *y_d,*y_h;
  float *z_d,*z_h;
  float4 *r4_d,*r4_h;
  float3 *r3_d,*r3_h;
  
  float3 *F3_d;
  float4 *F4_d;
  x_h = new float [SYS_SIZE];
  y_h = new float [SYS_SIZE];
  z_h = new float [SYS_SIZE];
  r4_h = new float4 [SYS_SIZE];
  r3_h = new float3 [SYS_SIZE];
  
  cudaMalloc(&x_d,SYS_SIZE*sizeof(float));
  cudaMalloc(&y_d,SYS_SIZE*sizeof(float));
  cudaMalloc(&z_d,SYS_SIZE*sizeof(float));
  cudaMalloc(&r4_d,SYS_SIZE*sizeof(float4));
  cudaMalloc(&r3_d,SYS_SIZE*sizeof(float3));
  
  cudaMalloc(&F3_d,SYS_SIZE*sizeof(float3));
  cudaMalloc(&F4_d,SYS_SIZE*sizeof(float4));
  
  for(int i=0; i<SYS_SIZE; i++){
    x_h[i] = Uniform();
    y_h[i] = Uniform();
    z_h[i] = Uniform();
    r4_h[i].x = Uniform();
    r4_h[i].y = Uniform();
    r4_h[i].z = Uniform();
    r4_h[i].w = 0.0;
    r3_h[i].x = Uniform();
    r3_h[i].y = Uniform();
    r3_h[i].z = Uniform();
  }
  
  cudaMemcpy(x_d,x_h,SYS_SIZE*sizeof(float),cudaMemcpyHostToDevice);
  cudaMemcpy(y_d,y_h,SYS_SIZE*sizeof(float),cudaMemcpyHostToDevice);
  cudaMemcpy(z_d,z_h,SYS_SIZE*sizeof(float),cudaMemcpyHostToDevice);
  cudaMemcpy(r4_d,r4_h,SYS_SIZE*sizeof(float4),cudaMemcpyHostToDevice);
  cudaMemcpy(r3_d,r3_h,SYS_SIZE*sizeof(float3),cudaMemcpyHostToDevice);
  
  gettimeofday(&tv1,NULL);
  test_kernel1<<<GRID_SIZE,BLOCK_SIZE>>>(x_d,y_d,z_d,F3_d);
  cudaDeviceSynchronize();
  gettimeofday(&tv2,NULL);

  cout << time_dif(tv1,tv2) << endl;

  gettimeofday(&tv1,NULL);
  test_kernel2<<<GRID_SIZE,BLOCK_SIZE>>>(x_d,y_d,z_d,F3_d);
  cudaDeviceSynchronize();
  gettimeofday(&tv2,NULL);
  
  cout << time_dif(tv1,tv2) << endl;
  
  gettimeofday(&tv1,NULL);
  test_kernel3<<<GRID_SIZE,BLOCK_SIZE>>>(x_d,y_d,z_d,F3_d);
  cudaDeviceSynchronize();
  gettimeofday(&tv2,NULL);
  
  cout << time_dif(tv1,tv2) << endl;

  gettimeofday(&tv1,NULL);
  test_kernel4<<<GRID_SIZE,BLOCK_SIZE>>>(r4_d,F3_d);
  cudaDeviceSynchronize();
  gettimeofday(&tv2,NULL);
  
  cout << time_dif(tv1,tv2) << endl;

  gettimeofday(&tv1,NULL);
  test_kernel5<<<GRID_SIZE,BLOCK_SIZE>>>(r3_d,F3_d);
  cudaDeviceSynchronize();
  gettimeofday(&tv2,NULL);
  
  cout << time_dif(tv1,tv2) << endl;

  gettimeofday(&tv1,NULL);
  test_kernel6<<<GRID_SIZE,BLOCK_SIZE>>>(r4_d,F4_d);
  cudaDeviceSynchronize();
  gettimeofday(&tv2,NULL);
  
  cout << time_dif(tv1,tv2) << endl;
  
  delete [] x_h;
  delete [] y_h;
  delete [] z_h;
  delete [] r4_h;
  delete [] r3_h;
  cudaFree(x_d);
  cudaFree(y_d);
  cudaFree(z_d);
  cudaFree(r4_d);
  cudaFree(r3_d);
  cudaFree(F3_d);
  cudaFree(F4_d);
  
  return 0;
 }
	#include <iostream>
	#include <cstdlib>
	#include <sys/time.h>

	#define BLOCK_SIZE 256
	#define GRID_SIZE 500
	#define SYS_SIZE (BLOCK_SIZE * GRID_SIZE)

	double Uniform(){
	return ((double)rand()+1.0)/((double)RAND_MAX+2.0);
	}

	//no unroll
	__global__ void test_kernel1(const float* __restrict__ x,const float* __restrict__ y,const float* __restrict__ z,float3* F){
	const int tid = threadIdx.x + blockIdx.x * blockDim.x;
	const float3 ri = {x[tid],y[tid],z[tid]};
	float3 sumF = {0.0, 0.0, 0.0};
	for(int i=0; i<SYS_SIZE; i++){
	const float3 rj = {x[i],y[i],z[i]};
	const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z};
	const float dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
	const int crit = (dr2 < 1.0);
	const float inv_dr = rsqrt(dr2);
	const float dist_m_one = inv_dr - 1.0;
	const float cf = crit * dist_m_one;
	const float3 dF = {cf * dr.x,
	cf * dr.y,
	cf * dr.z};
	sumF.x += dF.x;
	sumF.y += dF.y;
	sumF.z += dF.z;
	}
	F[tid].x = sumF.x;
	F[tid].y = sumF.y;
	F[tid].z = sumF.z;
	}

	//2 unroll
	__global__ void test_kernel2(const float* __restrict__ x,const float* __restrict__ y,const float* __restrict__ z,float3* F){
	const int tid = threadIdx.x + blockIdx.x * blockDim.x;
	const float3 ri = {x[tid],y[tid],z[tid]};
	float3 sumF[2] = {{0.0, 0.0, 0.0},{0.0, 0.0, 0.0}};
	for(int i=0; i<SYS_SIZE; i+=2){
	const float3 rj[2] = {{x[i],y[i],z[i]},{x[i+1],y[i+1],z[i+1]}};
	const float3 dr[2] = {{ri.x - rj[0].x, ri.y - rj[0].y, ri.z - rj[0].z},
	{ri.x - rj[1].x, ri.y - rj[1].y, ri.z - rj[1].z}};
	const float dr2[2] = {dr[0].xdr[0].x + dr[0].ydr[0].y + dr[0].z*dr[0].z,
	dr[1].xdr[1].x + dr[1].ydr[1].y + dr[1].z*dr[1].z};
	const int crit[2] = {(dr2[0] < 1.0),(dr2[1] < 1.0)};
	const float inv_dr[2] = {rsqrt(dr2[0]),rsqrt(dr2[1])};
	const float dist_m_one[2] = {inv_dr[0] - 1.0,inv_dr[1] - 1.0};
	const float cf[2] = {crit[0] * dist_m_one[0],crit[1] * dist_m_one[1]};
	const float3 dF[2] = {{cf[0]dr[0].x, cf[0]dr[0].y, cf[0]*dr[0].z},
	{cf[1]dr[1].x, cf[1]dr[1].y, cf[1]*dr[1].z}};
	sumF[0].x += dF[0].x;sumF[0].y += dF[0].y;sumF[0].z += dF[0].z;
	sumF[1].x += dF[1].x;sumF[1].y += dF[1].y;sumF[1].z += dF[1].z;
	}
	F[tid].x = sumF[0].x + sumF[1].x;
	F[tid].y = sumF[0].y + sumF[1].y;
	F[tid].z = sumF[0].z + sumF[1].z;
	}

	//4 unroll
	__global__ void test_kernel3(const float* __restrict__ x,const float* __restrict__ y,const float* __restrict__ z,float3* F){
	const int tid = threadIdx.x + blockIdx.x * blockDim.x;
	const float3 ri = {x[tid],y[tid],z[tid]};
	float3 sumF[4] = {{0.0, 0.0, 0.0},{0.0, 0.0, 0.0},{0.0, 0.0, 0.0},{0.0, 0.0, 0.0}};
	for(int i=0; i<SYS_SIZE; i+=4){
	const float3 rj[4] = {{x[i],y[i],z[i]},{x[i+1],y[i+1],z[i+1]},{x[i+2],y[i+2],z[i+2]},{x[i+3],y[i+3],z[i+3]}};
	const float3 dr[4] = {{ri.x - rj[0].x, ri.y - rj[0].y, ri.z - rj[0].z},
	{ri.x - rj[1].x, ri.y - rj[1].y, ri.z - rj[1].z},
	{ri.x - rj[2].x, ri.y - rj[2].y, ri.z - rj[2].z},
	{ri.x - rj[3].x, ri.y - rj[3].y, ri.z - rj[3].z}};
	const float dr2[4] = {dr[0].xdr[0].x + dr[0].ydr[0].y + dr[0].z*dr[0].z,
	dr[1].xdr[1].x + dr[1].ydr[1].y + dr[1].z*dr[1].z,
	dr[2].xdr[2].x + dr[2].ydr[2].y + dr[2].z*dr[2].z,
	dr[3].xdr[3].x + dr[3].ydr[3].y + dr[3].z*dr[3].z};
	const int crit[4] = {(dr2[0] < 1.0),(dr2[1] < 1.0),(dr2[2] < 1.0),(dr2[3] < 1.0)};
	const float inv_dr[4] = {rsqrt(dr2[0]),rsqrt(dr2[1]),rsqrt(dr2[2]),rsqrt(dr2[3])};
	const float dist_m_one[4] = {inv_dr[0] - 1.0,inv_dr[1] - 1.0,inv_dr[2] - 1.0,inv_dr[3] - 1.0};
	const float cf[4] = {crit[0] * dist_m_one[0],crit[1] * dist_m_one[1],crit[2] * dist_m_one[2],crit[3] * dist_m_one[3]};
	const float3 dF[4] = {{cf[0]dr[0].x, cf[0]dr[0].y, cf[0]*dr[0].z},
	{cf[1]dr[1].x, cf[1]dr[1].y, cf[1]*dr[1].z},
	{cf[2]dr[2].x, cf[2]dr[2].y, cf[2]*dr[2].z},
	{cf[3]dr[3].x, cf[3]dr[3].y, cf[3]*dr[3].z}};
	sumF[0].x += dF[0].x;sumF[0].y += dF[0].y;sumF[0].z += dF[0].z;
	sumF[1].x += dF[1].x;sumF[1].y += dF[1].y;sumF[1].z += dF[1].z;
	sumF[2].x += dF[2].x;sumF[2].y += dF[2].y;sumF[2].z += dF[2].z;
	sumF[3].x += dF[3].x;sumF[3].y += dF[3].y;sumF[3].z += dF[3].z;
	}
	F[tid].x = sumF[0].x + sumF[1].x + sumF[2].x + sumF[3].x;
	F[tid].y = sumF[0].y + sumF[1].y + sumF[2].y + sumF[3].y;
	F[tid].z = sumF[0].z + sumF[1].z + sumF[2].z + sumF[3].z;
	}

	__global__ void test_kernel4(const float4* __restrict__ r,float3* __restrict__ F){
	const int tid = threadIdx.x + blockIdx.x * blockDim.x;
	const float4 ri = r[tid];
	float3 sumF = {0.0, 0.0, 0.0};
	for(int i=0; i<SYS_SIZE; i++){
	const float4 rj = r[i];
	const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z};
	const float dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
	const int crit = (dr2 < 1.0);
	const float inv_dr = rsqrt(dr2);
	const float dist_m_one = inv_dr - 1.0;
	const float cf = crit * dist_m_one;
	const float3 dF = {cf * dr.x,
	cf * dr.y,
	cf * dr.z};
	sumF.x += dF.x;
	sumF.y += dF.y;
	sumF.z += dF.z;
	}
	F[tid].x = sumF.x;
	F[tid].y = sumF.y;
	F[tid].z = sumF.z;
	}

	__global__ void test_kernel5(const float3* __restrict__ r,float3* __restrict__ F){
	const int tid = threadIdx.x + blockIdx.x*blockDim.x;
	const float3 ri = r[tid];
	float3 sumF = {0.0, 0.0, 0.0};
	for(int i=0; i<SYS_SIZE; i++){
	const float3 rj = r[i];
	const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z};
	const float dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
	const int crit = (dr2 < 1.0);
	const float inv_dr = rsqrt(dr2);
	const float dist_m_one = inv_dr - 1.0;
	const float cf = crit * dist_m_one;
	const float3 dF = {cf * dr.x,
	cf * dr.y,
	cf * dr.z};
	sumF.x += dF.x;
	sumF.y += dF.y;
	sumF.z += dF.z;
	}
	F[tid].x = sumF.x;
	F[tid].y = sumF.y;
	F[tid].z = sumF.z;
	}

	__global__ void test_kernel6(const float4* __restrict__ r,float4* __restrict__ F){
	const int tid = threadIdx.x + blockIdx.x * blockDim.x;
	const float4 ri = r[tid];
	float4 sumF = {0.0, 0.0, 0.0};
	for(int i=0; i<SYS_SIZE; i++){
	const float4 rj = r[i];
	const float3 dr = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z};
	const float dr2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
	const int crit = (dr2 < 1.0);
	const float inv_dr = rsqrt(dr2);
	const float dist_m_one = inv_dr - 1.0;
	const float cf = crit * dist_m_one;
	const float3 dF = {cf * dr.x,
	cf * dr.y,
	cf * dr.z};
	sumF.x += dF.x;
	sumF.y += dF.y;
	sumF.z += dF.z;
	}
	F[tid] = sumF;
	}

	double time_dif(struct timeval &tv1, struct timeval &tv2)
	{
	double delta_t;
	delta_t = tv2.tv_sec - tv1.tv_sec + (tv2.tv_usec - tv1.tv_usec) * 1.0e-6;
	return delta_t;
	}

	int main(void){
	struct timeval tv1,tv2;
	using namespace std;
	cudaSetDevice(0);

	float x_d,x_h;
	float y_d,y_h;
	float z_d,z_h;
	float4 r4_d,r4_h;
	float3 r3_d,r3_h;

	float3 *F3_d;
	float4 *F4_d;
	x_h = new float [SYS_SIZE];
	y_h = new float [SYS_SIZE];
	z_h = new float [SYS_SIZE];
	r4_h = new float4 [SYS_SIZE];
	r3_h = new float3 [SYS_SIZE];

	cudaMalloc(&x_d,SYS_SIZE*sizeof(float));
	cudaMalloc(&y_d,SYS_SIZE*sizeof(float));
	cudaMalloc(&z_d,SYS_SIZE*sizeof(float));
	cudaMalloc(&r4_d,SYS_SIZE*sizeof(float4));
	cudaMalloc(&r3_d,SYS_SIZE*sizeof(float3));

	cudaMalloc(&F3_d,SYS_SIZE*sizeof(float3));
	cudaMalloc(&F4_d,SYS_SIZE*sizeof(float4));

	for(int i=0; i<SYS_SIZE; i++){
	x_h[i] = Uniform();
	y_h[i] = Uniform();
	z_h[i] = Uniform();
	r4_h[i].x = Uniform();
	r4_h[i].y = Uniform();
	r4_h[i].z = Uniform();
	r4_h[i].w = 0.0;
	r3_h[i].x = Uniform();
	r3_h[i].y = Uniform();
	r3_h[i].z = Uniform();
	}

	cudaMemcpy(x_d,x_h,SYS_SIZE*sizeof(float),cudaMemcpyHostToDevice);
	cudaMemcpy(y_d,y_h,SYS_SIZE*sizeof(float),cudaMemcpyHostToDevice);
	cudaMemcpy(z_d,z_h,SYS_SIZE*sizeof(float),cudaMemcpyHostToDevice);
	cudaMemcpy(r4_d,r4_h,SYS_SIZE*sizeof(float4),cudaMemcpyHostToDevice);
	cudaMemcpy(r3_d,r3_h,SYS_SIZE*sizeof(float3),cudaMemcpyHostToDevice);

	gettimeofday(&tv1,NULL);
	test_kernel1<<<GRID_SIZE,BLOCK_SIZE>>>(x_d,y_d,z_d,F3_d);
	cudaDeviceSynchronize();
	gettimeofday(&tv2,NULL);

	cout << time_dif(tv1,tv2) << endl;

	gettimeofday(&tv1,NULL);
	test_kernel2<<<GRID_SIZE,BLOCK_SIZE>>>(x_d,y_d,z_d,F3_d);
	cudaDeviceSynchronize();
	gettimeofday(&tv2,NULL);

	cout << time_dif(tv1,tv2) << endl;

	gettimeofday(&tv1,NULL);
	test_kernel3<<<GRID_SIZE,BLOCK_SIZE>>>(x_d,y_d,z_d,F3_d);
	cudaDeviceSynchronize();
	gettimeofday(&tv2,NULL);

	cout << time_dif(tv1,tv2) << endl;

	gettimeofday(&tv1,NULL);
	test_kernel4<<<GRID_SIZE,BLOCK_SIZE>>>(r4_d,F3_d);
	cudaDeviceSynchronize();
	gettimeofday(&tv2,NULL);

	cout << time_dif(tv1,tv2) << endl;

	gettimeofday(&tv1,NULL);
	test_kernel5<<<GRID_SIZE,BLOCK_SIZE>>>(r3_d,F3_d);
	cudaDeviceSynchronize();
	gettimeofday(&tv2,NULL);

	cout << time_dif(tv1,tv2) << endl;

	gettimeofday(&tv1,NULL);
	test_kernel6<<<GRID_SIZE,BLOCK_SIZE>>>(r4_d,F4_d);
	cudaDeviceSynchronize();
	gettimeofday(&tv2,NULL);

	cout << time_dif(tv1,tv2) << endl;

	delete [] x_h;
	delete [] y_h;
	delete [] z_h;
	delete [] r4_h;
	delete [] r3_h;
	cudaFree(x_d);
	cudaFree(y_d);
	cudaFree(z_d);
	cudaFree(r4_d);
	cudaFree(r3_d);
	cudaFree(F3_d);
	cudaFree(F4_d);

	return 0;
	}