oxyflour · October 4, 2020 13:52
diff --git a/cuda_utils.h b/cuda_utils.h
 #include <stdio.h>
 #include <stdlib.h>

 #include "cuda_runtime.h"
 #include "cuda_fp16.h"

 // https://stackoverflow.com/a/27992604
 #ifdef __INTELLISENSE__
 typedef struct xyz {
    int x;
    int y;
    int z;
 } xyz;
 xyz blockIdx;
 xyz blockDim;
 xyz threadIdx;
 xyz gridDim;
 #define CU_DIM(grid, block)
 #define CU_DIM_MEM(grid, block, bytes)
 #define CU_DIM_MEM_STREAM(grid, block, bytes, stream)
 #else
 #define CU_DIM(grid, block) <<<grid, block>>>
 #define CU_DIM_MEM(grid, block, bytes) <<<grid, block, bytes>>>
 #define CU_DIM_MEM_STREAM(grid, block, bytes, stream) <<<grid, block, bytes, stream>>>
 #endif

 #define CU_ASSERT(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
 inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
 {
    if (code != cudaSuccess) {
        fprintf(stderr, "CUDA ERR: %s %s:%d\n", cudaGetErrorString(code), file, line);
        if (abort) {
            exit(code);
        }
    }
 }

 #define cuIdx(D) (threadIdx.D + blockIdx.D * blockDim.D)
 #define cuDim(D) (blockDim.D * gridDim.D)

 template <typename T> T *malloc_device(size_t sz) {
    T* out = NULL;
    if (sz > 0) {
        CU_ASSERT(cudaMalloc(&out, sizeof(T) * sz));
    }
    return out;
 }

 template <typename T> T *to_device(const T *in, size_t sz) {
    T* out = NULL;
    if (sz > 0) {
        CU_ASSERT(cudaMalloc(&out, sizeof(T) * sz));
        CU_ASSERT(cudaMemcpy(out, in, sizeof(T) * sz, cudaMemcpyDefault));
    }
    return out;
 }
diff --git a/main.cu b/main.cu
 #include "cuda_utils.h"
 #include <time.h>

 constexpr int NX = 96, NY = 160, NZ = 160, NXY = NX * NY, NXYZ = NXY * NZ, NT = 500;
 #define PERF(stat, NXYZ, NT) { \
    auto start = clock(); \
    stat; \
    float total_time = ((float) (clock() - start) / CLOCKS_PER_SEC); \
    float step_time = total_time / (NT); \
    float perf = (NXYZ) / step_time / 1e6; \
    printf("total time: %f s, step time: %f s, perf %f MCells/s\n", total_time, step_time, perf); \
 }

 class Chunk {
    __device__ __forceinline__ int get_idx(int i, int j, int k) {
        return i + j * NX + k * NXY;
    }
    __device__ __forceinline__ void get_ijk(int g, int &i, int &j, int &k) {
        k = g / NXY;
        g -= k * NXY;
        j = g / NX;
        g -= j * NX;
        i = g;
    }
 public:
 #ifdef USE_DYNAMIC_IDX
    int NX, NY, NZ, NXY, NXYZ;
 #endif
 #ifdef USE_DYNAMIC_MEM
    float *Ex, *Ey, *Ez, *Hx, *Hy, *Hz,
        *LEx, *LEy, *LEz, *LHx, *LHy, *LHz,
        *REx, *REy, *REz, *RHx, *RHy, *RHz;
 #else
    float Ex[NXYZ], Ey[NXYZ], Ez[NXYZ], Hx[NXYZ], Hy[NXYZ], Hz[NXYZ];
    float LEx[NXYZ], LEy[NXYZ], LEz[NXYZ], LHx[NXYZ], LHy[NXYZ], LHz[NXYZ];
    float REx[NXYZ], REy[NXYZ], REz[NXYZ], RHx[NXYZ], RHy[NXYZ], RHz[NXYZ];
 #endif
    __device__ __forceinline__ void step_h() {
        int i, j, k;
        for (auto g = cuIdx(x); g < NXYZ; g += cuDim(x)) {
            get_ijk(g, i, j, k);
            if (i > 0 && j > 0 && k > 0 && i < NX - 1 && j < NY - 1 && k < NZ - 1) {
                Hx[g] = LHx[g] * Hx[g] + RHx[g] * (Ey[get_idx(i, j, k+1)] - Ey[g] - Ez[get_idx(i, j+1, k)] + Ez[g]);
                Hy[g] = LHy[g] * Hy[g] + RHy[g] * (Ez[get_idx(i+1, j, k)] - Ez[g] - Ex[get_idx(i, j, k+1)] + Ex[g]);
                Hz[g] = LHz[g] * Hz[g] + RHz[g] * (Ex[get_idx(i, j+1, k)] - Ex[g] - Ey[get_idx(i+1, j, k)] + Ey[g]);
            }
        }
    }
    __device__ __forceinline__ void step_e() {
        int i, j, k;
        for (auto g = cuIdx(x); g < NXYZ; g += cuDim(x)) {
            get_ijk(g, i, j, k);
            if (i > 0 && j > 0 && k > 0 && i < NX - 1 && j < NY - 1 && k < NZ - 1) {
                Ex[g] = LEx[g] * Ex[g] + REx[g] * (Hy[get_idx(i, j, k-1)] - Hy[g] - Hz[get_idx(i, j-1, k)] + Hz[g]);
                Ey[g] = LEy[g] * Ey[g] + REy[g] * (Hz[get_idx(i-1, j, k)] - Hz[g] - Hx[get_idx(i, j, k-1)] + Hx[g]);
                Ez[g] = LEz[g] * Ez[g] + REz[g] * (Hx[get_idx(i, j-1, k)] - Hx[g] - Hy[get_idx(i-1, j, k)] + Hy[g]);
            }
        }
    }
 };

 __device__ Chunk chunk;
 __global__ void kernel_init(int nx, int ny, int nz,
    float *Ex, float *Ey, float *Ez, float *Hx, float *Hy, float *Hz,
    float *LEx, float *LEy, float *LEz, float *LHx, float *LHy, float *LHz,
    float *REx, float *REy, float *REz, float *RHx, float *RHy, float *RHz) {
 #ifdef USE_DYNAMIC_IDX
    chunk.NX = nx; chunk.NY = ny; chunk.NZ = nz;
    chunk.NXY = nx * ny; chunk.NXYZ = nx * ny * nz;
 #endif
 #ifdef USE_DYNAMIC_MEM
    chunk.Ex = Ex; chunk.Ey = Ey; chunk.Ez = Ez; chunk.Hx = Hx; chunk.Hy = Hy; chunk.Hz = Hz;
    chunk.LEx = LEx; chunk.LEy = LEy; chunk.LEz = LEz; chunk.LHx = LHx; chunk.LHy = LHy; chunk.LHz = LHz;
    chunk.REx = REx; chunk.REy = REy; chunk.REz = REz; chunk.RHx = RHx; chunk.RHy = RHy; chunk.RHz = RHz;
 #endif
 }
 __global__ void kernel_step_h() {
    chunk.step_h();
 }
 __global__ void kernel_step_e() {
    chunk.step_e();
 }

 int main() {
    kernel_init CU_DIM(1, 1) (NX, NY, NZ,
        malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ),
        malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ),
        malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ),
        malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ),
        malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ),
        malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ));
    CU_ASSERT(cudaGetLastError());
    cudaDeviceSynchronize();

    PERF({
        for (int t = 0; t < NT; t ++) {
            kernel_step_h CU_DIM(2048, 512) ();
            kernel_step_e CU_DIM(2048, 512) ();
        }
        cudaDeviceSynchronize();
    }, NXYZ, NT);
    return 0;
 }
	#include <stdio.h>
	#include <stdlib.h>

	#include "cuda_runtime.h"
	#include "cuda_fp16.h"

	// https://stackoverflow.com/a/27992604
	#ifdef __INTELLISENSE__
	typedef struct xyz {
	int x;
	int y;
	int z;
	} xyz;
	xyz blockIdx;
	xyz blockDim;
	xyz threadIdx;
	xyz gridDim;
	#define CU_DIM(grid, block)
	#define CU_DIM_MEM(grid, block, bytes)
	#define CU_DIM_MEM_STREAM(grid, block, bytes, stream)
	#else
	#define CU_DIM(grid, block) <<<grid, block>>>
	#define CU_DIM_MEM(grid, block, bytes) <<<grid, block, bytes>>>
	#define CU_DIM_MEM_STREAM(grid, block, bytes, stream) <<<grid, block, bytes, stream>>>
	#endif

	#define CU_ASSERT(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
	inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
	{
	if (code != cudaSuccess) {
	fprintf(stderr, "CUDA ERR: %s %s:%d\n", cudaGetErrorString(code), file, line);
	if (abort) {
	exit(code);
	}
	}
	}

	#define cuIdx(D) (threadIdx.D + blockIdx.D * blockDim.D)
	#define cuDim(D) (blockDim.D * gridDim.D)

	template <typename T> T *malloc_device(size_t sz) {
	T* out = NULL;
	if (sz > 0) {
	CU_ASSERT(cudaMalloc(&out, sizeof(T) * sz));
	}
	return out;
	}

	template <typename T> T to_device(const T in, size_t sz) {
	T* out = NULL;
	if (sz > 0) {
	CU_ASSERT(cudaMalloc(&out, sizeof(T) * sz));
	CU_ASSERT(cudaMemcpy(out, in, sizeof(T) * sz, cudaMemcpyDefault));
	}
	return out;
	}
	#include "cuda_utils.h"
	#include <time.h>

	constexpr int NX = 96, NY = 160, NZ = 160, NXY = NX * NY, NXYZ = NXY * NZ, NT = 500;
	#define PERF(stat, NXYZ, NT) { \
	auto start = clock(); \
	stat; \
	float total_time = ((float) (clock() - start) / CLOCKS_PER_SEC); \
	float step_time = total_time / (NT); \
	float perf = (NXYZ) / step_time / 1e6; \
	printf("total time: %f s, step time: %f s, perf %f MCells/s\n", total_time, step_time, perf); \
	}

	class Chunk {
	__device__ __forceinline__ int get_idx(int i, int j, int k) {
	return i + j * NX + k * NXY;
	}
	__device__ __forceinline__ void get_ijk(int g, int &i, int &j, int &k) {
	k = g / NXY;
	g -= k * NXY;
	j = g / NX;
	g -= j * NX;
	i = g;
	}
	public:
	#ifdef USE_DYNAMIC_IDX
	int NX, NY, NZ, NXY, NXYZ;
	#endif
	#ifdef USE_DYNAMIC_MEM
	float Ex, Ey, Ez, Hx, Hy, Hz,
	LEx, LEy, LEz, LHx, LHy, LHz,
	REx, REy, REz, RHx, RHy, RHz;
	#else
	float Ex[NXYZ], Ey[NXYZ], Ez[NXYZ], Hx[NXYZ], Hy[NXYZ], Hz[NXYZ];
	float LEx[NXYZ], LEy[NXYZ], LEz[NXYZ], LHx[NXYZ], LHy[NXYZ], LHz[NXYZ];
	float REx[NXYZ], REy[NXYZ], REz[NXYZ], RHx[NXYZ], RHy[NXYZ], RHz[NXYZ];
	#endif
	__device__ __forceinline__ void step_h() {
	int i, j, k;
	for (auto g = cuIdx(x); g < NXYZ; g += cuDim(x)) {
	get_ijk(g, i, j, k);
	if (i > 0 && j > 0 && k > 0 && i < NX - 1 && j < NY - 1 && k < NZ - 1) {
	Hx[g] = LHx[g] * Hx[g] + RHx[g] * (Ey[get_idx(i, j, k+1)] - Ey[g] - Ez[get_idx(i, j+1, k)] + Ez[g]);
	Hy[g] = LHy[g] * Hy[g] + RHy[g] * (Ez[get_idx(i+1, j, k)] - Ez[g] - Ex[get_idx(i, j, k+1)] + Ex[g]);
	Hz[g] = LHz[g] * Hz[g] + RHz[g] * (Ex[get_idx(i, j+1, k)] - Ex[g] - Ey[get_idx(i+1, j, k)] + Ey[g]);
	}
	}
	}
	__device__ __forceinline__ void step_e() {
	int i, j, k;
	for (auto g = cuIdx(x); g < NXYZ; g += cuDim(x)) {
	get_ijk(g, i, j, k);
	if (i > 0 && j > 0 && k > 0 && i < NX - 1 && j < NY - 1 && k < NZ - 1) {
	Ex[g] = LEx[g] * Ex[g] + REx[g] * (Hy[get_idx(i, j, k-1)] - Hy[g] - Hz[get_idx(i, j-1, k)] + Hz[g]);
	Ey[g] = LEy[g] * Ey[g] + REy[g] * (Hz[get_idx(i-1, j, k)] - Hz[g] - Hx[get_idx(i, j, k-1)] + Hx[g]);
	Ez[g] = LEz[g] * Ez[g] + REz[g] * (Hx[get_idx(i, j-1, k)] - Hx[g] - Hy[get_idx(i-1, j, k)] + Hy[g]);
	}
	}
	}
	};

	__device__ Chunk chunk;
	__global__ void kernel_init(int nx, int ny, int nz,
	float Ex, float Ey, float Ez, float Hx, float Hy, float Hz,
	float LEx, float LEy, float LEz, float LHx, float LHy, float LHz,
	float REx, float REy, float REz, float RHx, float RHy, float RHz) {
	#ifdef USE_DYNAMIC_IDX
	chunk.NX = nx; chunk.NY = ny; chunk.NZ = nz;
	chunk.NXY = nx * ny; chunk.NXYZ = nx * ny * nz;
	#endif
	#ifdef USE_DYNAMIC_MEM
	chunk.Ex = Ex; chunk.Ey = Ey; chunk.Ez = Ez; chunk.Hx = Hx; chunk.Hy = Hy; chunk.Hz = Hz;
	chunk.LEx = LEx; chunk.LEy = LEy; chunk.LEz = LEz; chunk.LHx = LHx; chunk.LHy = LHy; chunk.LHz = LHz;
	chunk.REx = REx; chunk.REy = REy; chunk.REz = REz; chunk.RHx = RHx; chunk.RHy = RHy; chunk.RHz = RHz;
	#endif
	}
	__global__ void kernel_step_h() {
	chunk.step_h();
	}
	__global__ void kernel_step_e() {
	chunk.step_e();
	}

	int main() {
	kernel_init CU_DIM(1, 1) (NX, NY, NZ,
	malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ),
	malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ),
	malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ),
	malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ),
	malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ),
	malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ));
	CU_ASSERT(cudaGetLastError());
	cudaDeviceSynchronize();

	PERF({
	for (int t = 0; t < NT; t ++) {
	kernel_step_h CU_DIM(2048, 512) ();
	kernel_step_e CU_DIM(2048, 512) ();
	}
	cudaDeviceSynchronize();
	}, NXYZ, NT);
	return 0;
	}