Last active
October 4, 2020 13:52
-
-
Save oxyflour/47d4ea08606f95f506c16d9c7219e82a to your computer and use it in GitHub Desktop.
fit cu
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include "cuda_runtime.h" | |
#include "cuda_fp16.h" | |
// https://stackoverflow.com/a/27992604 | |
#ifdef __INTELLISENSE__ | |
typedef struct xyz { | |
int x; | |
int y; | |
int z; | |
} xyz; | |
xyz blockIdx; | |
xyz blockDim; | |
xyz threadIdx; | |
xyz gridDim; | |
#define CU_DIM(grid, block) | |
#define CU_DIM_MEM(grid, block, bytes) | |
#define CU_DIM_MEM_STREAM(grid, block, bytes, stream) | |
#else | |
#define CU_DIM(grid, block) <<<grid, block>>> | |
#define CU_DIM_MEM(grid, block, bytes) <<<grid, block, bytes>>> | |
#define CU_DIM_MEM_STREAM(grid, block, bytes, stream) <<<grid, block, bytes, stream>>> | |
#endif | |
#define CU_ASSERT(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0) | |
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) | |
{ | |
if (code != cudaSuccess) { | |
fprintf(stderr, "CUDA ERR: %s %s:%d\n", cudaGetErrorString(code), file, line); | |
if (abort) { | |
exit(code); | |
} | |
} | |
} | |
#define cuIdx(D) (threadIdx.D + blockIdx.D * blockDim.D) | |
#define cuDim(D) (blockDim.D * gridDim.D) | |
template <typename T> T *malloc_device(size_t sz) { | |
T* out = NULL; | |
if (sz > 0) { | |
CU_ASSERT(cudaMalloc(&out, sizeof(T) * sz)); | |
} | |
return out; | |
} | |
template <typename T> T *to_device(const T *in, size_t sz) { | |
T* out = NULL; | |
if (sz > 0) { | |
CU_ASSERT(cudaMalloc(&out, sizeof(T) * sz)); | |
CU_ASSERT(cudaMemcpy(out, in, sizeof(T) * sz, cudaMemcpyDefault)); | |
} | |
return out; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "cuda_utils.h" | |
#include <time.h> | |
constexpr int NX = 96, NY = 160, NZ = 160, NXY = NX * NY, NXYZ = NXY * NZ, NT = 500; | |
#define PERF(stat, NXYZ, NT) { \ | |
auto start = clock(); \ | |
stat; \ | |
float total_time = ((float) (clock() - start) / CLOCKS_PER_SEC); \ | |
float step_time = total_time / (NT); \ | |
float perf = (NXYZ) / step_time / 1e6; \ | |
printf("total time: %f s, step time: %f s, perf %f MCells/s\n", total_time, step_time, perf); \ | |
} | |
class Chunk { | |
__device__ __forceinline__ int get_idx(int i, int j, int k) { | |
return i + j * NX + k * NXY; | |
} | |
__device__ __forceinline__ void get_ijk(int g, int &i, int &j, int &k) { | |
k = g / NXY; | |
g -= k * NXY; | |
j = g / NX; | |
g -= j * NX; | |
i = g; | |
} | |
public: | |
#ifdef USE_DYNAMIC_IDX | |
int NX, NY, NZ, NXY, NXYZ; | |
#endif | |
#ifdef USE_DYNAMIC_MEM | |
float *Ex, *Ey, *Ez, *Hx, *Hy, *Hz, | |
*LEx, *LEy, *LEz, *LHx, *LHy, *LHz, | |
*REx, *REy, *REz, *RHx, *RHy, *RHz; | |
#else | |
float Ex[NXYZ], Ey[NXYZ], Ez[NXYZ], Hx[NXYZ], Hy[NXYZ], Hz[NXYZ]; | |
float LEx[NXYZ], LEy[NXYZ], LEz[NXYZ], LHx[NXYZ], LHy[NXYZ], LHz[NXYZ]; | |
float REx[NXYZ], REy[NXYZ], REz[NXYZ], RHx[NXYZ], RHy[NXYZ], RHz[NXYZ]; | |
#endif | |
__device__ __forceinline__ void step_h() { | |
int i, j, k; | |
for (auto g = cuIdx(x); g < NXYZ; g += cuDim(x)) { | |
get_ijk(g, i, j, k); | |
if (i > 0 && j > 0 && k > 0 && i < NX - 1 && j < NY - 1 && k < NZ - 1) { | |
Hx[g] = LHx[g] * Hx[g] + RHx[g] * (Ey[get_idx(i, j, k+1)] - Ey[g] - Ez[get_idx(i, j+1, k)] + Ez[g]); | |
Hy[g] = LHy[g] * Hy[g] + RHy[g] * (Ez[get_idx(i+1, j, k)] - Ez[g] - Ex[get_idx(i, j, k+1)] + Ex[g]); | |
Hz[g] = LHz[g] * Hz[g] + RHz[g] * (Ex[get_idx(i, j+1, k)] - Ex[g] - Ey[get_idx(i+1, j, k)] + Ey[g]); | |
} | |
} | |
} | |
__device__ __forceinline__ void step_e() { | |
int i, j, k; | |
for (auto g = cuIdx(x); g < NXYZ; g += cuDim(x)) { | |
get_ijk(g, i, j, k); | |
if (i > 0 && j > 0 && k > 0 && i < NX - 1 && j < NY - 1 && k < NZ - 1) { | |
Ex[g] = LEx[g] * Ex[g] + REx[g] * (Hy[get_idx(i, j, k-1)] - Hy[g] - Hz[get_idx(i, j-1, k)] + Hz[g]); | |
Ey[g] = LEy[g] * Ey[g] + REy[g] * (Hz[get_idx(i-1, j, k)] - Hz[g] - Hx[get_idx(i, j, k-1)] + Hx[g]); | |
Ez[g] = LEz[g] * Ez[g] + REz[g] * (Hx[get_idx(i, j-1, k)] - Hx[g] - Hy[get_idx(i-1, j, k)] + Hy[g]); | |
} | |
} | |
} | |
}; | |
__device__ Chunk chunk; | |
__global__ void kernel_init(int nx, int ny, int nz, | |
float *Ex, float *Ey, float *Ez, float *Hx, float *Hy, float *Hz, | |
float *LEx, float *LEy, float *LEz, float *LHx, float *LHy, float *LHz, | |
float *REx, float *REy, float *REz, float *RHx, float *RHy, float *RHz) { | |
#ifdef USE_DYNAMIC_IDX | |
chunk.NX = nx; chunk.NY = ny; chunk.NZ = nz; | |
chunk.NXY = nx * ny; chunk.NXYZ = nx * ny * nz; | |
#endif | |
#ifdef USE_DYNAMIC_MEM | |
chunk.Ex = Ex; chunk.Ey = Ey; chunk.Ez = Ez; chunk.Hx = Hx; chunk.Hy = Hy; chunk.Hz = Hz; | |
chunk.LEx = LEx; chunk.LEy = LEy; chunk.LEz = LEz; chunk.LHx = LHx; chunk.LHy = LHy; chunk.LHz = LHz; | |
chunk.REx = REx; chunk.REy = REy; chunk.REz = REz; chunk.RHx = RHx; chunk.RHy = RHy; chunk.RHz = RHz; | |
#endif | |
} | |
__global__ void kernel_step_h() { | |
chunk.step_h(); | |
} | |
__global__ void kernel_step_e() { | |
chunk.step_e(); | |
} | |
int main() { | |
kernel_init CU_DIM(1, 1) (NX, NY, NZ, | |
malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), | |
malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), | |
malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), | |
malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), | |
malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), | |
malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ)); | |
CU_ASSERT(cudaGetLastError()); | |
cudaDeviceSynchronize(); | |
PERF({ | |
for (int t = 0; t < NT; t ++) { | |
kernel_step_h CU_DIM(2048, 512) (); | |
kernel_step_e CU_DIM(2048, 512) (); | |
} | |
cudaDeviceSynchronize(); | |
}, NXYZ, NT); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment