Skip to content

Instantly share code, notes, and snippets.

@oxyflour
Last active October 4, 2020 13:52
Show Gist options
  • Save oxyflour/47d4ea08606f95f506c16d9c7219e82a to your computer and use it in GitHub Desktop.
Save oxyflour/47d4ea08606f95f506c16d9c7219e82a to your computer and use it in GitHub Desktop.
fit cu
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "cuda_fp16.h"
// https://stackoverflow.com/a/27992604
#ifdef __INTELLISENSE__
typedef struct xyz {
int x;
int y;
int z;
} xyz;
xyz blockIdx;
xyz blockDim;
xyz threadIdx;
xyz gridDim;
#define CU_DIM(grid, block)
#define CU_DIM_MEM(grid, block, bytes)
#define CU_DIM_MEM_STREAM(grid, block, bytes, stream)
#else
#define CU_DIM(grid, block) <<<grid, block>>>
#define CU_DIM_MEM(grid, block, bytes) <<<grid, block, bytes>>>
#define CU_DIM_MEM_STREAM(grid, block, bytes, stream) <<<grid, block, bytes, stream>>>
#endif
#define CU_ASSERT(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess) {
fprintf(stderr, "CUDA ERR: %s %s:%d\n", cudaGetErrorString(code), file, line);
if (abort) {
exit(code);
}
}
}
#define cuIdx(D) (threadIdx.D + blockIdx.D * blockDim.D)
#define cuDim(D) (blockDim.D * gridDim.D)
template <typename T> T *malloc_device(size_t sz) {
T* out = NULL;
if (sz > 0) {
CU_ASSERT(cudaMalloc(&out, sizeof(T) * sz));
}
return out;
}
template <typename T> T *to_device(const T *in, size_t sz) {
T* out = NULL;
if (sz > 0) {
CU_ASSERT(cudaMalloc(&out, sizeof(T) * sz));
CU_ASSERT(cudaMemcpy(out, in, sizeof(T) * sz, cudaMemcpyDefault));
}
return out;
}
#include "cuda_utils.h"
#include <time.h>
constexpr int NX = 96, NY = 160, NZ = 160, NXY = NX * NY, NXYZ = NXY * NZ, NT = 500;
#define PERF(stat, NXYZ, NT) { \
auto start = clock(); \
stat; \
float total_time = ((float) (clock() - start) / CLOCKS_PER_SEC); \
float step_time = total_time / (NT); \
float perf = (NXYZ) / step_time / 1e6; \
printf("total time: %f s, step time: %f s, perf %f MCells/s\n", total_time, step_time, perf); \
}
class Chunk {
__device__ __forceinline__ int get_idx(int i, int j, int k) {
return i + j * NX + k * NXY;
}
__device__ __forceinline__ void get_ijk(int g, int &i, int &j, int &k) {
k = g / NXY;
g -= k * NXY;
j = g / NX;
g -= j * NX;
i = g;
}
public:
#ifdef USE_DYNAMIC_IDX
int NX, NY, NZ, NXY, NXYZ;
#endif
#ifdef USE_DYNAMIC_MEM
float *Ex, *Ey, *Ez, *Hx, *Hy, *Hz,
*LEx, *LEy, *LEz, *LHx, *LHy, *LHz,
*REx, *REy, *REz, *RHx, *RHy, *RHz;
#else
float Ex[NXYZ], Ey[NXYZ], Ez[NXYZ], Hx[NXYZ], Hy[NXYZ], Hz[NXYZ];
float LEx[NXYZ], LEy[NXYZ], LEz[NXYZ], LHx[NXYZ], LHy[NXYZ], LHz[NXYZ];
float REx[NXYZ], REy[NXYZ], REz[NXYZ], RHx[NXYZ], RHy[NXYZ], RHz[NXYZ];
#endif
__device__ __forceinline__ void step_h() {
int i, j, k;
for (auto g = cuIdx(x); g < NXYZ; g += cuDim(x)) {
get_ijk(g, i, j, k);
if (i > 0 && j > 0 && k > 0 && i < NX - 1 && j < NY - 1 && k < NZ - 1) {
Hx[g] = LHx[g] * Hx[g] + RHx[g] * (Ey[get_idx(i, j, k+1)] - Ey[g] - Ez[get_idx(i, j+1, k)] + Ez[g]);
Hy[g] = LHy[g] * Hy[g] + RHy[g] * (Ez[get_idx(i+1, j, k)] - Ez[g] - Ex[get_idx(i, j, k+1)] + Ex[g]);
Hz[g] = LHz[g] * Hz[g] + RHz[g] * (Ex[get_idx(i, j+1, k)] - Ex[g] - Ey[get_idx(i+1, j, k)] + Ey[g]);
}
}
}
__device__ __forceinline__ void step_e() {
int i, j, k;
for (auto g = cuIdx(x); g < NXYZ; g += cuDim(x)) {
get_ijk(g, i, j, k);
if (i > 0 && j > 0 && k > 0 && i < NX - 1 && j < NY - 1 && k < NZ - 1) {
Ex[g] = LEx[g] * Ex[g] + REx[g] * (Hy[get_idx(i, j, k-1)] - Hy[g] - Hz[get_idx(i, j-1, k)] + Hz[g]);
Ey[g] = LEy[g] * Ey[g] + REy[g] * (Hz[get_idx(i-1, j, k)] - Hz[g] - Hx[get_idx(i, j, k-1)] + Hx[g]);
Ez[g] = LEz[g] * Ez[g] + REz[g] * (Hx[get_idx(i, j-1, k)] - Hx[g] - Hy[get_idx(i-1, j, k)] + Hy[g]);
}
}
}
};
__device__ Chunk chunk;
__global__ void kernel_init(int nx, int ny, int nz,
float *Ex, float *Ey, float *Ez, float *Hx, float *Hy, float *Hz,
float *LEx, float *LEy, float *LEz, float *LHx, float *LHy, float *LHz,
float *REx, float *REy, float *REz, float *RHx, float *RHy, float *RHz) {
#ifdef USE_DYNAMIC_IDX
chunk.NX = nx; chunk.NY = ny; chunk.NZ = nz;
chunk.NXY = nx * ny; chunk.NXYZ = nx * ny * nz;
#endif
#ifdef USE_DYNAMIC_MEM
chunk.Ex = Ex; chunk.Ey = Ey; chunk.Ez = Ez; chunk.Hx = Hx; chunk.Hy = Hy; chunk.Hz = Hz;
chunk.LEx = LEx; chunk.LEy = LEy; chunk.LEz = LEz; chunk.LHx = LHx; chunk.LHy = LHy; chunk.LHz = LHz;
chunk.REx = REx; chunk.REy = REy; chunk.REz = REz; chunk.RHx = RHx; chunk.RHy = RHy; chunk.RHz = RHz;
#endif
}
__global__ void kernel_step_h() {
chunk.step_h();
}
__global__ void kernel_step_e() {
chunk.step_e();
}
int main() {
kernel_init CU_DIM(1, 1) (NX, NY, NZ,
malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ),
malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ),
malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ),
malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ),
malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ),
malloc_device<float>(NXYZ), malloc_device<float>(NXYZ), malloc_device<float>(NXYZ));
CU_ASSERT(cudaGetLastError());
cudaDeviceSynchronize();
PERF({
for (int t = 0; t < NT; t ++) {
kernel_step_h CU_DIM(2048, 512) ();
kernel_step_e CU_DIM(2048, 512) ();
}
cudaDeviceSynchronize();
}, NXYZ, NT);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment