Last active
August 29, 2015 14:06
-
-
Save allanmac/ed033f20e566932680ff to your computer and use it in GitHub Desktop.
Try to push an integer math kernel's IPC metric as high as possible!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -*- compile-command: "nvcc -m 32 -arch sm_50 -Xptxas=-v,-abi=no int_ipc.cu -o int_ipc" ; -*- | |
// | |
// | |
// | |
#define KERNEL_QUALIFIERS __global__ | |
#define KERNEL_QUALIFIERS_EXTERN extern KERNEL_QUALIFIERS | |
#define KERNEL_QUALIFIERS_EXTERN_C extern "C" KERNEL_QUALIFIERS | |
// | |
// | |
// | |
#ifndef _DEBUG | |
#define DEVICE_FUNCTION_QUALIFIERS __device__ __forceinline__ | |
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__ | |
#else | |
#define DEVICE_FUNCTION_QUALIFIERS __device__ | |
#define DEVICE_INTRINSIC_QUALIFIERS __device__ | |
#endif | |
// | |
// | |
// | |
#define DEVICE_STATIC_FUNCTION_QUALIFIERS static DEVICE_FUNCTION_QUALIFIERS | |
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static DEVICE_INTRINSIC_QUALIFIERS | |
// | |
// | |
// | |
#define RESTRICT __restrict__ | |
// | |
// | |
// | |
#define WARP_SIZE 32 | |
// | |
// | |
// | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
int | |
add(const int a, const int b) | |
{ | |
int d; | |
asm("add.sat.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b)); | |
return d; | |
} | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
int | |
mul(const int a, const int b) | |
{ | |
int d; | |
// asm("mad.hi.sat.s32 %0, %1, %2, 7;" : "=r"(d) : "r"(a), "r"(b)); | |
// asm("mul.lo.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b)); | |
d = a * b; | |
return d; | |
} | |
// | |
// | |
// | |
#define IPC_2(v) \ | |
II(0,v); \ | |
II(1,v); | |
#define IPC_4(v) \ | |
IPC_2(v); \ | |
II(2,v); \ | |
II(3,v); | |
#define IPC_8(v) \ | |
IPC_4(v); \ | |
II(4,v); \ | |
II(5,v); \ | |
II(6,v); \ | |
II(7,v); | |
#define IPC_16(v) \ | |
IPC_8(v); \ | |
II(8,v); \ | |
II(9,v); \ | |
II(10,v); \ | |
II(11,v); \ | |
II(12,v); \ | |
II(13,v); \ | |
II(14,v); \ | |
II(15,v); | |
#define IPC_32(v) \ | |
IPC_16(v); \ | |
II(16,v); \ | |
II(17,v); \ | |
II(18,v); \ | |
II(19,v); \ | |
II(20,v); \ | |
II(21,v); \ | |
II(22,v); \ | |
II(23,v); \ | |
II(24,v); \ | |
II(25,v); \ | |
II(26,v); \ | |
II(27,v); \ | |
II(28,v); \ | |
II(29,v); \ | |
II(30,v); \ | |
II(31,v); | |
// | |
// | |
// | |
#define REP_2(v) \ | |
RR(0,v); \ | |
RR(1,v); \ | |
#define REP_4(v) \ | |
REP_2(v); \ | |
RR(2,v); \ | |
RR(3,v); | |
#define REP_8(v) \ | |
REP_4(v); \ | |
RR(4,v); \ | |
RR(5,v); \ | |
RR(6,v); \ | |
RR(7,v); | |
#define REP_16(v) \ | |
REP_8(v); \ | |
RR(8,v); \ | |
RR(9,v); \ | |
RR(10,v); \ | |
RR(11,v); \ | |
RR(12,v); \ | |
RR(13,v); \ | |
RR(14,v); \ | |
RR(15,v); | |
#define REP_32(v) \ | |
REP_16(v); \ | |
RR(16,v); \ | |
RR(17,v); \ | |
RR(18,v); \ | |
RR(19,v); \ | |
RR(20,v); \ | |
RR(21,v); \ | |
RR(22,v); \ | |
RR(23,v); \ | |
RR(24,v); \ | |
RR(25,v); \ | |
RR(26,v); \ | |
RR(27,v); \ | |
RR(28,v); \ | |
RR(29,v); \ | |
RR(30,v); \ | |
RR(31,v); | |
// | |
// | |
// | |
#define TYPE int | |
// | |
// | |
// | |
KERNEL_QUALIFIERS | |
void | |
int_ipc_kernel(const TYPE* const RESTRICT vin, TYPE* const RESTRICT vout) | |
{ | |
// | |
// FIXME -- PLEASE TRY OTHER IPC LEVELS OTHER THAN POWER OF TWO THAT I HAVE HERE | |
// | |
#if __CUDA_ARCH__ >= 500 // MAXWELL LIKES THIS | |
#define IPC(v) IPC_8(v) | |
#define REP(v) REP_4(v) | |
#elif __CUDA_ARCH__ >= 350 // GK208 | |
#define IPC(v) IPC_8(v) | |
#define REP(v) REP_4(v) | |
#elif __CUDA_ARCH__ >= 300 // GK104 | |
#define IPC(v) IPC_16(v) | |
#define REP(v) REP_4(v) | |
#else // FERMI and below | |
#define IPC(v) IPC_8(v) | |
#define REP(v) REP_4(v) | |
#endif | |
// | |
// | |
// | |
const unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x; | |
// | |
// LOAD | |
// | |
#undef II | |
#define II(a,v) TYPE s##a = vin[tid]; | |
IPC(0); | |
// | |
// PERFORM IPC * REP add_sat() ops | |
// | |
#undef II | |
#define II(a,v) s##a = mul(s##a,s##v); | |
#undef RR | |
#define RR(a,v) IPC(a) | |
// #pragma unroll | |
for (int ii=0; ii<16384; ii++) // spin a _lot_ of times | |
{ | |
REP(0); | |
} | |
// | |
// STORE | |
// | |
#undef II | |
#define II(a,v) vout[tid*(a+1)] = s##a; | |
IPC(0); | |
} | |
// | |
// | |
// | |
#define INT_IPC_MAIN | |
#ifdef INT_IPC_MAIN | |
// | |
// | |
// | |
#include <stdio.h> | |
int | |
main(int argc, char** argv) | |
{ | |
// int_ipc [device] [# of warps] [# of blocks] | |
const int device = (argc >= 2) ? atoi(argv[1]) : 0; | |
const int warps = (argc >= 3) ? atoi(argv[2]) : 16; | |
const int blocks = (argc >= 4) ? atoi(argv[3]) : 1; | |
cudaSetDevice(device); | |
// | |
// | |
// | |
cudaDeviceProp props; | |
cudaGetDeviceProperties(&props,device); | |
printf("%s (%2d)\n",props.name,props.multiProcessorCount); | |
// | |
// | |
// | |
const int threads = WARP_SIZE * warps; | |
printf("int_ipc_kernel<<<%d,%d>>>(...)\n",blocks,threads); | |
// | |
// ALLOCATE BUFFERS | |
// | |
TYPE* vin; | |
TYPE* vout; | |
cudaMalloc(&vin, sizeof(TYPE) * blocks * threads); | |
cudaMalloc(&vout,sizeof(TYPE) * blocks * threads); | |
// | |
// INIT VIN | |
// | |
// init vin[] if you want | |
// | |
// LAUNCH KERNEL | |
// | |
int_ipc_kernel<<<blocks,threads>>>(vin,vout); | |
cudaDeviceSynchronize(); | |
// | |
// FREE BUFFERS | |
// | |
cudaFree(vin); | |
cudaFree(vout); | |
cudaDeviceReset(); | |
return 0; | |
} | |
// | |
// | |
// | |
#endif // INT_IPC_MAIN | |
// | |
// | |
// |
On a GTX 750 Ti (Maxwell) an IPC of 4.05 is achieved:
>nvprof -m ipc int_ipc.exe 0 32
==5188== NVPROF is profiling process 5188, command: int_ipc.exe 0 32
GeForce GTX 750 Ti ( 5)
int_ipc_kernel<<<1,1024>>>(...)
==5188== Profiling application: int_ipc.exe 0 32
==5188== Profiling result:
==5188== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GTX 750 Ti (0)"
Kernel: int_ipc_kernel(int const *, int*)
1 ipc Executed IPC 4.052755 4.052755 4.052755
On a GT 630 (GK208) an IPC of 3.53 is achieved:
>nvprof -m ipc int_ipc.exe 2 32
==8900== NVPROF is profiling process 8900, command: int_ipc.exe 2 32
GeForce GT 630 ( 2)
int_ipc_kernel<<<1,1024>>>(...)
==8900== Profiling application: int_ipc.exe 2 32
==8900== Profiling result:
==8900== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GT 630 (2)"
Kernel: int_ipc_kernel(int const *, int*)
1 ipc Executed IPC 3.533268 3.533268 3.533268
$ nvprof -m ipc int_ipc 0 32
==1656== NVPROF is profiling process 1656, command: int_ipc 0 32
GK20A ( 1)
int_ipc_kernel<<<1,1024>>>(...)
==1656== Profiling application: int_ipc 0 32
==1656== Profiling result:
==1656== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GK20A (0)"
Kernel: int_ipc_kernel(int const *, int*)
1 ipc Executed IPC 3.258458 3.258458 3.258458
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Built with:
Run with: