Created
February 12, 2018 21:57
-
-
Save anonymous/e6cb5ac022b67e3417f4a186a1194ff6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[WARNING]: No mapping options supplied. 'Naive' options will be used which might fail compilation | |
[WARNING]: Autotuning results won't be cached. 'cache' option is not specified | |
[WARNING]: Using naive options for autotuning | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= 31; c5 += 1) { | |
for (int c6 = 0; c6 <= 9; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= 31; c5 += 1) { | |
for (int c6 = 0; c6 <= 9; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= 31; c5 += 1) { | |
for (int c6 = 0; c6 <= 9; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 0 Job[Compiled, GPU] (2, 1)/10 Time (us): best: 13146 median: 13146 worst: 13146 Generation 0 Job[Compiled, GPU] (2, 1)/10 Time (us): best: 13146 median: 13146 worst: 13146 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
__shared__ float32 _output_0[4][20][10][11]; | |
for (int c1 = 0; c1 <= 31; c1 += 20) { | |
__syncthreads(); | |
for (int c3 = 0; c3 <= 3; c3 += 1) { | |
for (int c4 = t2; c4 <= min(19, -c1 + 31); c4 += 5) { | |
for (int c5 = t1; c5 <= 9; c5 += 5) { | |
for (int c6 = t0; c6 <= 9; c6 += 4) { | |
_output_0[c3][c4][c5][c6] = output[c3][c1 + c4][c5][c6]; | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c3 = 0; c3 <= 9; c3 += 8) { | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= min(19, -c1 + 31); c5 += 1) { | |
for (int c6 = 0; c6 <= 9; c6 += 1) { | |
for (int c7 = 0; c7 <= min(7, -c3 + 9); c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (t1 == 0 && t2 == 0 && c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
_output_0[c4][c5][c6][c3 + c7] = (_output_0[c4][c5][c6][c3 + c7] + input[c4][c1 + c5][t0 + 2*c6][-2*t0 + 2*c3 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c3 = 0; c3 <= 3; c3 += 1) { | |
for (int c4 = t2; c4 <= min(19, -c1 + 31); c4 += 5) { | |
for (int c5 = t1; c5 <= 9; c5 += 5) { | |
for (int c6 = t0; c6 <= 9; c6 += 4) { | |
output[c3][c1 + c4][c5][c6] = _output_0[c3][c4][c5][c6]; | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= 31; c5 += 1) { | |
for (int c6 = 0; c6 <= 9; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 0 Job[Compiled, GPU] (3, 2)/10 Time (us): best: 7716 median: 13146 worst: 13146 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= 31; c5 += 1) { | |
for (int c6 = 0; c6 <= 9; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 0 Job[Compiled, GPU] (5, 4)/10 Time (us): best: 7716 median: 11094 worst: 13146 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c1 = 0; c1 <= 31; c1 += 16) { | |
for (int c2 = 0; c2 <= 9; c2 += 4) { | |
for (int c4 = 2 * b0; c4 <= 2 * b0 + 1; c4 += 1) { | |
for (int c5 = c1; c5 <= c1 + 15; c5 += 1) { | |
for (int c6 = c2; c6 <= min(9, c2 + 3); c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 0 Job[Compiled, GPU] (6, 5)/10 Time (us): best: 5871 median: 7900 worst: 13146 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
__shared__ float32 _output_0[2][32][10][11]; | |
__syncthreads(); | |
for (int c3 = 0; c3 <= 1; c3 += 1) { | |
for (int c4 = 0; c4 <= 31; c4 += 1) { | |
_output_0[c3][c4][t1][t0] = output[2*b0 + c3][c4][t1][t0]; | |
} | |
} | |
__syncthreads(); | |
for (int c4 = 0; c4 <= 1; c4 += 1) { | |
for (int c5 = 0; c5 <= 31; c5 += 1) { | |
for (int c6 = 0; c6 <= 9; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (t1 == 0 && c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
_output_0[c4][c5][c6][c7] = (_output_0[c4][c5][c6][c7] + input[2*b0 + c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c3 = 0; c3 <= 1; c3 += 1) { | |
for (int c4 = 0; c4 <= 31; c4 += 1) { | |
output[2*b0 + c3][c4][t1][t0] = _output_0[c3][c4][t1][t0]; | |
} | |
} | |
__syncthreads(); | |
} | |
} | |
Generation 0 Job[Compiled, GPU] (6, 5)/10 Time (us): best: 5871 median: 7900 worst: 13146 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c1 = 0; c1 <= 31; c1 += 4) { | |
for (int c2 = 0; c2 <= 9; c2 += 1) { | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= 3; c5 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c1 + c5][c2][c7] = (output[c4][c1 + c5][c2][c7] + input[c4][c1 + c5][t0 + 2*c2][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 0 Job[Compiled, GPU] (8, 7)/10 Time (us): best: 3472 median: 7900 worst: 13146 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c2 = 0; c2 <= 9; c2 += 5) { | |
for (int c3 = 0; c3 <= 9; c3 += 2) { | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 16 * b1; c5 <= 16 * b1 + 15; c5 += 1) { | |
for (int c6 = c2; c6 <= c2 + 4; c6 += 1) { | |
for (int c7 = c3; c7 <= c3 + 1; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c3 = 0; c3 <= 9; c3 += 8) { | |
for (int c4 = 0; c4 <= min(2, -3 * b0 + 3); c4 += 1) { | |
for (int c5 = 0; c5 <= min(19, -20 * b1 + 31); c5 += 1) { | |
for (int c6 = 0; c6 <= 4; c6 += 1) { | |
for (int c7 = 0; c7 <= min(7, -c3 + 9); c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[3*b0 + c4][20*b1 + c5][5*b2 + c6][c3 + c7] = (output[3*b0 + c4][20*b1 + c5][5*b2 + c6][c3 + c7] + input[3*b0 + c4][20*b1 + c5][t0 + 10*b2 + 2*c6][-2*t0 + 2*c3 + 2*c7 + c8]); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 0 Job[Compiled, GPU] (9, 8)/10 Time (us): best: 3472 median: 7900 worst: 13146 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c2 = 0; c2 <= 9; c2 += 5) { | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= 31; c5 += 1) { | |
for (int c6 = c2; c6 <= c2 + 4; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 0 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 2044 median: 7716 worst: 13146 Generation 0 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 2044 median: 7716 worst: 13146 | |
Generation 1 Job[Compiled, GPU] (3, 2)/10 Time (us): best: 2042 median: 3471 worst: 3471 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c2 = 0; c2 <= 9; c2 += 5) { | |
for (int c3 = 0; c3 <= 9; c3 += 2) { | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 16 * b1; c5 <= 16 * b1 + 15; c5 += 1) { | |
for (int c6 = c2; c6 <= c2 + 4; c6 += 1) { | |
for (int c7 = c3; c7 <= c3 + 1; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 1 Job[Compiled, GPU] (5, 4)/10 Time (us): best: 2042 median: 3471 worst: 6026 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= 31; c5 += 1) { | |
for (int c6 = 0; c6 <= 9; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 1 Job[Compiled, GPU] (7, 6)/10 Time (us): best: 2042 median: 6026 worst: 7900 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= 31; c5 += 1) { | |
for (int c6 = 0; c6 <= 9; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c1 = 4 * b1; c1 <= 31; c1 += 8) { | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= 3; c5 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c1 + c5][b2][c7] = (output[c4][c1 + c5][b2][c7] + input[c4][c1 + c5][t0 + 2*b2][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 1 Job[Compiled, GPU] (8, 7)/10 Time (us): best: 2042 median: 6026 worst: 7900 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= min(2, -3 * b1 + 31); c5 += 1) { | |
for (int c6 = 0; c6 <= 9; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][3*b1 + c5][c6][c7] = (output[c4][3*b1 + c5][c6][c7] + input[c4][3*b1 + c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 1 Job[Compiled, GPU] (10, 9)/10 Time (us): best: 606 median: 5872 worst: 7900 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
__shared__ float32 _output_0[4][20][5][11]; | |
__syncthreads(); | |
for (int c3 = 0; c3 <= 3; c3 += 1) { | |
for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) { | |
for (int c5 = 0; c5 <= 4; c5 += 1) { | |
_output_0[c3][c4][c5][t0] = output[c3][20*b1 + c4][5*b2 + c5][t0]; | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c3 = 0; c3 <= 9; c3 += 8) { | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= min(19, -20 * b1 + 31); c5 += 1) { | |
for (int c6 = 0; c6 <= 4; c6 += 1) { | |
for (int c7 = 0; c7 <= min(7, -c3 + 9); c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
_output_0[c4][c5][c6][c3 + c7] = (_output_0[c4][c5][c6][c3 + c7] + input[c4][20*b1 + c5][t0 + 10*b2 + 2*c6][-2*t0 + 2*c3 + 2*c7 + c8]); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c3 = 0; c3 <= 3; c3 += 1) { | |
for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) { | |
for (int c5 = 0; c5 <= 4; c5 += 1) { | |
output[c3][20*b1 + c4][5*b2 + c5][t0] = _output_0[c3][c4][c5][t0]; | |
} | |
} | |
} | |
__syncthreads(); | |
} | |
} | |
Generation 1 Job[Compiled, GPU] (10, 9)/10 Time (us): best: 606 median: 5872 worst: 7900 Generation 1 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 606 median: 3471 worst: 7900 Generation 1 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 606 median: 3471 worst: 7900 | |
Generation 2 Job[Compiled, GPU] (2, 1)/10 Time (us): best: 604 median: 604 worst: 604 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
__shared__ float32 _output_0[4][20][5][11]; | |
__syncthreads(); | |
for (int c3 = 0; c3 <= 3; c3 += 1) { | |
for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) { | |
for (int c5 = 0; c5 <= 4; c5 += 1) { | |
_output_0[c3][c4][c5][t0] = output[c3][20*b1 + c4][5*b2 + c5][t0]; | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 20 * b1; c5 <= min(31, 20 * b1 + 19); c5 += 1) { | |
for (int c6 = 5 * b2; c6 <= 5 * b2 + 4; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
_output_0[c4][-20*b1 + c5][-5*b2 + c6][c7] = (_output_0[c4][-20*b1 + c5][-5*b2 + c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
} | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c3 = 0; c3 <= 3; c3 += 1) { | |
for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) { | |
for (int c5 = 0; c5 <= 4; c5 += 1) { | |
output[c3][20*b1 + c4][5*b2 + c5][t0] = _output_0[c3][c4][c5][t0]; | |
} | |
} | |
} | |
__syncthreads(); | |
} | |
} | |
Generation 2 Job[Compiled, GPU] (3, 2)/10 Time (us): best: 604 median: 604 worst: 604 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= min(2, -3 * b1 + 31); c5 += 1) { | |
for (int c6 = 0; c6 <= 9; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][3*b1 + c5][c6][c7] = (output[c4][3*b1 + c5][c6][c7] + input[c4][3*b1 + c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 2 Job[Compiled, GPU] (4, 3)/10 Time (us): best: 604 median: 704 worst: 1052 Generation 2 Job[Compiled, GPU] (4, 3)/10 Time (us): best: 604 median: 704 worst: 1052 Generation 2 Job[Compiled, GPU] (4, 3)/10 Time (us): best: 604 median: 704 worst: 1052 Generation 2 Job[Compiled, GPU] (4, 3)/10 Time (us): best: 604 median: 704 worst: 1052 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
__shared__ float32 _output_0[3][20][5][11]; | |
__syncthreads(); | |
for (int c3 = 0; c3 <= min(2, -3 * b0 + 3); c3 += 1) { | |
_output_0[c3][0][0][t0] = output[3*b0 + c3][20*b1][5*b2][t0]; | |
_output_0[c3][0][1][t0] = output[3*b0 + c3][20*b1][1 + 5*b2][t0]; | |
_output_0[c3][0][2][t0] = output[3*b0 + c3][20*b1][2 + 5*b2][t0]; | |
_output_0[c3][0][3][t0] = output[3*b0 + c3][20*b1][3 + 5*b2][t0]; | |
_output_0[c3][0][4][t0] = output[3*b0 + c3][20*b1][4 + 5*b2][t0]; | |
_output_0[c3][1][0][t0] = output[3*b0 + c3][1 + 20*b1][5*b2][t0]; | |
_output_0[c3][1][1][t0] = output[3*b0 + c3][1 + 20*b1][1 + 5*b2][t0]; | |
_output_0[c3][1][2][t0] = output[3*b0 + c3][1 + 20*b1][2 + 5*b2][t0]; | |
_output_0[c3][1][3][t0] = output[3*b0 + c3][1 + 20*b1][3 + 5*b2][t0]; | |
_output_0[c3][1][4][t0] = output[3*b0 + c3][1 + 20*b1][4 + 5*b2][t0]; | |
_output_0[c3][2][0][t0] = output[3*b0 + c3][2 + 20*b1][5*b2][t0]; | |
_output_0[c3][2][1][t0] = output[3*b0 + c3][2 + 20*b1][1 + 5*b2][t0]; | |
_output_0[c3][2][2][t0] = output[3*b0 + c3][2 + 20*b1][2 + 5*b2][t0]; | |
_output_0[c3][2][3][t0] = output[3*b0 + c3][2 + 20*b1][3 + 5*b2][t0]; | |
_output_0[c3][2][4][t0] = output[3*b0 + c3][2 + 20*b1][4 + 5*b2][t0]; | |
_output_0[c3][3][0][t0] = output[3*b0 + c3][3 + 20*b1][5*b2][t0]; | |
_output_0[c3][3][1][t0] = output[3*b0 + c3][3 + 20*b1][1 + 5*b2][t0]; | |
_output_0[c3][3][2][t0] = output[3*b0 + c3][3 + 20*b1][2 + 5*b2][t0]; | |
_output_0[c3][3][3][t0] = output[3*b0 + c3][3 + 20*b1][3 + 5*b2][t0]; | |
_output_0[c3][3][4][t0] = output[3*b0 + c3][3 + 20*b1][4 + 5*b2][t0]; | |
_output_0[c3][4][0][t0] = output[3*b0 + c3][4 + 20*b1][5*b2][t0]; | |
_output_0[c3][4][1][t0] = output[3*b0 + c3][4 + 20*b1][1 + 5*b2][t0]; | |
_output_0[c3][4][2][t0] = output[3*b0 + c3][4 + 20*b1][2 + 5*b2][t0]; | |
_output_0[c3][4][3][t0] = output[3*b0 + c3][4 + 20*b1][3 + 5*b2][t0]; | |
_output_0[c3][4][4][t0] = output[3*b0 + c3][4 + 20*b1][4 + 5*b2][t0]; | |
_output_0[c3][5][0][t0] = output[3*b0 + c3][5 + 20*b1][5*b2][t0]; | |
_output_0[c3][5][1][t0] = output[3*b0 + c3][5 + 20*b1][1 + 5*b2][t0]; | |
_output_0[c3][5][2][t0] = output[3*b0 + c3][5 + 20*b1][2 + 5*b2][t0]; | |
_output_0[c3][5][3][t0] = output[3*b0 + c3][5 + 20*b1][3 + 5*b2][t0]; | |
_output_0[c3][5][4][t0] = output[3*b0 + c3][5 + 20*b1][4 + 5*b2][t0]; | |
_output_0[c3][6][0][t0] = output[3*b0 + c3][6 + 20*b1][5*b2][t0]; | |
_output_0[c3][6][1][t0] = output[3*b0 + c3][6 + 20*b1][1 + 5*b2][t0]; | |
_output_0[c3][6][2][t0] = output[3*b0 + c3][6 + 20*b1][2 + 5*b2][t0]; | |
_output_0[c3][6][3][t0] = output[3*b0 + c3][6 + 20*b1][3 + 5*b2][t0]; | |
_output_0[c3][6][4][t0] = output[3*b0 + c3][6 + 20*b1][4 + 5*b2][t0]; | |
_output_0[c3][7][0][t0] = output[3*b0 + c3][7 + 20*b1][5*b2][t0]; | |
_output_0[c3][7][1][t0] = output[3*b0 + c3][7 + 20*b1][1 + 5*b2][t0]; | |
_output_0[c3][7][2][t0] = output[3*b0 + c3][7 + 20*b1][2 + 5*b2][t0]; | |
_output_0[c3][7][3][t0] = output[3*b0 + c3][7 + 20*b1][3 + 5*b2][t0]; | |
_output_0[c3][7][4][t0] = output[3*b0 + c3][7 + 20*b1][4 + 5*b2][t0]; | |
_output_0[c3][8][0][t0] = output[3*b0 + c3][8 + 20*b1][5*b2][t0]; | |
_output_0[c3][8][1][t0] = output[3*b0 + c3][8 + 20*b1][1 + 5*b2][t0]; | |
_output_0[c3][8][2][t0] = output[3*b0 + c3][8 + 20*b1][2 + 5*b2][t0]; | |
_output_0[c3][8][3][t0] = output[3*b0 + c3][8 + 20*b1][3 + 5*b2][t0]; | |
_output_0[c3][8][4][t0] = output[3*b0 + c3][8 + 20*b1][4 + 5*b2][t0]; | |
_output_0[c3][9][0][t0] = output[3*b0 + c3][9 + 20*b1][5*b2][t0]; | |
_output_0[c3][9][1][t0] = output[3*b0 + c3][9 + 20*b1][1 + 5*b2][t0]; | |
_output_0[c3][9][2][t0] = output[3*b0 + c3][9 + 20*b1][2 + 5*b2][t0]; | |
_output_0[c3][9][3][t0] = output[3*b0 + c3][9 + 20*b1][3 + 5*b2][t0]; | |
_output_0[c3][9][4][t0] = output[3*b0 + c3][9 + 20*b1][4 + 5*b2][t0]; | |
_output_0[c3][10][0][t0] = output[3*b0 + c3][10 + 20*b1][5*b2][t0]; | |
_output_0[c3][10][1][t0] = output[3*b0 + c3][10 + 20*b1][1 + 5*b2][t0]; | |
_output_0[c3][10][2][t0] = output[3*b0 + c3][10 + 20*b1][2 + 5*b2][t0]; | |
_output_0[c3][10][3][t0] = output[3*b0 + c3][10 + 20*b1][3 + 5*b2][t0]; | |
_output_0[c3][10][4][t0] = output[3*b0 + c3][10 + 20*b1][4 + 5*b2][t0]; | |
_output_0[c3][11][0][t0] = output[3*b0 + c3][11 + 20*b1][5*b2][t0]; | |
_output_0[c3][11][1][t0] = output[3*b0 + c3][11 + 20*b1][1 + 5*b2][t0]; | |
_output_0[c3][11][2][t0] = output[3*b0 + c3][11 + 20*b1][2 + 5*b2][t0]; | |
_output_0[c3][11][3][t0] = output[3*b0 + c3][11 + 20*b1][3 + 5*b2][t0]; | |
_output_0[c3][11][4][t0] = output[3*b0 + c3][11 + 20*b1][4 + 5*b2][t0]; | |
if (b1 == 0) { | |
_output_0[c3][12][0][t0] = output[3*b0 + c3][12][5*b2][t0]; | |
_output_0[c3][12][1][t0] = output[3*b0 + c3][12][1 + 5*b2][t0]; | |
_output_0[c3][12][2][t0] = output[3*b0 + c3][12][2 + 5*b2][t0]; | |
_output_0[c3][12][3][t0] = output[3*b0 + c3][12][3 + 5*b2][t0]; | |
_output_0[c3][12][4][t0] = output[3*b0 + c3][12][4 + 5*b2][t0]; | |
_output_0[c3][13][0][t0] = output[3*b0 + c3][13][5*b2][t0]; | |
_output_0[c3][13][1][t0] = output[3*b0 + c3][13][1 + 5*b2][t0]; | |
_output_0[c3][13][2][t0] = output[3*b0 + c3][13][2 + 5*b2][t0]; | |
_output_0[c3][13][3][t0] = output[3*b0 + c3][13][3 + 5*b2][t0]; | |
_output_0[c3][13][4][t0] = output[3*b0 + c3][13][4 + 5*b2][t0]; | |
_output_0[c3][14][0][t0] = output[3*b0 + c3][14][5*b2][t0]; | |
_output_0[c3][14][1][t0] = output[3*b0 + c3][14][1 + 5*b2][t0]; | |
_output_0[c3][14][2][t0] = output[3*b0 + c3][14][2 + 5*b2][t0]; | |
_output_0[c3][14][3][t0] = output[3*b0 + c3][14][3 + 5*b2][t0]; | |
_output_0[c3][14][4][t0] = output[3*b0 + c3][14][4 + 5*b2][t0]; | |
_output_0[c3][15][0][t0] = output[3*b0 + c3][15][5*b2][t0]; | |
_output_0[c3][15][1][t0] = output[3*b0 + c3][15][1 + 5*b2][t0]; | |
_output_0[c3][15][2][t0] = output[3*b0 + c3][15][2 + 5*b2][t0]; | |
_output_0[c3][15][3][t0] = output[3*b0 + c3][15][3 + 5*b2][t0]; | |
_output_0[c3][15][4][t0] = output[3*b0 + c3][15][4 + 5*b2][t0]; | |
_output_0[c3][16][0][t0] = output[3*b0 + c3][16][5*b2][t0]; | |
_output_0[c3][16][1][t0] = output[3*b0 + c3][16][1 + 5*b2][t0]; | |
_output_0[c3][16][2][t0] = output[3*b0 + c3][16][2 + 5*b2][t0]; | |
_output_0[c3][16][3][t0] = output[3*b0 + c3][16][3 + 5*b2][t0]; | |
_output_0[c3][16][4][t0] = output[3*b0 + c3][16][4 + 5*b2][t0]; | |
_output_0[c3][17][0][t0] = output[3*b0 + c3][17][5*b2][t0]; | |
_output_0[c3][17][1][t0] = output[3*b0 + c3][17][1 + 5*b2][t0]; | |
_output_0[c3][17][2][t0] = output[3*b0 + c3][17][2 + 5*b2][t0]; | |
_output_0[c3][17][3][t0] = output[3*b0 + c3][17][3 + 5*b2][t0]; | |
_output_0[c3][17][4][t0] = output[3*b0 + c3][17][4 + 5*b2][t0]; | |
_output_0[c3][18][0][t0] = output[3*b0 + c3][18][5*b2][t0]; | |
_output_0[c3][18][1][t0] = output[3*b0 + c3][18][1 + 5*b2][t0]; | |
_output_0[c3][18][2][t0] = output[3*b0 + c3][18][2 + 5*b2][t0]; | |
_output_0[c3][18][3][t0] = output[3*b0 + c3][18][3 + 5*b2][t0]; | |
_output_0[c3][18][4][t0] = output[3*b0 + c3][18][4 + 5*b2][t0]; | |
_output_0[c3][19][0][t0] = output[3*b0 + c3][19][5*b2][t0]; | |
_output_0[c3][19][1][t0] = output[3*b0 + c3][19][1 + 5*b2][t0]; | |
_output_0[c3][19][2][t0] = output[3*b0 + c3][19][2 + 5*b2][t0]; | |
_output_0[c3][19][3][t0] = output[3*b0 + c3][19][3 + 5*b2][t0]; | |
_output_0[c3][19][4][t0] = output[3*b0 + c3][19][4 + 5*b2][t0]; | |
} | |
} | |
__syncthreads(); | |
for (int c3 = 0; c3 <= 9; c3 += 8) { | |
for (int c4 = 0; c4 <= min(2, -3 * b0 + 3); c4 += 1) { | |
for (int c5 = 0; c5 <= min(19, -20 * b1 + 31); c5 += 1) { | |
for (int c6 = 0; c6 <= 4; c6 += 1) { | |
for (int c7 = 0; c7 <= min(7, -c3 + 9); c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
_output_0[c4][c5][c6][c3 + c7] = (_output_0[c4][c5][c6][c3 + c7] + input[3*b0 + c4][20*b1 + c5][t0 + 10*b2 + 2*c6][-2*t0 + 2*c3 + 2*c7 + c8]); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c3 = 0; c3 <= min(2, -3 * b0 + 3); c3 += 1) { | |
output[3*b0 + c3][20*b1][5*b2][t0] = _output_0[c3][0][0][t0]; | |
output[3*b0 + c3][20*b1][1 + 5*b2][t0] = _output_0[c3][0][1][t0]; | |
output[3*b0 + c3][20*b1][2 + 5*b2][t0] = _output_0[c3][0][2][t0]; | |
output[3*b0 + c3][20*b1][3 + 5*b2][t0] = _output_0[c3][0][3][t0]; | |
output[3*b0 + c3][20*b1][4 + 5*b2][t0] = _output_0[c3][0][4][t0]; | |
output[3*b0 + c3][1 + 20*b1][5*b2][t0] = _output_0[c3][1][0][t0]; | |
output[3*b0 + c3][1 + 20*b1][1 + 5*b2][t0] = _output_0[c3][1][1][t0]; | |
output[3*b0 + c3][1 + 20*b1][2 + 5*b2][t0] = _output_0[c3][1][2][t0]; | |
output[3*b0 + c3][1 + 20*b1][3 + 5*b2][t0] = _output_0[c3][1][3][t0]; | |
output[3*b0 + c3][1 + 20*b1][4 + 5*b2][t0] = _output_0[c3][1][4][t0]; | |
output[3*b0 + c3][2 + 20*b1][5*b2][t0] = _output_0[c3][2][0][t0]; | |
output[3*b0 + c3][2 + 20*b1][1 + 5*b2][t0] = _output_0[c3][2][1][t0]; | |
output[3*b0 + c3][2 + 20*b1][2 + 5*b2][t0] = _output_0[c3][2][2][t0]; | |
output[3*b0 + c3][2 + 20*b1][3 + 5*b2][t0] = _output_0[c3][2][3][t0]; | |
output[3*b0 + c3][2 + 20*b1][4 + 5*b2][t0] = _output_0[c3][2][4][t0]; | |
output[3*b0 + c3][3 + 20*b1][5*b2][t0] = _output_0[c3][3][0][t0]; | |
output[3*b0 + c3][3 + 20*b1][1 + 5*b2][t0] = _output_0[c3][3][1][t0]; | |
output[3*b0 + c3][3 + 20*b1][2 + 5*b2][t0] = _output_0[c3][3][2][t0]; | |
output[3*b0 + c3][3 + 20*b1][3 + 5*b2][t0] = _output_0[c3][3][3][t0]; | |
output[3*b0 + c3][3 + 20*b1][4 + 5*b2][t0] = _output_0[c3][3][4][t0]; | |
output[3*b0 + c3][4 + 20*b1][5*b2][t0] = _output_0[c3][4][0][t0]; | |
output[3*b0 + c3][4 + 20*b1][1 + 5*b2][t0] = _output_0[c3][4][1][t0]; | |
output[3*b0 + c3][4 + 20*b1][2 + 5*b2][t0] = _output_0[c3][4][2][t0]; | |
output[3*b0 + c3][4 + 20*b1][3 + 5*b2][t0] = _output_0[c3][4][3][t0]; | |
output[3*b0 + c3][4 + 20*b1][4 + 5*b2][t0] = _output_0[c3][4][4][t0]; | |
output[3*b0 + c3][5 + 20*b1][5*b2][t0] = _output_0[c3][5][0][t0]; | |
output[3*b0 + c3][5 + 20*b1][1 + 5*b2][t0] = _output_0[c3][5][1][t0]; | |
output[3*b0 + c3][5 + 20*b1][2 + 5*b2][t0] = _output_0[c3][5][2][t0]; | |
output[3*b0 + c3][5 + 20*b1][3 + 5*b2][t0] = _output_0[c3][5][3][t0]; | |
output[3*b0 + c3][5 + 20*b1][4 + 5*b2][t0] = _output_0[c3][5][4][t0]; | |
output[3*b0 + c3][6 + 20*b1][5*b2][t0] = _output_0[c3][6][0][t0]; | |
output[3*b0 + c3][6 + 20*b1][1 + 5*b2][t0] = _output_0[c3][6][1][t0]; | |
output[3*b0 + c3][6 + 20*b1][2 + 5*b2][t0] = _output_0[c3][6][2][t0]; | |
output[3*b0 + c3][6 + 20*b1][3 + 5*b2][t0] = _output_0[c3][6][3][t0]; | |
output[3*b0 + c3][6 + 20*b1][4 + 5*b2][t0] = _output_0[c3][6][4][t0]; | |
output[3*b0 + c3][7 + 20*b1][5*b2][t0] = _output_0[c3][7][0][t0]; | |
output[3*b0 + c3][7 + 20*b1][1 + 5*b2][t0] = _output_0[c3][7][1][t0]; | |
output[3*b0 + c3][7 + 20*b1][2 + 5*b2][t0] = _output_0[c3][7][2][t0]; | |
output[3*b0 + c3][7 + 20*b1][3 + 5*b2][t0] = _output_0[c3][7][3][t0]; | |
output[3*b0 + c3][7 + 20*b1][4 + 5*b2][t0] = _output_0[c3][7][4][t0]; | |
output[3*b0 + c3][8 + 20*b1][5*b2][t0] = _output_0[c3][8][0][t0]; | |
output[3*b0 + c3][8 + 20*b1][1 + 5*b2][t0] = _output_0[c3][8][1][t0]; | |
output[3*b0 + c3][8 + 20*b1][2 + 5*b2][t0] = _output_0[c3][8][2][t0]; | |
output[3*b0 + c3][8 + 20*b1][3 + 5*b2][t0] = _output_0[c3][8][3][t0]; | |
output[3*b0 + c3][8 + 20*b1][4 + 5*b2][t0] = _output_0[c3][8][4][t0]; | |
output[3*b0 + c3][9 + 20*b1][5*b2][t0] = _output_0[c3][9][0][t0]; | |
output[3*b0 + c3][9 + 20*b1][1 + 5*b2][t0] = _output_0[c3][9][1][t0]; | |
output[3*b0 + c3][9 + 20*b1][2 + 5*b2][t0] = _output_0[c3][9][2][t0]; | |
output[3*b0 + c3][9 + 20*b1][3 + 5*b2][t0] = _output_0[c3][9][3][t0]; | |
output[3*b0 + c3][9 + 20*b1][4 + 5*b2][t0] = _output_0[c3][9][4][t0]; | |
output[3*b0 + c3][10 + 20*b1][5*b2][t0] = _output_0[c3][10][0][t0]; | |
output[3*b0 + c3][10 + 20*b1][1 + 5*b2][t0] = _output_0[c3][10][1][t0]; | |
output[3*b0 + c3][10 + 20*b1][2 + 5*b2][t0] = _output_0[c3][10][2][t0]; | |
output[3*b0 + c3][10 + 20*b1][3 + 5*b2][t0] = _output_0[c3][10][3][t0]; | |
output[3*b0 + c3][10 + 20*b1][4 + 5*b2][t0] = _output_0[c3][10][4][t0]; | |
output[3*b0 + c3][11 + 20*b1][5*b2][t0] = _output_0[c3][11][0][t0]; | |
output[3*b0 + c3][11 + 20*b1][1 + 5*b2][t0] = _output_0[c3][11][1][t0]; | |
output[3*b0 + c3][11 + 20*b1][2 + 5*b2][t0] = _output_0[c3][11][2][t0]; | |
output[3*b0 + c3][11 + 20*b1][3 + 5*b2][t0] = _output_0[c3][11][3][t0]; | |
output[3*b0 + c3][11 + 20*b1][4 + 5*b2][t0] = _output_0[c3][11][4][t0]; | |
if (b1 == 0) { | |
output[3*b0 + c3][12][5*b2][t0] = _output_0[c3][12][0][t0]; | |
output[3*b0 + c3][12][1 + 5*b2][t0] = _output_0[c3][12][1][t0]; | |
output[3*b0 + c3][12][2 + 5*b2][t0] = _output_0[c3][12][2][t0]; | |
output[3*b0 + c3][12][3 + 5*b2][t0] = _output_0[c3][12][3][t0]; | |
output[3*b0 + c3][12][4 + 5*b2][t0] = _output_0[c3][12][4][t0]; | |
output[3*b0 + c3][13][5*b2][t0] = _output_0[c3][13][0][t0]; | |
output[3*b0 + c3][13][1 + 5*b2][t0] = _output_0[c3][13][1][t0]; | |
output[3*b0 + c3][13][2 + 5*b2][t0] = _output_0[c3][13][2][t0]; | |
output[3*b0 + c3][13][3 + 5*b2][t0] = _output_0[c3][13][3][t0]; | |
output[3*b0 + c3][13][4 + 5*b2][t0] = _output_0[c3][13][4][t0]; | |
output[3*b0 + c3][14][5*b2][t0] = _output_0[c3][14][0][t0]; | |
output[3*b0 + c3][14][1 + 5*b2][t0] = _output_0[c3][14][1][t0]; | |
output[3*b0 + c3][14][2 + 5*b2][t0] = _output_0[c3][14][2][t0]; | |
output[3*b0 + c3][14][3 + 5*b2][t0] = _output_0[c3][14][3][t0]; | |
output[3*b0 + c3][14][4 + 5*b2][t0] = _output_0[c3][14][4][t0]; | |
output[3*b0 + c3][15][5*b2][t0] = _output_0[c3][15][0][t0]; | |
output[3*b0 + c3][15][1 + 5*b2][t0] = _output_0[c3][15][1][t0]; | |
output[3*b0 + c3][15][2 + 5*b2][t0] = _output_0[c3][15][2][t0]; | |
output[3*b0 + c3][15][3 + 5*b2][t0] = _output_0[c3][15][3][t0]; | |
output[3*b0 + c3][15][4 + 5*b2][t0] = _output_0[c3][15][4][t0]; | |
output[3*b0 + c3][16][5*b2][t0] = _output_0[c3][16][0][t0]; | |
output[3*b0 + c3][16][1 + 5*b2][t0] = _output_0[c3][16][1][t0]; | |
output[3*b0 + c3][16][2 + 5*b2][t0] = _output_0[c3][16][2][t0]; | |
output[3*b0 + c3][16][3 + 5*b2][t0] = _output_0[c3][16][3][t0]; | |
output[3*b0 + c3][16][4 + 5*b2][t0] = _output_0[c3][16][4][t0]; | |
output[3*b0 + c3][17][5*b2][t0] = _output_0[c3][17][0][t0]; | |
output[3*b0 + c3][17][1 + 5*b2][t0] = _output_0[c3][17][1][t0]; | |
output[3*b0 + c3][17][2 + 5*b2][t0] = _output_0[c3][17][2][t0]; | |
output[3*b0 + c3][17][3 + 5*b2][t0] = _output_0[c3][17][3][t0]; | |
output[3*b0 + c3][17][4 + 5*b2][t0] = _output_0[c3][17][4][t0]; | |
output[3*b0 + c3][18][5*b2][t0] = _output_0[c3][18][0][t0]; | |
output[3*b0 + c3][18][1 + 5*b2][t0] = _output_0[c3][18][1][t0]; | |
output[3*b0 + c3][18][2 + 5*b2][t0] = _output_0[c3][18][2][t0]; | |
output[3*b0 + c3][18][3 + 5*b2][t0] = _output_0[c3][18][3][t0]; | |
output[3*b0 + c3][18][4 + 5*b2][t0] = _output_0[c3][18][4][t0]; | |
output[3*b0 + c3][19][5*b2][t0] = _output_0[c3][19][0][t0]; | |
output[3*b0 + c3][19][1 + 5*b2][t0] = _output_0[c3][19][1][t0]; | |
output[3*b0 + c3][19][2 + 5*b2][t0] = _output_0[c3][19][2][t0]; | |
output[3*b0 + c3][19][3 + 5*b2][t0] = _output_0[c3][19][3][t0]; | |
output[3*b0 + c3][19][4 + 5*b2][t0] = _output_0[c3][19][4][t0]; | |
} | |
} | |
__syncthreads(); | |
} | |
} | |
Generation 2 Job[Compiled, GPU] (4, 3)/10 Time (us): best: 604 median: 704 worst: 1052 Generation 2 Job[Compiled, GPU] (5, 4)/10 Time (us): best: 604 median: 704 worst: 1052 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
__shared__ float32 _output_0[2][32][10][11]; | |
__syncthreads(); | |
for (int c2 = 0; c2 <= 1; c2 += 1) { | |
for (int c3 = 0; c3 <= 31; c3 += 1) { | |
_output_0[c2][c3][t1][t0] = output[2*b0 + c2][c3][t1][t0]; | |
} | |
} | |
__syncthreads(); | |
for (int c4 = 0; c4 <= 1; c4 += 1) { | |
for (int c5 = 0; c5 <= 31; c5 += 1) { | |
for (int c6 = 0; c6 <= 9; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (t1 == 0 && c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
_output_0[c4][c5][c6][c7] = (_output_0[c4][c5][c6][c7] + input[2*b0 + c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c2 = 0; c2 <= 1; c2 += 1) { | |
for (int c3 = 0; c3 <= 31; c3 += 1) { | |
output[2*b0 + c2][c3][t1][t0] = _output_0[c2][c3][t1][t0]; | |
} | |
} | |
__syncthreads(); | |
} | |
} | |
Generation 2 Job[Compiled, GPU] (6, 5)/10 Time (us): best: 604 median: 704 worst: 1052 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
__shared__ float32 _output_0[2][16][10][11]; | |
__syncthreads(); | |
for (int c2 = 0; c2 <= 1; c2 += 1) { | |
for (int c3 = 0; c3 <= 15; c3 += 1) { | |
for (int c4 = t1; c4 <= 9; c4 += 3) { | |
_output_0[c2][c3][c4][t0] = output[2*b0 + c2][16*b1 + c3][c4][t0]; | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c2 = 0; c2 <= 9; c2 += 4) { | |
for (int c4 = 2 * b0; c4 <= 2 * b0 + 1; c4 += 1) { | |
for (int c5 = 16 * b1; c5 <= 16 * b1 + 15; c5 += 1) { | |
for (int c6 = c2; c6 <= min(9, c2 + 3); c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (t1 == 0 && c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
_output_0[-2*b0 + c4][-16*b1 + c5][c6][c7] = (_output_0[-2*b0 + c4][-16*b1 + c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c2 = 0; c2 <= 1; c2 += 1) { | |
for (int c3 = 0; c3 <= 15; c3 += 1) { | |
for (int c4 = t1; c4 <= 9; c4 += 3) { | |
output[2*b0 + c2][16*b1 + c3][c4][t0] = _output_0[c2][c3][c4][t0]; | |
} | |
} | |
} | |
__syncthreads(); | |
} | |
} | |
Generation 2 Job[Compiled, GPU] (6, 5)/10 Time (us): best: 604 median: 704 worst: 1052 Generation 2 Job[Compiled, GPU] (8, 7)/10 Time (us): best: 604 median: 704 worst: 1769 Generation 2 Job[Compiled, GPU] (8, 7)/10 Time (us): best: 604 median: 704 worst: 1769 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
__shared__ float32 _output_0[3][20][10][11]; | |
for (int c1 = 0; c1 <= 31; c1 += 20) { | |
__syncthreads(); | |
for (int c3 = 0; c3 <= min(2, -3 * b0 + 3); c3 += 1) { | |
for (int c4 = t2; c4 <= min(19, -c1 + 31); c4 += 5) { | |
for (int c5 = t1; c5 <= 9; c5 += 5) { | |
for (int c6 = t0; c6 <= 9; c6 += 4) { | |
_output_0[c3][c4][c5][c6] = output[3*b0 + c3][c1 + c4][c5][c6]; | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c3 = 0; c3 <= 9; c3 += 8) { | |
for (int c4 = 0; c4 <= min(2, -3 * b0 + 3); c4 += 1) { | |
for (int c5 = 0; c5 <= min(19, -c1 + 31); c5 += 1) { | |
for (int c6 = 0; c6 <= 9; c6 += 1) { | |
for (int c7 = 0; c7 <= min(7, -c3 + 9); c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (t1 == 0 && t2 == 0 && c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
_output_0[c4][c5][c6][c3 + c7] = (_output_0[c4][c5][c6][c3 + c7] + input[3*b0 + c4][c1 + c5][t0 + 2*c6][-2*t0 + 2*c3 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c3 = 0; c3 <= min(2, -3 * b0 + 3); c3 += 1) { | |
for (int c4 = t2; c4 <= min(19, -c1 + 31); c4 += 5) { | |
for (int c5 = t1; c5 <= 9; c5 += 5) { | |
for (int c6 = t0; c6 <= 9; c6 += 4) { | |
output[3*b0 + c3][c1 + c4][c5][c6] = _output_0[c3][c4][c5][c6]; | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
Generation 2 Job[Compiled, GPU] (10, 9)/10 Time (us): best: 604 median: 704 worst: 1769 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c3 = 0; c3 <= 9; c3 += 8) { | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 3 * b1; c5 <= min(31, 3 * b1 + 2); c5 += 1) { | |
for (int c6 = 5 * b2; c6 <= 5 * b2 + 4; c6 += 1) { | |
for (int c7 = c3; c7 <= min(9, c3 + 7); c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 2 Job[Compiled, GPU] (10, 9)/10 Time (us): best: 604 median: 704 worst: 1769 Generation 2 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 559 median: 704 worst: 1769 Generation 2 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 559 median: 704 worst: 1769 | |
Generation 3 Job[Compiled, GPU] (3, 2)/10 Time (us): best: 558 median: 604 worst: 604 Generation 3 Job[Compiled, GPU] (4, 3)/10 Time (us): best: 558 median: 604 worst: 640 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
__shared__ float32 _output_0[4][5][5][11]; | |
__shared__ float32 _input_0[4][5][10][21]; | |
__syncthreads(); | |
for (int c3 = 0; c3 <= 3; c3 += 1) { | |
for (int c4 = 0; c4 <= min(4, -5 * b1 + 31); c4 += 1) { | |
for (int c5 = 0; c5 <= 9; c5 += 1) { | |
_input_0[c3][c4][c5][t0] = input[c3][5*b1 + c4][10*b2 + c5][t0]; | |
} | |
} | |
} | |
if (t0 <= 9) { | |
for (int c3 = 0; c3 <= 3; c3 += 1) { | |
for (int c4 = 0; c4 <= min(4, -5 * b1 + 31); c4 += 1) { | |
for (int c5 = 0; c5 <= 4; c5 += 1) { | |
_output_0[c3][c4][c5][t0] = output[c3][5*b1 + c4][5*b2 + c5][t0]; | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 5 * b1; c5 <= min(31, 5 * b1 + 4); c5 += 1) { | |
for (int c6 = 5 * b2; c6 <= 5 * b2 + 4; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
_output_0[c4][-5*b1 + c5][-5*b2 + c6][c7] = (_output_0[c4][-5*b1 + c5][-5*b2 + c6][c7] + _input_0[c4][-5*b1 + c5][t0 - 10*b2 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
} | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
if (t0 <= 9) { | |
for (int c3 = 0; c3 <= 3; c3 += 1) { | |
for (int c4 = 0; c4 <= min(4, -5 * b1 + 31); c4 += 1) { | |
for (int c5 = 0; c5 <= 4; c5 += 1) { | |
output[c3][5*b1 + c4][5*b2 + c5][t0] = _output_0[c3][c4][c5][t0]; | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
} | |
} | |
Generation 3 Job[Compiled, GPU] (4, 3)/10 Time (us): best: 558 median: 604 worst: 640 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c1 = 0; c1 <= 31; c1 += 3) { | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= min(2, -c1 + 31); c5 += 1) { | |
for (int c6 = 0; c6 <= 9; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c1 + c5][c6][c7] = (output[c4][c1 + c5][c6][c7] + input[c4][c1 + c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 3 Job[Compiled, GPU] (6, 5)/10 Time (us): best: 111 median: 604 worst: 640 Generation 3 Job[Compiled, GPU] (6, 5)/10 Time (us): best: 111 median: 604 worst: 640 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
__shared__ float32 _output_0[2][16][10][11]; | |
__syncthreads(); | |
for (int c2 = 0; c2 <= 1; c2 += 1) { | |
for (int c3 = 0; c3 <= 15; c3 += 1) { | |
for (int c4 = t1; c4 <= 9; c4 += 3) { | |
_output_0[c2][c3][c4][t0] = output[2*b0 + c2][16*b1 + c3][c4][t0]; | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c2 = 0; c2 <= 9; c2 += 4) { | |
for (int c4 = 2 * b0; c4 <= 2 * b0 + 1; c4 += 1) { | |
for (int c5 = 16 * b1; c5 <= 16 * b1 + 15; c5 += 1) { | |
for (int c6 = c2; c6 <= min(9, c2 + 3); c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (t1 == 0 && c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
_output_0[-2*b0 + c4][-16*b1 + c5][c6][c7] = (_output_0[-2*b0 + c4][-16*b1 + c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c2 = 0; c2 <= 1; c2 += 1) { | |
for (int c3 = 0; c3 <= 15; c3 += 1) { | |
for (int c4 = t1; c4 <= 9; c4 += 3) { | |
output[2*b0 + c2][16*b1 + c3][c4][t0] = _output_0[c2][c3][c4][t0]; | |
} | |
} | |
} | |
__syncthreads(); | |
} | |
} | |
Generation 3 Job[Compiled, GPU] (7, 6)/10 Time (us): best: 111 median: 604 worst: 640 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
__shared__ float32 _output_0[3][20][5][11]; | |
__syncthreads(); | |
for (int c3 = 0; c3 <= min(2, -3 * b0 + 3); c3 += 1) { | |
for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) { | |
for (int c5 = 0; c5 <= 4; c5 += 1) { | |
_output_0[c3][c4][c5][t0] = output[3*b0 + c3][20*b1 + c4][5*b2 + c5][t0]; | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c4 = 3 * b0; c4 <= min(3, 3 * b0 + 2); c4 += 1) { | |
for (int c5 = 20 * b1; c5 <= min(31, 20 * b1 + 19); c5 += 1) { | |
for (int c6 = 5 * b2; c6 <= 5 * b2 + 4; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
_output_0[-3*b0 + c4][-20*b1 + c5][-5*b2 + c6][c7] = (_output_0[-3*b0 + c4][-20*b1 + c5][-5*b2 + c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
} | |
} | |
} | |
} | |
} | |
__syncthreads(); | |
for (int c3 = 0; c3 <= min(2, -3 * b0 + 3); c3 += 1) { | |
for (int c4 = 0; c4 <= min(19, -20 * b1 + 31); c4 += 1) { | |
for (int c5 = 0; c5 <= 4; c5 += 1) { | |
output[3*b0 + c3][20*b1 + c4][5*b2 + c5][t0] = _output_0[c3][c4][c5][t0]; | |
} | |
} | |
} | |
__syncthreads(); | |
} | |
} | |
Generation 3 Job[Compiled, GPU] (8, 7)/10 Time (us): best: 111 median: 604 worst: 640 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c4 = 3 * b0; c4 <= min(3, 3 * b0 + 2); c4 += 1) { | |
for (int c5 = 20 * b1; c5 <= min(31, 20 * b1 + 19); c5 += 1) { | |
for (int c6 = 5 * b2; c6 <= 5 * b2 + 4; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 3 Job[Compiled, GPU] (9, 8)/10 Time (us): best: 111 median: 558 worst: 640 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c1 = 8 * b1; c1 <= 31; c1 += 16) { | |
for (int c2 = 0; c2 <= 9; c2 += 4) { | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 0; c5 <= 7; c5 += 1) { | |
for (int c6 = 0; c6 <= min(3, -c2 + 9); c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c1 + c5][c2 + c6][c7] = (output[c4][c1 + c5][c2 + c6][c7] + input[c4][c1 + c5][t0 + 2*c2 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 3 Job[Compiled, GPU] (10, 9)/10 Time (us): best: 111 median: 558 worst: 640 | |
template<typename T> inline __device__ T floord(T n, T d) { | |
return n < 0 ? - (-n + d - 1)/d : n / d; | |
} | |
// Halide type handling | |
typedef int int32; | |
typedef long int64; | |
typedef float float32; | |
typedef double float64; | |
extern "C" { | |
__global__ void avgpool_4_32_20_20(int32 B, int32 C, int32 H, int32 W, float32* poutput, float32* pinput) { | |
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | |
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | |
float32 (*output)[32][10][10] = reinterpret_cast<float32 (*)[32][10][10]>(poutput); | |
float32 (*input)[32][20][20] = reinterpret_cast<float32 (*)[32][20][20]>(pinput); | |
for (int c2 = 0; c2 <= 9; c2 += 5) { | |
for (int c4 = 0; c4 <= 3; c4 += 1) { | |
for (int c5 = 20 * b1; c5 <= min(31, 20 * b1 + 19); c5 += 1) { | |
for (int c6 = c2; c6 <= c2 + 4; c6 += 1) { | |
for (int c7 = 0; c7 <= 9; c7 += 1) { | |
for (int c8 = 0; c8 <= 3; c8 += 1) { | |
if (c8 >= 2 * t0 && 2 * t0 + 1 >= c8) { | |
output[c4][c5][c6][c7] = (output[c4][c5][c6][c7] + input[c4][c5][t0 + 2*c6][-2*t0 + 2*c7 + c8]); | |
} | |
__syncthreads(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
Generation 3 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 111 median: 558 worst: 640 Generation 3 Job[Compiled, GPU] (10, 10)/10 Time (us): best: 111 median: 558 worst: 640 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment