Last active: August 23, 2016 15:15
Setting to use device 0
Build Status = -2 ( Err = -11 )
Log: (1387:0) : error : atomic function "atomic_cmpxchg" not supported
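The status pair matches CL_BUILD_ERROR (-2) and CL_BUILD_PROGRAM_FAILURE (-11): the device compiler rejects atomic_cmpxchg, which has been a core built-in for 32-bit integers since OpenCL 1.1, so the device or driver is presumably reporting an older OpenCL version. A minimal host-side sketch for checking what the device claims before building — identifiers here are illustrative, not taken from this gist:

/* Print the OpenCL version and extension string a device reports. */
#include <CL/cl.h>
#include <stdio.h>

static void print_device_caps(cl_device_id dev) {
  char buf[4096];
  /* atomic_cmpxchg requires the device to report OpenCL 1.1 or later. */
  clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(buf), buf, NULL);
  printf("CL_DEVICE_VERSION:    %s\n", buf);
  /* The source below also keys off cl_khr_fp64 and cl_khr_int64_base_atomics. */
  clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, sizeof(buf), buf, NULL);
  printf("CL_DEVICE_EXTENSIONS: %s\n", buf);
}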
Sources:
#ifndef __OPENCL_VERSION__
#define __kernel
#define __global
#define __constant
#define __local
#define get_global_id(x) 0
#define get_global_size(x) 0
#define get_local_id(x) 0
#define get_local_size(x) 0
#define FLT_MAX 0
#define FLT_MIN 0
#define cl_khr_fp64
#define cl_amd_fp64
#define DOUBLE_SUPPORT_AVAILABLE
#define CLK_LOCAL_MEM_FENCE
#define CLK_GLOBAL_MEM_FENCE
#define Dtype float
#define barrier(x)
#define atomic_cmpxchg(x, y, z) x
#define signbit(x) x
#define int_tp long
#define uint_tp unsigned long
#define int_tpc long
#define uint_tpc unsigned long
#endif
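// Note: the #ifndef __OPENCL_VERSION__ block above appears to be a host-side
// stub: when this file is parsed by a plain C compiler (where
// __OPENCL_VERSION__ is undefined), the kernel qualifiers and builtins
// collapse to no-ops so the source can be syntax-checked outside the driver.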
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define TYPE_FLOAT 1
#define TYPE_DOUBLE 2
#if defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define DOUBLE_SUPPORT_AVAILABLE
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#define DOUBLE_SUPPORT_AVAILABLE
#endif
#if defined(cl_khr_int64_base_atomics)
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define ATOMICS_64_AVAILABLE
#endif
// Types used for parameters, offset computations and so on
#define int_tp int
#define uint_tp unsigned int
// Definitions used to cast the types above as needed
#define int_tpc int
#define uint_tpc unsigned int
#define Dtype float
#define Dtype2 float2
#define Dtype4 float4
#define Dtype8 float8
#define Dtype16 float16
#define TYPE TYPE_FLOAT
#ifndef __OPENCL_VERSION__
#include "header.cl"
#endif
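// For reference: TEMPLATE(name,Dtype) pastes through CONCAT, and because the
// arguments are macro-expanded before the ## paste, TEMPLATE(relu_forward,Dtype)
// becomes the symbol relu_forward_float here; rebuilding the same source with
// Dtype defined as double would produce the _double variants instead.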
__kernel void TEMPLATE(relu_forward,Dtype)(const int_tp n,
                                           __global const Dtype* in,
                                           __global Dtype* out,
                                           Dtype negative_slope) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;
  }
}
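// (The loop above is a grid-stride loop: each work-item starts at its global
// id and advances by the total work-item count, so any launch size covers all
// n elements. A plausible host-side launch, names illustrative only:
//   size_t global = 16384;  /* any multiple of the work-group size */
//   clEnqueueNDRangeKernel(queue, relu_forward_float, 1, NULL, &global,
//                          NULL, 0, NULL, NULL);
// The same pattern is used by nearly every kernel in this file.)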
__kernel void TEMPLATE(relu_backward,Dtype)(const int_tp n,
                                            __global const Dtype* in_diff,
                                            __global const Dtype* in_data,
                                            __global Dtype* out_diff,
                                            Dtype negative_slope) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out_diff[index] = in_diff[index]
        * ((in_data[index] > 0 ? 1.0 : 0.0)
           + (in_data[index] <= 0 ? 1.0 : 0.0) * negative_slope);
  }
}
__kernel void TEMPLATE(tanh_forward,Dtype)(const int_tp n,
                                           __global const Dtype* in,
                                           __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out[index] = tanh(in[index]);
  }
}
__kernel void TEMPLATE(tanh_backward,Dtype)(const int_tp n,
                                            __global const Dtype* in_diff,
                                            __global const Dtype* out_data,
                                            __global Dtype* out_diff) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    Dtype tanhx = out_data[index];
    out_diff[index] = in_diff[index] * (1 - tanhx * tanhx);
  }
}
__kernel void TEMPLATE(sigmoid_forward,Dtype)(const int_tp n,
                                              __global const Dtype* in,
                                              __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out[index] = 1.0 / (1.0 + exp(-in[index]));
  }
}
__kernel void TEMPLATE(sigmoid_backward,Dtype)(const int_tp n,
                                               __global const Dtype* in_diff,
                                               __global const Dtype* out_data,
                                               __global Dtype* out_diff) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    const Dtype sigmoid_x = out_data[index];
    out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x);
  }
}
__kernel void TEMPLATE(threshold,Dtype)(const int_tp n, const Dtype threshold,
                                        __global const Dtype* in,
                                        __global Dtype* out) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out[index] = in[index] > threshold ? 1.0 : 0.0;
  }
}
__kernel void TEMPLATE(prelu_forward,Dtype)(const int_tp n, const int_tp channels,
                                            const int_tp dim,
                                            __global const Dtype* in,
                                            __global Dtype* out,
                                            __global const Dtype* slope_data,
                                            const int_tp div_factor) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    int_tp c = (index / dim) % channels / div_factor;
    out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
  }
}
__kernel void TEMPLATE(prelu_backward,Dtype)(const int_tp n, const int_tp channels,
                                             const int_tp dim,
                                             __global const Dtype* in_diff,
                                             __global const Dtype* in_data,
                                             __global Dtype* out_diff,
                                             __global const Dtype* slope_data,
                                             const int_tp div_factor) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    int_tp c = (index / dim) % channels / div_factor;
    out_diff[index] = in_diff[index]
        * ((in_data[index] > 0 ? 1.0 : 0.0)
           + (in_data[index] <= 0 ? 1.0 : 0.0) * slope_data[c]);
  }
}
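// (In the PReLU kernels, c = (index / dim) % channels / div_factor selects
// the slope: with div_factor == 1 every channel has its own slope, while
// div_factor == channels maps all channels to slope_data[0] — matching the
// channel_shared option of Caffe's PReLU layer, which these kernels appear
// to mirror.)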
__kernel void TEMPLATE(prelu_param_backward,Dtype)(const int_tp n, const int_tp rows,
                                                   const int_tp rowPitch,
                                                   __global const Dtype* in_diff,
                                                   __global const Dtype* in_data,
                                                   __global Dtype* out_diff) {
  for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) {
    out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0 ? 1.0 : 0.0);
    for (int k = 1; k < rows; k++) {
      out_diff[index] += in_diff[index + k * rowPitch]
          * in_data[index + k * rowPitch]
          * (in_data[index + k * rowPitch] <= 0 ? 1.0 : 0.0);
    }
  }
}
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(gpu_set,Dtype)(const int_tp n, const Dtype alpha, __global Dtype* y) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index] = alpha; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(br_forward,Dtype)(const int_tp count, const int_tp inner_dim, | |
__global const Dtype* in, | |
__global const Dtype* permut, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / (inner_dim); | |
int_tp in_n = (int_tp) (permut[n]); | |
out[index] = in[in_n * (inner_dim) + index % (inner_dim)]; | |
} | |
} | |
__kernel void TEMPLATE(br_backward,Dtype)(const int_tp count, const int_tp inner_dim, | |
__global const Dtype* in, | |
__global const Dtype* top_indexes, | |
__global const Dtype* begins, | |
__global const Dtype* counts, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / (inner_dim); | |
out[index] = 0; | |
int_tp lower = (int_tp) (begins[n]); | |
int_tp upper = lower + (int_tp) (counts[n]); | |
for (int_tp i = lower; i < upper; ++i) { | |
int_tp in_n = (int_tp) (top_indexes[i]); | |
out[index] += in[in_n * (inner_dim) + index % (inner_dim)]; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(null_kernel,Dtype)(Dtype arg) { | |
Dtype out = arg; | |
} | |
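// (null_kernel — like the *_phony kernels later in this file — presumably
// exists only so the compiled program always contains at least one kernel,
// even when every real kernel in a section is compiled out by #ifdef guards.)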
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(bias_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const Dtype* bias, | |
const int_tp bias_dim, | |
const int_tp inner_dim, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp bias_index = (index / inner_dim) % bias_dim; | |
out[index] = in[index] + bias[bias_index]; | |
} | |
} | |
__kernel void TEMPLATE(scale_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const Dtype* scale, | |
const int_tp scale_dim, | |
const int_tp inner_dim, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp scale_index = (index / inner_dim) % scale_dim; | |
out[index] = in[index] * scale[scale_index]; | |
} | |
} | |
__kernel void TEMPLATE(scale_bias_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const Dtype* scale, | |
__global const Dtype* bias, | |
const int_tp scale_dim, | |
const int_tp inner_dim, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp scale_index = (index / inner_dim) % scale_dim; | |
out[index] = in[index] * scale[scale_index] + bias[scale_index]; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(bnll_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
if (in[index] > 0.0f) { | |
out[index] = in[index] + log((Dtype) (1.0 + exp(-in[index]))); | |
} else { | |
out[index] = log((Dtype) (1.0 + exp(in[index]))); | |
} | |
} | |
} | |
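// (The branch above is the usual overflow-safe evaluation of
// BNLL(x) = log(1 + exp(x)): for x > 0 it is rewritten as
// x + log(1 + exp(-x)), so exp() only ever sees a non-positive argument
// and cannot overflow.)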
__kernel void TEMPLATE(bnll_backward,Dtype)(const int_tp n, | |
__global const Dtype* in_diff, | |
__global const Dtype* in_data, | |
__global Dtype* out_diff) { | |
Dtype kBNLL_THRESHOLD = 50.; | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
Dtype expval = exp(min(in_data[index], kBNLL_THRESHOLD)); | |
out_diff[index] = in_diff[index] * expval / (expval + 1.); | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(kernel_channel_max,Dtype)(const int_tp num, const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* data, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < num * spatial_dim; index += | |
get_global_size(0)) { | |
int_tp n = index / spatial_dim; | |
int_tp s = index % spatial_dim; | |
float maxval = -FLT_MAX; | |
for (int_tp c = 0; c < channels; ++c) { | |
maxval = max((Dtype)(data[(n * channels + c) * spatial_dim + s]), (Dtype)maxval); | |
} | |
out[index] = maxval; | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_subtract,Dtype)(const int_tp count, const int_tp num, | |
const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* channel_max, | |
__global Dtype* data) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / channels / spatial_dim; | |
int_tp s = index % spatial_dim; | |
data[index] -= channel_max[n * spatial_dim + s]; | |
} | |
} | |
__kernel void TEMPLATE(kernel_exp,Dtype)(const int_tp count, __global const Dtype* data, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
out[index] = exp(data[index]); | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_sum,Dtype)(const int_tp num, const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* data, | |
__global Dtype* channel_sum) { | |
for (int_tp index = get_global_id(0); index < num * spatial_dim; index += | |
get_global_size(0)) { | |
int_tp n = index / spatial_dim; | |
int_tp s = index % spatial_dim; | |
Dtype sum = 0; | |
for (int_tp c = 0; c < channels; ++c) { | |
sum += data[(n * channels + c) * spatial_dim + s]; | |
} | |
channel_sum[index] = sum; | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_div,Dtype)(const int_tp count, const int_tp num, | |
const int_tp channels, const int_tp spatial_dim, | |
__global const Dtype* channel_sum, | |
__global Dtype* data) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / channels / spatial_dim; | |
int_tp s = index % spatial_dim; | |
data[index] /= channel_sum[n * spatial_dim + s]; | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_dot,Dtype)(const int_tp num, const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* data_1, | |
__global const Dtype* data_2, | |
__global Dtype* channel_dot) { | |
for (int_tp index = get_global_id(0); index < num * spatial_dim; index += | |
get_global_size(0)) { | |
int_tp n = index / spatial_dim; | |
int_tp s = index % spatial_dim; | |
Dtype dot = 0; | |
for (int_tp c = 0; c < channels; ++c) { | |
dot += (data_1[(n * channels + c) * spatial_dim + s] | |
* data_2[(n * channels + c) * spatial_dim + s]); | |
} | |
channel_dot[index] = dot; | |
} | |
} | |
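// (Taken together, the kernel_channel_* kernels above implement a
// channel-wise softmax: channel_max and channel_subtract shift each spatial
// position by its per-channel maximum for numerical stability, kernel_exp
// exponentiates, channel_sum and channel_div normalize; kernel_channel_dot
// computes the per-position dot product needed by the softmax backward pass.)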
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(concat,Dtype)(const int_tp nthreads, __global const Dtype* in_data, | |
const int forward, const int_tp num_concats, | |
const int_tp concat_size, | |
const int_tp top_concat_axis, | |
const int_tp bottom_concat_axis, | |
const int_tp offset_concat_axis, | |
__global Dtype* out_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp total_concat_size = concat_size * bottom_concat_axis; | |
const int_tp concat_num = index / total_concat_size; | |
const int_tp concat_index = index % total_concat_size; | |
const int_tp top_index = concat_index | |
+ (concat_num * top_concat_axis + offset_concat_axis) * concat_size; | |
if (forward == 1) { | |
out_data[top_index] = in_data[index]; | |
} else { | |
out_data[index] = in_data[top_index]; | |
} | |
} | |
} | |
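// (Index math for concat: each index splits into concat_num (which of the
// num_concats slices) and concat_index (position within the bottom blob's
// slice); top_index re-bases that slice at offset_concat_axis along the top
// blob's concatenation axis. With forward == 1 the kernel scatters bottom
// into top; otherwise it gathers top back into bottom for the backward pass.)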
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(cll_backward,Dtype)(const int_tp count, const int_tp channels, | |
const Dtype margin, const Dtype alpha, __global const Dtype* y, | |
__global const Dtype* diff, __global const Dtype* dist_sq, | |
__global Dtype *bottom_diff) { | |
for (int_tp i = get_global_id(0); i < count; | |
i += get_global_size(0)) { | |
int_tp n = i / channels; // the num index, to access y and dist_sq | |
if (trunc(y[n]) != 0.) { // similar pairs | |
bottom_diff[i] = alpha * diff[i]; | |
} else { // dissimilar pairs | |
Dtype mdist = 0.; | |
Dtype beta = 0.; | |
Dtype dist = sqrt(dist_sq[n]); | |
mdist = (margin - dist); | |
beta = -alpha * mdist / (dist + 1e-4) * diff[i]; | |
if (mdist > 0.) { | |
bottom_diff[i] = beta; | |
} else { | |
bottom_diff[i] = 0; | |
} | |
} | |
} | |
} | |
__kernel void TEMPLATE(cll_backward_legacy,Dtype)(const int count, const int channels, | |
const Dtype margin, const Dtype alpha, __global Dtype* y, | |
__global Dtype* diff, __global Dtype* dist_sq, | |
__global Dtype* bottom_diff) { | |
for (int_tp i = get_global_id(0); i < count; | |
i += get_global_size(0)) { | |
int n = i / channels; // the num index, to access y and dist_sq | |
if (trunc(y[n]) != 0.) { // similar pairs | |
bottom_diff[i] = alpha * diff[i]; | |
} else { // dissimilar pairs | |
Dtype mdist = 0.; | |
Dtype beta = 0.; | |
mdist = (margin - dist_sq[n]); | |
beta = -alpha; | |
if (mdist > 0.) { | |
bottom_diff[i] = beta; | |
} else { | |
bottom_diff[i] = 0; | |
} | |
} | |
} | |
} | |
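// (Both contrastive-loss backward kernels use gradient alpha * diff for
// similar pairs; for dissimilar pairs the first applies
// -alpha * (margin - d) / d * diff with d = sqrt(dist_sq) while the margin
// is violated, whereas the _legacy variant applies the margin to the squared
// distance and uses the constant -alpha, matching the older formulation of
// the loss.)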
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(conv_layer_spatial_phony,Dtype)(Dtype arg) { | |
Dtype out = arg; | |
} | |
#define __CAT(x, y) x##y | |
#define CAT(x, y) __CAT(x, y) | |
#define LOOP1(VAR, STMT) (STMT); (VAR)++; | |
#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT)) | |
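// (LOOP(N, VAR, STMT) force-unrolls STMT N times via the CAT token paste;
// for example LOOP(3, i, body) expands to
//   (body); (i)++; (body); (i)++; (body); (i)++;
// so STMT must be a single statement that indexes with VAR, and VAR ends up
// incremented N times.)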
#ifdef MULTI | |
__kernel void CFMulti(__global Dtype* image_data, int_tp image_offset, | |
__global Dtype* kernel_data, int_tp kernel_offset, | |
__global Dtype* bias,const int_tp bias_offset, | |
__global Dtype* convolved_image,const int_tp convolved_image_offset, | |
const ushort WIDTH, | |
const ushort HEIGHT, | |
const ushort OUTPUT_W, | |
const ushort OUTPUT_H) { | |
const int_tp outputX = get_global_id(0); | |
const int_tp outputY = get_global_id(1); | |
const int_tp kernelNum = get_global_id(2)*ZPAR; | |
if(outputX < OUTPUT_W && outputY < OUTPUT_H) | |
{ | |
Dtype sum[ZPAR];
Dtype4 vectorSum[ZPAR];
for(int_tp kern = 0; kern < ZPAR; kern++)
{
sum[kern] = 0.0f;
vectorSum[kern] = (Dtype4)(0.0f, 0.0f, 0.0f, 0.0f);
}
const int_tp currentKernelOffset = kernel_offset + kernelNum*KERNEL_H*KERNEL_W*CHANNELS; | |
const int_tp biasIndex=bias_offset + kernelNum; | |
const int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W; | |
const int_tp imageSize = WIDTH*HEIGHT; | |
const int_tp float4Reads = KERNEL_W / 4; | |
const int_tp floatReads = KERNEL_W % 4; | |
Dtype4 imageCache; | |
__global Dtype* image_dataPtrFloat = (image_data + (image_offset + local_image_offset)); | |
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset)); | |
for(int_tp c = 0; c < CHANNELS; c++) | |
{ | |
for(int_tp y = 0; y < KERNEL_H; y++) | |
{ | |
for(int_tp x=0; x< float4Reads; x++) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[x]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
{ | |
vectorSum[kern] += imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[x]; | |
} | |
} | |
if(floatReads == 1) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
vectorSum[kern].s0 += ( imageCache * ( (__global Dtype4*) &(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]) )[float4Reads] ).s0; | |
} | |
else if(floatReads == 2) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
vectorSum[kern].s01 += (imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[float4Reads]).s01; | |
} | |
else if(floatReads == 3) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
vectorSum[kern].s012 += (imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[float4Reads]).s012; | |
} | |
image_dataPtrFloat += WIDTH; | |
kernel_dataPtrFloat += KERNEL_W; | |
} | |
image_dataPtrFloat += imageSize - WIDTH*KERNEL_H; | |
} | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
sum[kern] = vectorSum[kern].x + vectorSum[kern].y + vectorSum[kern].z + vectorSum[kern].w; | |
if(APPLY_BIAS == 1) | |
{ | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
if(kernelNum+kern < OUTPUT_Z) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX] = | |
sum[kern] + bias[biasIndex +kern]; | |
} | |
else | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
if(kernelNum+kern < OUTPUT_Z) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX] = sum[kern]; | |
} | |
} | |
#endif | |
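// (The CFMulti kernel above vectorizes each filter row as KERNEL_W / 4 full
// Dtype4 multiply-accumulates plus a 1-3 element tail handled through the
// .s0 / .s01 / .s012 swizzles, accumulating into vectorSum before the final
// horizontal add into sum[kern].)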
#ifdef MULTI_11 | |
__kernel void CFMulti_11_11_4(__global Dtype* image_data, int_tp image_offset, | |
__global Dtype* kernel_data, int_tp kernel_offset, | |
__global Dtype* bias,const int_tp bias_offset, | |
__global Dtype* convolved_image,const int_tp convolved_image_offset, | |
const ushort WIDTH, | |
const ushort HEIGHT, | |
const ushort OUTPUT_W, | |
const ushort OUTPUT_H) { | |
int_tp outputX = get_global_id(0)*XPAR; | |
int_tp outputY = get_global_id(1)*YPAR; | |
int_tp kernelNum = get_global_id(2)*ZPAR; | |
if(outputX < OUTPUT_W && outputY < OUTPUT_H) | |
{ | |
Dtype sum[XPAR*YPAR*ZPAR]; | |
for(int_tp kern =0; kern < XPAR*YPAR*ZPAR; kern++) | |
{ | |
sum[kern] = 0.0f; | |
} | |
int_tp currentKernelOffset = kernel_offset + kernelNum*KERNELSIZE*CHANNELS; | |
int_tp biasIndex=bias_offset + kernelNum; | |
int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W; | |
int_tp imageSize = WIDTH*HEIGHT; | |
int_tp index; | |
__global Dtype* image_dataPtrFloat = (image_data + (image_offset + local_image_offset)); | |
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset)); | |
Dtype16 imageCache; | |
Dtype8 imageCacheR; | |
Dtype8 kernelCache; | |
Dtype4 kernelCacheR; | |
for(int_tp c = 0; c < CHANNELS; c++) | |
{ | |
for(int_tp y = 0; y < 11; y++) | |
{ | |
imageCache = ((__global Dtype16*)image_dataPtrFloat)[0]; | |
imageCacheR =((__global Dtype8*)image_dataPtrFloat)[2]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
{ | |
kernelCache = ((__global Dtype8*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[0]; | |
kernelCacheR = ((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[2]; | |
index = kern*XPAR; | |
sum[index + 0] += dot(imageCache.S0123,kernelCache.S0123); | |
sum[index + 1] += dot(imageCache.S4567,kernelCache.S0123); | |
sum[index + 2] += dot(imageCache.S89AB,kernelCache.S0123); | |
sum[index + 3] += dot(imageCache.SCDEF,kernelCache.S0123); | |
sum[index + 0] += dot(imageCache.S4567,kernelCache.S4567); | |
sum[index + 1] += dot(imageCache.S89AB,kernelCache.S4567); | |
sum[index + 2] += dot(imageCache.SCDEF,kernelCache.S4567); | |
sum[index + 3] += dot(imageCacheR.S0123,kernelCache.S4567); | |
sum[index + 0] += dot(imageCache.S89A,kernelCacheR.S012); | |
sum[index + 1] += dot(imageCache.SCDE,kernelCacheR.S012); | |
sum[index + 2] += dot(imageCacheR.S012,kernelCacheR.S012); | |
sum[index + 3] += dot(imageCacheR.S456,kernelCacheR.S012); | |
} | |
image_dataPtrFloat += WIDTH; | |
kernel_dataPtrFloat += KERNEL_W; | |
} | |
image_dataPtrFloat += imageSize - WIDTH*KERNEL_H; | |
} | |
if(APPLY_BIAS == 1) | |
{ | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
{ | |
for(int_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX + wi] = | |
sum[kern*XPAR + wi] + bias[biasIndex +kern]; | |
} | |
} | |
else | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
for(int_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX + wi] = sum[kern*XPAR + wi]; | |
} | |
} | |
#endif | |
#ifdef MULTI_GEN | |
__kernel void CFMulti_6(__global const Dtype* restrict image_data, const int_tp image_offset, | |
__global const Dtype* restrict kernel_data, const int_tp kernel_offset, | |
__global const Dtype* restrict bias,const int_tp bias_offset, | |
__global Dtype* restrict convolved_image,const int_tp convolved_image_offset, | |
const ushort WIDTH, | |
const ushort HEIGHT, | |
const ushort OUTPUT_W, | |
const ushort OUTPUT_H) { | |
const int_tp outputX = get_global_id(0)*XPAR; | |
const int_tp outputY = get_global_id(1)*YPAR; | |
const int_tp kernelNum = get_global_id(2)*ZPAR; | |
if(outputX < OUTPUT_W && outputY < OUTPUT_H) | |
{ | |
Dtype sum[XPAR*YPAR*ZPAR]; | |
for(uint_tp kern = 0; kern < XPAR*YPAR*ZPAR; kern++) | |
sum[kern] = 0.0f; | |
const int_tp currentKernelOffset = kernel_offset + kernelNum*KERNELSIZE*CHANNELS; | |
const int_tp biasIndex=bias_offset + kernelNum; | |
const int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W; | |
const int_tp imageSize = WIDTH*HEIGHT; | |
int_tp index; | |
__global const Dtype* image_dataPtrFloat[2]; | |
image_dataPtrFloat[0] = (image_data + (image_offset + local_image_offset)); | |
image_dataPtrFloat[1] = image_dataPtrFloat[0]; | |
__global const Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset)); | |
DTImage imageCache[YPAR]; | |
DTKernel kernelCache; | |
Dtype4 temp; | |
for(uint_tp c = 0; c < CHANNELS; c++) | |
{ | |
imageCache[0] = ((__global DTImage*)image_dataPtrFloat[1])[0]; | |
for(uint_tp preload = 1; preload < YPAR; preload++) | |
{ | |
image_dataPtrFloat[1] += WIDTH; | |
imageCache[preload] = ((__global DTImage*)image_dataPtrFloat[1])[0]; | |
} | |
int_tp y =0; | |
LOOP(KERNEL_H, y, | |
{ | |
int_tp kern=0; | |
LOOP(ZPAR, kern, | |
{ | |
kernelCache = ((__global DTKernel*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[0]; | |
index = kern*XPAR*YPAR; | |
for(uint_tp y_par = 0; y_par < YPAR; y_par++) | |
{ | |
temp = floatDotV4(imageCache[y_par],kernelCache); | |
sum[index + y_par*XPAR + 0] += temp.s0; | |
sum[index + y_par*XPAR + 1] += temp.s1; | |
sum[index + y_par*XPAR + 2] += temp.s2; | |
sum[index + y_par*XPAR + 3] += temp.s3; | |
} | |
}); | |
kernel_dataPtrFloat += KERNEL_W; | |
for(uint_tp rotateData = 0; rotateData < YPAR - 1; rotateData++) | |
imageCache[rotateData] = imageCache[rotateData + 1]; | |
image_dataPtrFloat[1] += WIDTH; | |
imageCache[YPAR - 1] = ((__global DTImage*)image_dataPtrFloat[1])[0]; | |
}); | |
image_dataPtrFloat[0] += imageSize; | |
image_dataPtrFloat[1] = image_dataPtrFloat[0]; | |
} | |
if(APPLY_BIAS == 1) | |
{ | |
for(uint_tp kern = 0; kern < ZPAR; kern++) | |
{ | |
for(uint_tp hi =0; hi < YPAR; hi++) | |
for(uint_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W && outputY + hi < OUTPUT_H) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + (outputY +hi)*OUTPUT_W + outputX + wi] = | |
sum[kern*XPAR*YPAR + XPAR*hi + wi] + bias[biasIndex +kern]; | |
} | |
} | |
else | |
for(uint_tp kern = 0; kern < ZPAR; kern++) | |
for(uint_tp hi =0; hi < YPAR; hi++) | |
for(uint_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W && outputY + hi < OUTPUT_H) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + (outputY + hi)*OUTPUT_W + outputX + wi] = sum[kern*XPAR*YPAR +XPAR*hi +wi]; | |
} | |
} | |
#endif | |
//Begin IDLF kernels below here | |
#ifdef IDLF | |
#define activation_function(x) (x) | |
#if 0 | |
#define _IW INPUT_WIDTH | |
#define _IH INPUT_HEIGHT | |
#define _OW OUTPUT_WIDTH | |
#define _OH OUTPUT_HEIGHT | |
#endif | |
#define _ID INPUT_DEPTH | |
#define _OD NUM_FILTERS | |
#define FILTER_DEPTH INPUT_DEPTH | |
#define NUM_INPUT INPUT_DEPTH | |
#define NUM_OUTPUT NUM_FILTERS | |
#define KERNEL FILTER_WIDTH | |
// convolution stride, same for x and y | |
#define K_STRIDE STRIDEX | |
#ifndef IWPAD | |
#define IWPAD 0 | |
#endif | |
#ifndef IHPAD | |
#define IHPAD 0 | |
#endif | |
#define OUT_BLOCK_SIZE (OUT_BLOCK_WIDTH*OUT_BLOCK_HEIGHT) | |
#ifndef MASTER_OUT_BLOCK_WIDTH | |
#define MASTER_OUT_BLOCK_WIDTH OUT_BLOCK_WIDTH | |
#endif | |
#ifndef MASTER_OUT_BLOCK_HEIGHT | |
#define MASTER_OUT_BLOCK_HEIGHT OUT_BLOCK_HEIGHT | |
#endif | |
// Each work-item computes a 4x6 region of one output map.
// Each work-group (which will be mapped to 1 SIMD16 EU thread) will compute 16 different feature maps, but each feature map is for the same 4x6 region of the input image.
// NDRange: (_OW+pad)/ OUT_BLOCK_WIDTH, (_OH+pad)/OUT_BLOCK_HEIGHT, _OD/OUT_BLOCK_DEPTH
//#define SIMD_SIZE 16
// NOTE: reqd_work_group_size does not guarantee that SIMD16 mode will be used; the compiler could choose to use two SIMD8 threads instead, and if that happens the code will break.
#ifdef SIMD16 | |
#define TILE_X ((OUT_BLOCK_WIDTH - 1) * STRIDEX + KERNEL) | |
#define TILE_Y ((OUT_BLOCK_HEIGHT - 1) * STRIDEY + KERNEL) | |
#if (TILE_X % 4) != 0 | |
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE))) | |
kernel void | |
convolve_simd16( // __global float *inputs, __global float* weights, __global float* outputs | |
__global float* inputs_base, | |
filter_qualifier float* weights_base, | |
__global float* biases_base, | |
__global float* outputs_base, | |
const ushort _IW, | |
const ushort _IH, | |
const ushort _OW, | |
const ushort _OH) | |
{ | |
__global float* outputs = outputs_base; | |
__global float* inputs = inputs_base; | |
filter_qualifier float* weights = weights_base; | |
__global float* biases = biases_base; | |
uint_tp oc = get_global_id(0) * MASTER_OUT_BLOCK_WIDTH; // oc = Output Column | |
uint_tp or = get_global_id(1) * MASTER_OUT_BLOCK_HEIGHT;// or = Output Row | |
uint_tp fm = get_global_id(2);// fm = Feature Map = od = Output Depth | |
uint_tp fmg = get_group_id(2); | |
uint_tp lid = get_local_id(2); | |
float in[IN_BUFFER_SIZE];// load 11x16 block of input data, really only need 11x15 for 4x6 outputs, but keep it simple. | |
//float out[24]; // 4x6 block of outputs that is SIMD_SIZE deep (along the Feature Map dimension). | |
float out[OUT_BLOCK_SIZE]; | |
uint_tp in_addr; | |
// find weights address of the given neuron (lid is the index)
uint_tp weight_addr = (fmg % (_OD/SIMD_SIZE)) * INPUT_DEPTH * KERNEL * KERNEL * SIMD_SIZE + lid; | |
for(int_tp i=0;i<OUT_BLOCK_SIZE;i++) { | |
out[i]=0.0f; | |
} | |
uint_tp num_in_batch = fm / _OD; | |
uint_tp input_batch_offset = num_in_batch * (_IH + IHPAD) * (_IW + IWPAD) * TOTAL_INPUT_DEPTH_SIZE; | |
for(int_tp kd = 0; kd < _ID; kd++) | |
{ | |
in_addr = input_batch_offset + (kd + INPUT_START_Z) * (_IH + IHPAD) * (_IW + IWPAD) + (or*K_STRIDE + INPUT_START_Y) * (_IW + IWPAD) + (oc*K_STRIDE + INPUT_START_X) + lid; | |
// read 11x16 input block into registers. | |
for(uint_tp reg = 0; reg < IN_BUFFER_SIZE; reg++) { | |
in[reg] = inputs[in_addr]; // read 16 elements | |
in_addr += (_IW + IWPAD);// move to next row down | |
} | |
// WEIGHT_PREF may only be 4 or 8; other values are not supported.
#define WEIGHT_PREF 8 | |
union { | |
float w[WEIGHT_PREF]; | |
uint8 ui8; | |
} weight_buf; | |
int_tp w_idx=0; | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
uint_tp orig_weight_addr = weight_addr; | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; | |
int_tp kr = 0; // kr = Kernel Row | |
LOOP(KERNEL, kr,// LOOP is a macro that unrolls the loop. | |
{ | |
int_tp kc = 0; // kc = Kernel Column | |
LOOP(KERNEL, kc, | |
{ | |
for(int_tp br=0; br < OUT_BLOCK_HEIGHT; br++) { | |
for(int_tp bc=0; bc < OUT_BLOCK_WIDTH; bc++) { | |
float input = intel_sub_group_shuffle( in[br * K_STRIDE + kr], bc * K_STRIDE + kc); | |
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]); | |
} | |
} | |
// We assume KERNEL_W is equal to KERNEL_H here. | |
if ((w_idx + 1) % WEIGHT_PREF == 0 | |
#if KERNEL*KERNEL % 8 != 0 | |
&& ((w_idx + 1) <= (KERNEL * KERNEL - WEIGHT_PREF)) | |
#endif | |
) { | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details. | |
} | |
#if KERNEL*KERNEL % 8 == 0 | |
// need to do nothing | |
#else | |
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL * KERNEL - WEIGHT_PREF))) | |
#if KERNEL*KERNEL % 8 == 1 | |
weight_buf.w[0] = weights[weight_addr]; | |
#elif KERNEL*KERNEL % 4 == 0 | |
weight_buf.ui8.s0123 = intel_sub_group_block_read4((__global uint *)&weights[weight_addr]); | |
#else | |
// should never get here if kernel_w equals kernel_h; just in case.
#error unsupported kernel size. | |
#endif | |
#endif | |
++w_idx; | |
}); | |
}); | |
weight_addr = orig_weight_addr + KERNEL * KERNEL * SIMD_SIZE; | |
} | |
#ifdef IMAGE_AS_OUTPUT | |
// TODO: no ULT for that one yet! | |
uint_tp out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD)) * (_OW + OWPAD) * (_OH + OHPAD);// out_addr indexes into start of 16 feature maps. | |
#else | |
// we need this address calculation for outputs because we support views and batching | |
uint_tp out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD) ) * (_OW + OWPAD) * (_OH + OHPAD); | |
#endif | |
out_addr += or * (_OW + OWPAD) + oc; // offset of the output block this work-item computes
// we need this address calculation for biases because we support views and batching | |
float bias = biases[(fm) % _OD ]; | |
#ifndef WRITE_PADDED_VALUES | |
if(get_global_id(0) != (get_global_size(0)-1) && | |
get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
#endif | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
// this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer. | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c])); | |
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
#ifndef WRITE_PADDED_VALUES | |
} else if ( get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c])); | |
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
} | |
else if ( get_global_id(0) != (get_global_size(0)-1) ) | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c])); | |
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
} | |
else | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(c,r*(_OW + OWPAD)),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c])); | |
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
} | |
#endif //#ifndef WRITE_PADDED_VALUES | |
} | |
#endif | |
#if TILE_X % 4 == 0 | |
#define TILE_Y_STRIDE (64 / TILE_X) | |
#define INVEC_NUM ((TILE_Y + TILE_Y_STRIDE - 1) / TILE_Y_STRIDE) | |
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE))) | |
kernel void | |
convolve_simd16( // __global float *inputs, __global float* weights, __global float* outputs | |
__global float* inputs_base, | |
filter_qualifier float* weights_base, | |
__global float* biases_base, | |
__global float* outputs_base, | |
const ushort _IW, | |
const ushort _IH, | |
const ushort _OW, | |
const ushort _OH) | |
{ | |
__global float* outputs = outputs_base; | |
__global float* inputs = inputs_base; | |
filter_qualifier float* weights = weights_base; | |
__global float* biases = biases_base; | |
uint_tp oc = get_global_id(0) * MASTER_OUT_BLOCK_WIDTH; // oc = Output Column | |
uint_tp or = get_global_id(1) * MASTER_OUT_BLOCK_HEIGHT;// or = Output Row | |
uint_tp fm = get_global_id(2);// fm = Feature Map = od = Output Depth | |
uint_tp fmg = get_group_id(2); | |
uint_tp lid = get_local_id(2); | |
float out[OUT_BLOCK_SIZE]; | |
uint_tp in_addr; | |
// find weights address of the given neuron (lid is the index)
uint_tp weight_addr = (fmg % (_OD/SIMD_SIZE)) * INPUT_DEPTH * KERNEL * KERNEL * SIMD_SIZE + lid; | |
for(int_tp i=0;i<OUT_BLOCK_SIZE;i++) { | |
out[i]=0.0f; | |
} | |
uint_tp num_in_batch = ( fm ) / _OD; | |
uint_tp input_batch_offset = num_in_batch * (_IH + IHPAD) * (_IW + IWPAD) * TOTAL_INPUT_DEPTH_SIZE; | |
in_addr = input_batch_offset + INPUT_START_Z * (_IH + IHPAD) * (_IW + IWPAD) + (or*STRIDEY + INPUT_START_Y) * (_IW + IWPAD) + (oc*STRIDEX + INPUT_START_X) | |
+ ( lid / ( TILE_X / 4 ) ) * (_IW + IWPAD) * STRIDEY // y tile offset | |
+ ( lid % ( TILE_X / 4 ) ) * 4 * STRIDEX; // x tile offset | |
for(int_tp kd = 0; kd < _ID; kd++) | |
{ | |
union { | |
float4 in_vec[INVEC_NUM]; | |
float in_array[INVEC_NUM * 4]; | |
} in_buf; | |
uint_tp in_offset = in_addr; | |
int_tp reg = 0; | |
#if INVEC_NUM == 1 | |
LOOP(1, reg, | |
#elif INVEC_NUM == 2 | |
LOOP(2, reg, | |
#elif INVEC_NUM == 3 | |
LOOP(3, reg, | |
#elif INVEC_NUM == 4 | |
LOOP(4, reg, | |
#else | |
#error INVEC_NUM too large.
#endif | |
{ | |
in_buf.in_vec[reg] = *(global float4*)(inputs + in_offset); // read 16 elements | |
in_offset += (_IW + IWPAD) * TILE_Y_STRIDE; | |
}); | |
in_addr += (_IH + IHPAD) * (_IW + IWPAD); | |
// WEIGHT_PREF may only be 4 or 8; other values are not supported.
#define WEIGHT_PREF 8 | |
union { | |
float w[WEIGHT_PREF]; | |
uint8 ui8; | |
} weight_buf; | |
int_tp w_idx=0; | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
uint_tp orig_weight_addr = weight_addr; | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; | |
#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4)) | |
int_tp kr = 0; // kr = Kernel Row | |
LOOP(KERNEL, kr,// LOOP is a macro that unrolls the loop. | |
{ | |
int_tp kc = 0; // kc = Kernel Column | |
LOOP(KERNEL, kc, | |
{ | |
for(int_tp br=0; br < OUT_BLOCK_HEIGHT; br++) { | |
for(int_tp bc=0; bc < OUT_BLOCK_WIDTH; bc++) { | |
float input = BLOCK_IN((br * STRIDEY + kr) * TILE_X + bc * STRIDEX + kc);//intel_sub_group_shuffle( in[br * K_STRIDE + kr], bc * K_STRIDE + kc); | |
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]); | |
} | |
} | |
// We assume KERNEL_W is equal to KERNEL_H here. | |
if ((w_idx + 1) % WEIGHT_PREF == 0 | |
#if KERNEL*KERNEL % 8 != 0 | |
&& ((w_idx + 1) <= (KERNEL * KERNEL - WEIGHT_PREF)) | |
#endif | |
) { | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details. | |
} | |
#if KERNEL*KERNEL % 8 == 0 | |
// need to do nothing | |
#else | |
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL * KERNEL - WEIGHT_PREF))) | |
#if KERNEL*KERNEL % 8 == 1 | |
weight_buf.w[0] = weights[weight_addr]; | |
#elif KERNEL*KERNEL % 4 == 0 | |
weight_buf.ui8.s0123 = intel_sub_group_block_read4((__global uint *)&weights[weight_addr]); | |
#else | |
// should never get here if kernel_w equals kernel_h; just in case.
#error unsupported kernel size. | |
#endif | |
#endif | |
++w_idx; | |
}); | |
}); | |
weight_addr = orig_weight_addr + KERNEL * KERNEL * SIMD_SIZE; | |
} | |
// we need this address calculation for outputs because we support views and batching | |
uint_tp out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD) ) * (_OW + OWPAD) * (_OH + OHPAD); | |
out_addr += or * (_OW + OWPAD) + oc; // offset of the output block this work-item computes
// we need this address calculation for biases because we support views and batching | |
float bias = biases[(fm) % _OD ]; | |
#ifndef WRITE_PADDED_VALUES | |
if(get_global_id(0) != (get_global_size(0)-1) && | |
get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
#endif | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
// this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer. | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
#ifndef WRITE_PADDED_VALUES | |
} else if ( get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
} | |
else if ( get_global_id(0) != (get_global_size(0)-1) ) | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
} | |
else | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
} | |
#endif //#ifndef WRITE_PADDED_VALUES | |
} | |
#endif // Stride > 2 | |
#endif | |
#endif | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(copyImage, Dtype) | |
(__global Dtype* image_data, | |
int_tp image_offset, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp adjustedHeight, const int_tp adjustedWidth, | |
const int_tp pad_h, const int_tp pad_w, | |
__global Dtype* output_image, | |
const int_tp output_offset, | |
const int_tp batch_size) { | |
uint_tp sX = get_global_id(0); | |
uint_tp sY = get_global_id(1); | |
uint_tp sZ = get_global_id(2); | |
int_tp in_y = sY - pad_h; | |
int_tp in_x = sX - pad_w; | |
int_tp batch_offset = 0; | |
int_tp adjusted_batch_offset = 0; | |
for(uint_tp batch_idx = 0; batch_idx < batch_size; batch_idx++) { | |
int_tp dst_offset = adjusted_batch_offset + output_offset + sZ*adjustedHeight*adjustedWidth + sY*adjustedWidth +sX; | |
int_tp src_offset = batch_offset + image_offset + sZ*height*width + in_y*width + in_x; | |
if((in_y >= 0 && in_y < height && in_x >= 0 && in_x < width)) | |
output_image[dst_offset] = image_data[src_offset]; | |
else | |
output_image[dst_offset] = 0; | |
batch_offset += height * width * channels; | |
adjusted_batch_offset += adjustedHeight * adjustedWidth * channels; | |
} | |
} | |
__kernel void TEMPLATE(copyWeightsSwizzled, Dtype) | |
(__global Dtype* weightIn, | |
__global Dtype* weightOut, | |
const int_tp kernel_w, | |
const int_tp kernel_h, | |
const int_tp channels, | |
const int_tp outputs, | |
const int_tp swizzleFactor) { | |
uint_tp sX = get_global_id(0); | |
//Original location | |
//Output location | |
int_tp outputSublayer = channels / swizzleFactor; | |
int_tp outputSublayerIndex = channels % swizzleFactor; | |
int_tp filter = sX / (kernel_w*kernel_h*channels); | |
int_tp kernel_X = sX % kernel_w; | |
int_tp kernel_Y = (sX / kernel_w) % kernel_h; | |
int_tp kernel_C = (sX / (kernel_w * kernel_h)) % channels; | |
int_tp FP = filter / swizzleFactor; | |
int_tp F1 = filter % swizzleFactor; | |
weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1] | |
= weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X]; | |
} | |
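// (copyWeightsSwizzled regroups filters in blocks of swizzleFactor so that
// the filters of a block are interleaved at the innermost dimension —
// presumably the layout expected by the SIMD block reads in the
// convolve_simd16 kernels above.)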
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(crop_copy, Dtype)(const int_tp n, const int_tp height, | |
const int_tp width, | |
const int_tp src_outer_stride, | |
const int_tp src_inner_stride, | |
const int_tp dest_outer_stride, | |
const int_tp dest_inner_stride, | |
__global const Dtype* src, | |
const int_tp src_off, | |
__global Dtype* dest, | |
const int_tp dest_off) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
int_tp src_start = index / height * src_outer_stride | |
+ index % height * src_inner_stride; | |
int_tp dest_start = index / height * dest_outer_stride | |
+ index % height * dest_inner_stride; | |
for (int_tp i = 0; i < width; ++i) { | |
dest[dest_off + dest_start + i] = src[src_off + src_start + i]; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(dropout_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const uint_tp* mask, | |
const uint_tp threshold, | |
const Dtype scale, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = in[index] * ((mask[index] > threshold)?1.0:0.0) * scale; | |
} | |
} | |
__kernel void TEMPLATE(dropout_backward,Dtype)( | |
const int_tp n, __global const Dtype* in_diff, | |
__global const uint_tp* mask, const uint_tp threshold, | |
const Dtype scale, | |
__global Dtype* out_diff) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out_diff[index] = in_diff[index] * ((mask[index] > threshold)?1.0:0.0) * scale; | |
} | |
} | |
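// (The mask-and-scale form above is inverted dropout: surviving activations
// are multiplied by scale — 1/(1 - dropout_ratio) in Caffe — so no extra
// rescaling is needed at test time.)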
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(eltwise_max_forward,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data_a, | |
__global const Dtype* bottom_data_b, const int_tp blob_idx, | |
__global Dtype* top_data, | |
__global int_tp* mask) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
if (bottom_data_a[index] > bottom_data_b[index]) { | |
// only update for very first bottom_data blob (blob_idx == 0) | |
if (blob_idx == 0) { | |
maxval = bottom_data_a[index]; | |
top_data[index] = maxval; | |
maxidx = blob_idx; | |
mask[index] = maxidx; | |
} | |
} else { | |
maxval = bottom_data_b[index]; | |
top_data[index] = maxval; | |
maxidx = blob_idx + 1; | |
mask[index] = maxidx; | |
} | |
} | |
} | |
__kernel void TEMPLATE(eltwise_max_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int_tp blob_idx, | |
__global const int_tp* mask, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
Dtype gradient = 0; | |
if (mask[index] == blob_idx) { | |
gradient += top_diff[index]; | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(elu_forward,Dtype)(const int n, __global const Dtype* in, | |
__global Dtype* out, | |
Dtype alpha) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = in[index] > 0 ? in[index] : alpha * (exp(in[index]) - 1.0); | |
} | |
} | |
__kernel void TEMPLATE(elu_backward,Dtype)(const int n, __global const Dtype* in_diff, | |
__global const Dtype* out_data, | |
__global const Dtype* in_data, | |
__global Dtype* out_diff, | |
Dtype alpha) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out_diff[index] = | |
in_data[index] > 0 ? | |
in_diff[index] : in_diff[index] * (out_data[index] + alpha); | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(embed_forward,Dtype)(const int_tp nthreads, | |
__global const Dtype* bottom_data, | |
__global const Dtype* weight, | |
const int_tp M, const int_tp N, | |
const int_tp K, | |
__global Dtype* top_data) { | |
for (int_tp top_index = get_global_id(0); top_index < nthreads; | |
top_index += get_global_size(0)) { | |
const int_tp n = top_index / N; | |
const int_tp d = top_index % N; | |
const int_tp index = (int_tp)(bottom_data[n]); | |
const int_tp weight_index = index * N + d; | |
top_data[top_index] = weight[weight_index]; | |
} | |
} | |
// atomic_add from: http://suhorukov.blogspot.com/2011/12/opencl-11-atomic-operations-on-floating.html | |
#if (TYPE == TYPE_FLOAT) | |
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) { | |
union { | |
uint_tp intVal; | |
Dtype floatVal; | |
} newVal; | |
union { | |
uint_tp intVal; | |
Dtype floatVal; | |
} prevVal; | |
do { | |
prevVal.floatVal = *source; | |
newVal.floatVal = prevVal.floatVal + operand; | |
} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); | |
} | |
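// (atomic_add above emulates a floating-point atomic with a compare-and-swap
// retry loop: the float is type-punned to uint through the union, and
// atomic_cmpxchg only commits the new value if no other work-item updated
// *source since prevVal was read; otherwise the loop re-reads and retries.
// This is the sole atomic_cmpxchg call in the source, presumably the one the
// build log at the top of this file rejects.)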
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, | |
__global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K, | |
__global Dtype* weight_diff) { | |
for (int_tp top_index = get_global_id(0); top_index < nthreads; | |
top_index += get_global_size(0)) { | |
const int_tp n = top_index / N; | |
const int_tp d = top_index % N; | |
const int_tp index = (int_tp)(bottom_data[n]); | |
const int_tp weight_index = index * N + d; | |
TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index)); | |
} | |
} | |
#endif | |
#if (TYPE == TYPE_DOUBLE) | |
#ifdef ATOMICS_64_AVAILABLE | |
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) { | |
union { | |
unsigned long intVal; | |
Dtype floatVal; | |
} newVal; | |
union { | |
unsigned long intVal; | |
Dtype floatVal; | |
} prevVal; | |
do { | |
prevVal.floatVal = *source; | |
newVal.floatVal = prevVal.floatVal + operand; | |
} while (atom_cmpxchg((volatile __global unsigned long *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); | |
} | |
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, | |
__global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K, | |
__global Dtype* weight_diff) { | |
for (int_tp top_index = get_global_id(0); top_index < nthreads; | |
top_index += get_global_size(0)) { | |
const int_tp n = top_index / N; | |
const int_tp d = top_index % N; | |
const int_tp index = (int_tp)(bottom_data[n]); | |
const int_tp weight_index = index * N + d; | |
TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index)); | |
} | |
} | |
#endif | |
#endif | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(fft_phony,Dtype)(Dtype arg) { | |
Dtype out = arg; | |
} | |
#ifdef FFT | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
#define DtypeComplex Dtype2 | |
__kernel void TEMPLATE(copy2buffer_cyclic_shift_in,Dtype)( | |
__global Dtype* fft_gpu_weights_real, const int_tp offset_fft_gpu_weights_real, | |
__global Dtype* weight, const int_tp offset_weight, | |
const int_tp ker_size, const int_tp ch_gr, const int_tp ker_size_ch_gr, | |
const int_tp ker_w, const int_tp ker_c_h, const int_tp ker_c_w, | |
const int_tp fft_height, const int_tp fft_width, const int_tp complex_w_len) { | |
fft_gpu_weights_real += offset_fft_gpu_weights_real; | |
weight += offset_weight; | |
int_tp gId = get_global_id(0); | |
int_tp out = gId / ker_size_ch_gr; | |
int_tp c = (gId - out * ker_size_ch_gr) / ker_size; | |
int_tp map_offset = out * ch_gr + c; | |
int_tp map_offset_ker_size = map_offset * ker_size; | |
int_tp pos_in_map = gId - map_offset_ker_size; | |
int_tp h = pos_in_map / ker_w; | |
int_tp h_ker_w = h * ker_w; | |
int_tp w = pos_in_map - h_ker_w; | |
int_tp src_idx = map_offset_ker_size + h_ker_w + w; | |
int_tp ky = h - ker_c_h; | |
if (ky < 0) ky += fft_height; | |
int_tp kx = w - ker_c_w; | |
if (kx < 0) kx += fft_width; | |
int_tp dst_idx = (map_offset * fft_height + ky) * complex_w_len + kx; | |
fft_gpu_weights_real[dst_idx] = weight[src_idx]; | |
} | |
/* Use when width < 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_in_naive,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp size, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp height, const int_tp width, const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp h = gId / width; | |
int_tp w = gId - (h * width); | |
int_tp dst_idx = (h*stride_h + pad_h)*width_out + (w*stride_w + pad_w); | |
map_out[dst_idx] = map_in[gId]; | |
} | |
/* Use when width < 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_in_naive_2d,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp map_out_size, const int_tp size, const int_tp count, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp height, const int_tp width, const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId_x = get_global_id(0); | |
int_tp gId_y = get_global_id(1); | |
int_tp h = gId_x / width; | |
int_tp w = gId_x - (h * width); | |
int_tp src_idx = gId_y * size + gId_x; | |
int_tp dst_idx = gId_y * map_out_size + | |
(h * stride_h + pad_h) * width_out + (w * stride_w + pad_w); | |
map_out[dst_idx] = map_in[src_idx]; | |
} | |
/* Use when width >= 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_in,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp size, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp height, const int_tp width, const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp count = size >> 2; | |
int_tp gId4 = gId << 2; | |
int_tp h = gId4 / width; | |
int_tp w = gId4 - (h * width); | |
int_tp dst_h = h*stride_h + pad_h; | |
int_tp dst_w = w*stride_w + pad_w; | |
int_tp dst_idx = dst_h*width_out + dst_w; | |
if (gId < count) { | |
Dtype4 map_in_cache4 = vload4(gId, map_in); | |
int_tp has_pad = width - dst_w; | |
if (has_pad >= 4) { | |
vstore4(map_in_cache4, dst_idx >> 2, map_out); | |
} else { | |
if (0 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w; | |
} | |
map_out[dst_idx] = map_in_cache4.x; | |
if (1 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 1; | |
} | |
map_out[dst_idx+1] = map_in_cache4.y; | |
if (2 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 2; | |
} | |
map_out[dst_idx+2] = map_in_cache4.z; | |
if (3 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 3; | |
} | |
map_out[dst_idx+3] = map_in_cache4.w; | |
dst_h += 1; | |
dst_w = pad_w; | |
} | |
} else if (gId == count) { | |
int_tp res = size - (count << 2); /* size % 4 */ | |
if (res > 0) { | |
Dtype4 map_in_cache4 = 0.f; | |
if (res >= 1) | |
map_in_cache4.x = map_in[gId4]; | |
if (res >= 2) | |
map_in_cache4.y = map_in[gId4+1]; | |
if (res == 3) | |
map_in_cache4.z = map_in[gId4+2]; | |
int_tp has_pad = width - dst_w; | |
if (has_pad >= 4) { | |
vstore4(map_in_cache4, dst_idx >> 2, map_out); | |
} else { | |
if (0 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w; | |
} | |
map_out[dst_idx] = map_in_cache4.x; | |
if (1 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 1; | |
} | |
map_out[dst_idx+1] = map_in_cache4.y; | |
if (2 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 2; | |
} | |
map_out[dst_idx+2] = map_in_cache4.z; | |
if (3 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 3; | |
} | |
map_out[dst_idx+3] = map_in_cache4.w; | |
dst_h += 1; | |
dst_w = pad_w; | |
} | |
} | |
} | |
} | |
/* Use when width >= 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_in_2d,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp map_out_size, const int_tp size, const int_tp count, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp height, const int_tp width, const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp gId_y = get_global_id(1); | |
int_tp gId4 = gId << 2; | |
int_tp h = gId4 / width; | |
int_tp w = gId4 - (h * width); | |
int_tp dst_h = h*stride_h + pad_h; | |
int_tp dst_w = w*stride_w + pad_w; | |
int_tp dst_idx = dst_h*width_out + dst_w; | |
const __global Dtype* map_in_2d = map_in + gId_y * size; | |
__global Dtype* map_out_2d = map_out + gId_y * map_out_size; | |
if (gId < count) { | |
Dtype4 map_in_cache4 = vload4(gId, map_in_2d); | |
int_tp has_pad = width - dst_w; | |
if (has_pad >= 4) { | |
vstore4(map_in_cache4, dst_idx >> 2, map_out_2d); | |
} else { | |
if (0 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w; | |
} | |
map_out_2d[dst_idx] = map_in_cache4.x; | |
if (1 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 1; | |
} | |
map_out_2d[dst_idx+1] = map_in_cache4.y; | |
if (2 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 2; | |
} | |
map_out_2d[dst_idx+2] = map_in_cache4.z; | |
if (3 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 3; | |
} | |
map_out_2d[dst_idx+3] = map_in_cache4.w; | |
dst_h += 1; | |
dst_w = pad_w; | |
} | |
} else if (gId == count) { | |
int_tp res = size - (count << 2); /* size % 4 */ | |
if (res > 0) { | |
Dtype4 map_in_cache4 = 0.f; | |
if (res >= 1) | |
map_in_cache4.x = map_in_2d[gId4]; | |
if (res >= 2) | |
map_in_cache4.y = map_in_2d[gId4+1]; | |
if (res == 3) | |
map_in_cache4.z = map_in_2d[gId4+2]; | |
int_tp has_pad = width - dst_w; | |
if (has_pad >= 4) { | |
vstore4(map_in_cache4, dst_idx >> 2, map_out_2d); | |
} else { | |
if (0 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w; | |
} | |
map_out_2d[dst_idx] = map_in_cache4.x; | |
if (1 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 1; | |
} | |
map_out_2d[dst_idx+1] = map_in_cache4.y; | |
if (2 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 2; | |
} | |
map_out_2d[dst_idx+2] = map_in_cache4.z; | |
if (3 == has_pad) { | |
dst_idx += width_out + pad_w - dst_w - 3; | |
} | |
map_out_2d[dst_idx+3] = map_in_cache4.w; | |
dst_h += 1; | |
dst_w = pad_w; | |
} | |
} | |
} | |
} | |
/* Use when width_out < 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_out_naive,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp size, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp fft_height, const int_tp fft_width, | |
const int_tp ker_center_h, const int_tp ker_center_w, | |
const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp h_out = gId / width_out; | |
int_tp w_out = gId - (h_out * width_out); | |
int_tp h = h_out * stride_h + ker_center_h; | |
int_tp w = w_out * stride_w + ker_center_w; | |
int_tp src_idx = h*fft_width + w; | |
map_out[gId] = map_in[src_idx]; | |
} | |
/* Use when width_out < 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_out_naive_2d,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp size, const int_tp count, const int_tp map_in_size, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp fft_height, const int_tp fft_width, | |
const int_tp ker_center_h, const int_tp ker_center_w, | |
const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp h_out = gId / width_out; | |
int_tp w_out = gId - (h_out * width_out); | |
int_tp h = h_out * stride_h + ker_center_h; | |
int_tp w = w_out * stride_w + ker_center_w; | |
int_tp src_idx = out * map_in_size + h*fft_width + w; | |
int_tp dst_idx = out * size + gId; | |
map_out[dst_idx] = map_in[src_idx]; | |
} | |
/* Use when width_out >= 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_out,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp size, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp fft_height, const int_tp fft_width, | |
const int_tp ker_c_h, const int_tp ker_c_w, | |
const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp count = size >> 2; | |
int_tp gId4 = gId << 2; | |
int_tp h_out = gId4 / width_out; | |
int_tp w_out = gId4 - (h_out * width_out); | |
int_tp h = h_out * stride_h + ker_c_h; | |
int_tp w = w_out * stride_w + ker_c_w; | |
int_tp src_idx = h*fft_width + w; | |
if (gId < count) { | |
Dtype4 map_in_cache4; | |
int_tp has_pad = width_out - (w - pad_w); | |
if (has_pad >= 4) { | |
map_in_cache4 = vload4(src_idx >> 2, map_in); | |
} else { | |
int_tp right_elements = fft_width - width_out; | |
if (0 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.x = map_in[src_idx]; | |
if (1 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.y = map_in[src_idx+1]; | |
if (2 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.z = map_in[src_idx+2]; | |
if (3 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.w = map_in[src_idx+3]; | |
} | |
vstore4(map_in_cache4, gId, map_out); | |
} else if (gId == count) { | |
int_tp res = size - (count << 2); /* size % 4 */ | |
if (res > 0) { | |
for (int_tp i = gId4; i < size; ++i) { | |
map_out[i] = map_in[src_idx]; | |
src_idx++; | |
} | |
} | |
} | |
} | |
/* Use when width_out >= 4 */ | |
__kernel void TEMPLATE(copy2buffer_left_top_out_2d,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp size, const int_tp count, const int_tp map_in_size, | |
const int_tp height_out, const int_tp width_out, | |
const int_tp fft_height, const int_tp fft_width, | |
const int_tp ker_c_h, const int_tp ker_c_w, | |
const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp gId4 = gId << 2; | |
int_tp h_out = gId4 / width_out; | |
int_tp w_out = gId4 - (h_out * width_out); | |
int_tp h = h_out * stride_h + ker_c_h; | |
int_tp w = w_out * stride_w + ker_c_w; | |
int_tp src_idx = h*fft_width + w; | |
const __global Dtype* map_in_2d = map_in + out * map_in_size; | |
__global Dtype* map_out_2d = map_out + out * size; | |
if (gId < count) { | |
Dtype4 map_in_cache4; | |
int_tp has_pad = width_out - (w - pad_w); | |
if (has_pad >= 4) { | |
map_in_cache4 = vload4(src_idx >> 2, map_in_2d); | |
} else { | |
int_tp right_elements = fft_width - width_out; | |
if (0 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.x = map_in_2d[src_idx]; | |
if (1 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.y = map_in_2d[src_idx+1]; | |
if (2 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.z = map_in_2d[src_idx+2]; | |
if (3 == has_pad) { | |
src_idx += right_elements; | |
} | |
map_in_cache4.w = map_in_2d[src_idx+3]; | |
} | |
vstore4(map_in_cache4, gId, map_out_2d); | |
} else if (gId == count) { | |
int_tp res = size - (count << 2); /* size % 4 */ | |
if (res > 0) { | |
const __global Dtype4* map_in_2d_4 = | |
(const __global Dtype4*)(map_in_2d + src_idx); | |
__global Dtype4* map_out_2d_4 = (__global Dtype4*)(map_out_2d + gId4); | |
if (res == 3) { | |
map_out_2d_4[0].xyz = map_in_2d_4[0].xyz; | |
} else if (res == 2) { | |
map_out_2d_4[0].xy = map_in_2d_4[0].xy; | |
} else if (res == 1) { | |
map_out_2d_4[0].x = map_in_2d_4[0].x; | |
} | |
} | |
} | |
} | |
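/* Cyclic-shift copy-out: reads the (fft_height x fft_width) map with a */ | |
/* circular shift by the kernel center (ky/kx wrap around when negative) */ | |
/* and writes a dense strided output, presumably undoing the circular */ | |
/* alignment introduced by the FFT-based convolution. */ | |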
__kernel void TEMPLATE(copy2buffer_cyclic_shift_out,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp width_out, | |
const int_tp fft_height, const int_tp fft_width, | |
const int_tp ker_center_h, const int_tp ker_center_w, | |
const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp h_out = gId / width_out; | |
int_tp w_out = gId - (h_out * width_out); | |
int_tp h = h_out * stride_h + pad_h; | |
int_tp w = w_out * stride_w + pad_w; | |
int_tp ky = h - ker_center_h; | |
if (ky < 0) ky += fft_height; | |
int_tp kx = w - ker_center_w; | |
if (kx < 0) kx += fft_width; | |
int_tp src_idx = ky*fft_width + kx; | |
map_out[gId] = map_in[src_idx]; | |
} | |
__kernel void TEMPLATE(copy2buffer_cyclic_shift_out_2d,Dtype)(__global Dtype* map_out, | |
const int_tp offset_map_out, | |
const __global Dtype* map_in, const int_tp offset_map_in, | |
const int_tp map_out_size, const int_tp map_in_size, | |
const int_tp width_out, | |
const int_tp fft_height, const int_tp fft_width, | |
const int_tp ker_center_h, const int_tp ker_center_w, | |
const int_tp stride_h, const int_tp stride_w, | |
const int_tp pad_h, const int_tp pad_w) { | |
map_out += offset_map_out; | |
map_in += offset_map_in; | |
int_tp gId = get_global_id(0); | |
int_tp gId_y = get_global_id(1); | |
int_tp h_out = gId / width_out; | |
int_tp w_out = gId - (h_out * width_out); | |
int_tp h = h_out * stride_h + pad_h; | |
int_tp w = w_out * stride_w + pad_w; | |
int_tp ky = h - ker_center_h; | |
if (ky < 0) ky += fft_height; | |
int_tp kx = w - ker_center_w; | |
if (kx < 0) kx += fft_width; | |
int_tp src_idx = gId_y * map_in_size + ky*fft_width + kx; | |
int_tp dst_idx = gId_y * map_out_size + gId; | |
map_out[dst_idx] = map_in[src_idx]; | |
} | |
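/* Complex multiply-accumulate with conjugation. Values are interleaved */ | |
/* (re, im) pairs, two complex numbers per Dtype4. Per pair: */ | |
/*   re += x1*x2 + y1*y2,  im += y1*x2 - x1*y2, */ | |
/* i.e. dst += s1 * conj(s2), accumulated over the ch_gr channels. */ | |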
__kernel void TEMPLATE(complex_conjugate_multiplication_1d,Dtype)(__global Dtype* dst, | |
const int_tp offset_dst, | |
const __global Dtype* src1, const int_tp offset_src1, | |
const __global Dtype* src2, const int_tp offset_src2, | |
const int_tp ch_gr) { | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp gId = get_global_id(0); | |
int_tp size = get_global_size(0); | |
Dtype4 dst_cache = 0.f; | |
int_tp src_idx; | |
Dtype4 s1_cache; | |
Dtype4 s2_cache; | |
for (int_tp c = 0; c < ch_gr; ++c) { | |
src_idx = size * c + gId; | |
s1_cache = vload4(src_idx, src1); | |
s2_cache = vload4(src_idx, src2); | |
dst_cache.x += s1_cache.x * s2_cache.x + s1_cache.y * s2_cache.y; | |
dst_cache.y += -s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x; | |
dst_cache.z += s1_cache.z * s2_cache.z + s1_cache.w * s2_cache.w; | |
dst_cache.w += -s1_cache.z * s2_cache.w + s1_cache.w * s2_cache.z; | |
} | |
((__global Dtype4*)(&dst[gId<<2]))[0] += dst_cache; | |
} | |
__kernel void TEMPLATE(complex_conjugate_multiplication_2d,Dtype)(__global Dtype* dst, | |
const int_tp offset_dst, | |
const __global Dtype* src1, const int_tp offset_src1, | |
const __global Dtype* src2, const int_tp offset_src2, | |
const int_tp out_gr, const int_tp map_size, const int_tp ch_gr) { | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp src1_idx, src2_idx; | |
int_tp dst_map_offset = map_size * out; | |
int_tp dst_idx = dst_map_offset + gId; | |
Dtype4 s1_cache, s2_cache; | |
Dtype4 dst_cache = 0.f; | |
int_tp map_offset = dst_map_offset * ch_gr; | |
for (int_tp i = 0; i < ch_gr; ++i) { | |
src1_idx = map_size * i + gId; | |
src2_idx = map_offset + src1_idx; | |
s1_cache = vload4(src1_idx, src1); | |
s2_cache = vload4(src2_idx, src2); | |
dst_cache.xz += mad( s1_cache.xz, s2_cache.xz, s1_cache.yw * s2_cache.yw); | |
dst_cache.yw += mad(-s1_cache.xz, s2_cache.yw, s1_cache.yw * s2_cache.xz); | |
} | |
vstore4(dst_cache, dst_idx, dst); | |
} | |
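/* Same product as the 2d kernel above, but src1 (shared by all output */ | |
/* maps of the group) is staged into local memory by the local_out == 0 */ | |
/* row and reused after the barrier. The early returns assume the */ | |
/* NDRange is padded by whole work-groups, since barrier() must be */ | |
/* reached uniformly within a work-group. */ | |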
__kernel void TEMPLATE(complex_conjugate_multiplication_2d_SLM,Dtype)( | |
__global Dtype* restrict dst, const int_tp offset_dst, | |
const __global Dtype* restrict src1, const int_tp offset_src1, | |
__local Dtype* local_src1, | |
const __global Dtype* restrict src2, const int_tp offset_src2, | |
const int_tp out_gr, const int_tp map_size, const int_tp ch_gr) { | |
int_tp gId = get_global_id(0); | |
if (gId >= map_size) return; /* Do not remove this */ | |
int_tp out = get_global_id(1); | |
if (out >= out_gr) return; /* Do not remove this */ | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp tId = get_local_id(0); | |
int_tp local_out = get_local_id(1); | |
int_tp tile_size = get_local_size(0); | |
Dtype4 s1_cache; | |
if (local_out == 0) { | |
for (int_tp c = 0; c < ch_gr; ++c) { | |
s1_cache = vload4(map_size * c + gId, src1); | |
vstore4(s1_cache, tile_size * c + tId, local_src1); | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int_tp dst_map_offset = map_size * out; | |
int_tp dst_idx = (dst_map_offset + gId) << 2; | |
Dtype4 dst_cache = 0.f; | |
Dtype4 s2_cache; | |
int_tp ch_offset = 0; | |
int_tp map_offset = dst_map_offset * ch_gr; | |
for (int_tp c = 0; c < ch_gr; ++c) { | |
ch_offset = map_size * c; | |
s1_cache = vload4(tile_size * c + tId, local_src1); | |
s2_cache = vload4(map_offset + ch_offset + gId, src2); | |
dst_cache.xz += mad( s1_cache.xz, s2_cache.xz, s1_cache.yw * s2_cache.yw); | |
dst_cache.yw += mad(-s1_cache.xz, s2_cache.yw, s1_cache.yw * s2_cache.xz); | |
} | |
((__global Dtype4*)(&dst[dst_idx]))[0] += dst_cache; | |
} | |
__kernel void TEMPLATE(complex_conjugate_multiplication_3d,Dtype)(__global Dtype* dst, | |
const int_tp offset_dst, | |
const __global Dtype* src1, const int_tp offset_src1, | |
const __global Dtype* src2, const int_tp offset_src2, | |
const int_tp out_gr, const int_tp size, const int_tp ch_gr) { | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp ch = get_global_id(2); | |
Dtype4 dst_cache = 0.f; | |
Dtype4 s1_cache = ((__global Dtype4*)(&(src1[(size*ch+gId)<<2])))[0]; | |
Dtype4 s2_cache = ((__global Dtype4*)(&(src2[(size*(out*ch_gr+ch)+gId)<<2])))[0]; | |
dst_cache.x = s1_cache.x * s2_cache.x + s1_cache.y * s2_cache.y; | |
dst_cache.y = -s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x; | |
dst_cache.z = s1_cache.z * s2_cache.z + s1_cache.w * s2_cache.w; | |
dst_cache.w = -s1_cache.z * s2_cache.w + s1_cache.w * s2_cache.z; | |
((__global Dtype4*)(&dst[(size*out+gId)<<2]))[0] += dst_cache; | |
} | |
__kernel void TEMPLATE(complex_conjugate_multiplication_3d_SLM,Dtype)(__global Dtype* dst, | |
const int_tp offset_dst, __local Dtype* local_dst, | |
const __global Dtype* src1, const int_tp offset_src1, | |
__local Dtype* local_src1, const __global Dtype* src2, | |
const int_tp offset_src2, const int_tp out_gr, const int_tp map_size, | |
const int_tp ch_gr) { | |
int_tp gId = get_global_id(0); | |
if (gId >= map_size) return; /* Do not remove this */ | |
int_tp out = get_global_id(1); | |
if (out >= out_gr) return; /* Do not remove this */ | |
int_tp ch = get_global_id(2); | |
if (ch >= ch_gr) return; /* Do not remove this */ | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp tId = get_local_id(0); | |
int_tp local_out = get_local_id(1); | |
int_tp tile_size = get_local_size(0); | |
Dtype4 s1_cache; | |
if (local_out == 0) { | |
s1_cache = vload4(map_size * ch + gId, src1); | |
vstore4(s1_cache, tile_size * ch + tId, local_src1); | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int_tp dst_map_offset = map_size * out; | |
int_tp dst_idx = (dst_map_offset + gId) << 2; | |
Dtype4 dst_cache = 0.f; | |
Dtype4 s2_cache; | |
s1_cache = vload4(tile_size * ch + tId, local_src1); | |
s2_cache = vload4((dst_map_offset * ch_gr) + (map_size * ch) + gId, src2); | |
dst_cache.x += s1_cache.x * s2_cache.x + s1_cache.y * s2_cache.y; | |
dst_cache.y += -s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x; | |
dst_cache.z += s1_cache.z * s2_cache.z + s1_cache.w * s2_cache.w; | |
dst_cache.w += -s1_cache.z * s2_cache.w + s1_cache.w * s2_cache.z; | |
((__global Dtype4*)(&dst[dst_idx]))[0] += dst_cache; | |
} | |
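/* Plain complex multiply (no conjugation): one src1 value is broadcast */ | |
/* against ch_gr src2 channels with re = x1*x2 - y1*y2 and */ | |
/* im = x1*y2 + y1*x2, accumulated into dst per channel. */ | |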
__kernel void TEMPLATE(complex_multiplication_1d,Dtype)(__global Dtype* dst, | |
const int_tp offset_dst, | |
const __global Dtype* src1, const int_tp offset_src1, | |
const __global Dtype* src2, const int_tp offset_src2, | |
const int_tp size, const int_tp ch_gr) { | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp gId = get_global_id(0); | |
Dtype4 s2_cache; | |
Dtype4 dst_cache = 0.f; | |
int_tp idx_with_ch; | |
Dtype4 s1_cache = vload4(gId, src1); | |
for (int_tp ch = 0; ch < ch_gr; ++ch) { | |
idx_with_ch = size * ch + gId; | |
s2_cache = vload4(idx_with_ch, src2); | |
dst_cache.xz = s1_cache.xz * s2_cache.xz - s1_cache.yw * s2_cache.yw; | |
dst_cache.yw = s1_cache.xz * s2_cache.yw + s1_cache.yw * s2_cache.xz; | |
((__global Dtype4*)(&dst[idx_with_ch<<2]))[0] += dst_cache; | |
} | |
} | |
__kernel void TEMPLATE(complex_multiplication_2d_SLM,Dtype)(__global Dtype* restrict dst, | |
const int_tp offset_dst, __local Dtype* local_dst, | |
const __global Dtype* restrict src1, const int_tp offset_src1, | |
const __global Dtype* restrict src2, const int_tp offset_src2, | |
const int_tp num_output, const int_tp size, const int_tp ch_gr) { | |
int_tp gId = get_global_id(0); | |
if (gId >= size) return; | |
int_tp out = get_global_id(1); | |
if (out >= num_output) return; | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp tId = get_local_id(0); | |
int_tp tOut = get_local_id(1); | |
int_tp tile_size = get_local_size(0); | |
int_tp local_out_size = get_local_size(1); | |
int_tp out_offset = out * size; | |
int_tp out_ch_offset = out_offset * ch_gr; | |
int_tp tile_size_in_all_ch = tile_size * ch_gr; | |
int_tp local_out_ch_offset = tOut * tile_size_in_all_ch; | |
int_tp src2_idx, local_dst_idx; | |
Dtype4 s2_cache, dst_cache; | |
int_tp src1_idx = out_offset + gId; | |
Dtype4 s1_cache = vload4(src1_idx, src1); | |
for (int_tp ch = 0; ch < ch_gr; ++ch) { | |
src2_idx = out_ch_offset + ch * size + gId; | |
s2_cache = vload4(src2_idx, src2); | |
dst_cache.xz = s1_cache.xz * s2_cache.xz - s1_cache.yw * s2_cache.yw; | |
dst_cache.yw = s1_cache.xz * s2_cache.yw + s1_cache.yw * s2_cache.xz; | |
local_dst_idx = local_out_ch_offset + ch * tile_size + tId; | |
vstore4(dst_cache, local_dst_idx, local_dst); | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int_tp start_idx, half_start_idx; | |
int_tp ch_offset; | |
int_tp this_idx, that_idx; | |
for (int_tp offset = local_out_size >> 1; offset > 0; offset >>= 1) { | |
if (tOut < offset) { | |
start_idx = tOut * tile_size_in_all_ch + tId; | |
half_start_idx = (tOut + offset) * tile_size_in_all_ch + tId; | |
for (int_tp ch = 0; ch < ch_gr; ++ch) { | |
ch_offset = ch * tile_size; | |
this_idx = (start_idx + ch_offset) << 2; | |
that_idx = (half_start_idx + ch_offset) << 2; | |
((__local Dtype4*)(&local_dst[this_idx]))[0] += | |
((__local Dtype4*)(&local_dst[that_idx]))[0]; | |
} | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
} | |
if (tOut == 0) { | |
for (int_tp ch = 0; ch < ch_gr; ++ch) { | |
dst_cache = vload4(tile_size * ch + tId, local_dst); | |
((__global Dtype4*)(&dst[(size * ch + gId)<<2]))[0] += dst_cache; | |
} | |
} | |
} | |
__kernel void TEMPLATE(complex_multiplication_3d,Dtype)(__global Dtype* dst, | |
const int_tp offset_dst, | |
const __global Dtype* src1, const int_tp offset_src1, | |
const __global Dtype* src2, const int_tp offset_src2, | |
const int_tp size, const int_tp ch_gr, const int_tp out_gr, const int_tp num_output) { | |
dst += offset_dst; | |
src1 += offset_src1; | |
src2 += offset_src2; | |
int_tp gId = get_global_id(0); | |
int_tp ch = get_global_id(1); | |
int_tp out = get_global_id(2); | |
int_tp g = out / out_gr; | |
ch += (g * ch_gr); | |
int_tp c_offset = ch - ((ch / ch_gr) * ch_gr); | |
__global Dtype2* dst_ch = ((__global Dtype2*)(dst)) + (size * ch); | |
__global Dtype2* src1_out = ((__global Dtype2*)(src1)) + (size * out); | |
__global Dtype2* src2_out_ch = ((__global Dtype2*)(src2)) + (size * (out * ch_gr + c_offset)); | |
Dtype2 s1_cache = src1_out[gId]; | |
Dtype2 s2_cache = src2_out_ch[gId]; | |
Dtype2 dst_cache = 0.f; | |
dst_cache.x = s1_cache.x * s2_cache.x - s1_cache.y * s2_cache.y; | |
dst_cache.y = s1_cache.x * s2_cache.y + s1_cache.y * s2_cache.x; | |
dst_ch[gId] += dst_cache; | |
} | |
/* Convert [RRRR...GGGG...BBBB...] to [RGBRGBRGBRGB...] */ | |
/* Reshape 2 */ | |
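/* Each work-item gathers the ch_gr complex values of one pixel (stride */ | |
/* `size` apart) and writes them contiguously, so the per-pixel channel */ | |
/* loops in the multiplication kernels read consecutive memory. */ | |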
__kernel void TEMPLATE(convert_data_to_channel_major,Dtype)(__global Dtype2* dst, | |
const __global Dtype2* src, const int_tp size, const int_tp ch_gr) { | |
int_tp gId = get_global_id(0); | |
__global Dtype* dst_ptr = (__global Dtype*)(dst + (gId * ch_gr)); | |
const __global Dtype* src_ptr = (const __global Dtype*)(src + gId); | |
Dtype2 s; | |
int_tp src_idx = 0; | |
for (int_tp i = 0; i < ch_gr; ++i) { | |
s = vload2(src_idx, src_ptr); | |
vstore2(s, i, dst_ptr); | |
src_idx += size; | |
} | |
} | |
/* Reshape 1 */ | |
/*__kernel void TEMPLATE(convert_data_to_channel_major,Dtype)(__global Dtype4* dst, | |
const __global Dtype4* src, const int_tp size, const int_tp ch_gr) { | |
int_tp gId = get_global_id(0); | |
const __global Dtype4* src_ptr4 = src + gId; | |
__global Dtype4* dst_ptr4 = dst + (gId * ch_gr); | |
for (int_tp i = 0; i < ch_gr; ++i) { | |
dst_ptr4[i] = src_ptr4[i*size]; | |
} | |
} | |
*/ | |
/* Convert multiple [RRRR...GGGG...BBBB...] to multiple [RGBRGBRGBRGB...] */ | |
/* Reshape 2 */ | |
__kernel void TEMPLATE(convert_weight_to_channel_major,Dtype)(__global Dtype2* dst, | |
const __global Dtype2* src, const int_tp size, const int_tp ch_gr, | |
const int_tp num_output) { | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp out_offset = out * (size * ch_gr); | |
__global Dtype* dst_ptr = (__global Dtype*)(dst + out_offset + (gId * ch_gr)); | |
const __global Dtype* src_ptr = | |
(const __global Dtype*)(src + out_offset + gId); | |
Dtype2 s; | |
int_tp src_idx = 0; | |
for (int_tp i = 0; i < ch_gr; ++i) { | |
s = vload2(src_idx, src_ptr); | |
vstore2(s, i, dst_ptr); | |
src_idx += size; | |
} | |
} | |
/* Reshape 1 */ | |
/* | |
__kernel void TEMPLATE(convert_weight_to_channel_major,Dtype)(__global Dtype4* dst, | |
const __global Dtype4* src, const int_tp size, const int_tp ch_gr, | |
const int_tp out_gr) { | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp out_offset = out * (size * ch_gr); | |
__global Dtype4* dst_ptr4 = dst + out_offset + (gId * ch_gr); | |
const __global Dtype4* src_ptr4 = src + out_offset + gId; | |
for (int_tp i = 0; i < ch_gr; ++i) { | |
dst_ptr4[i] = src_ptr4[size * i]; | |
} | |
} | |
*/ | |
/* Cdotc per element */ | |
/* Reshape 1 */ | |
/* | |
__kernel void TEMPLATE(batchedCdotc,Dtype)(__global Dtype4* dst, | |
const __global Dtype4* src1, const __global Dtype4* src2, | |
const int_tp size, const int_tp ch_gr, const int_tp out_gr) { | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp ch_offset = gId * ch_gr; | |
int_tp out_offset = out * size; | |
const __global Dtype* src1_ptr = (const __global Dtype*)(src1 + ch_offset); | |
const __global Dtype* src2_ptr = (const __global Dtype*)(src2 + (out_offset * ch_gr) + ch_offset); | |
Dtype4 cdotc = 0.f; | |
Dtype4 s1, s2; | |
for (int_tp c = 0; c < ch_gr; ++c) { | |
s1 = vload4(c, src1_ptr); | |
s2 = vload4(c, src2_ptr); | |
cdotc.xz += mad( s1.xz, s2.xz, s1.yw * s2.yw); | |
cdotc.yw += mad(-s1.xz, s2.yw, s1.yw * s2.xz); | |
} | |
__global Dtype4* dst_ptr4 = dst + out_offset + gId; | |
dst_ptr4[0] += cdotc; | |
} | |
*/ | |
/* Cdotc per two elements */ | |
/* Reshape 2 */ | |
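/* Complex dot product (cdotc) over ch_gr channels for one output pixel: */ | |
/* channels are consumed two complex values at a time via Dtype4, the */ | |
/* two partial lanes are reduced with dot(), and an odd trailing channel */ | |
/* is handled as a single Dtype2. */ | |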
__kernel void TEMPLATE(batchedCdotc,Dtype)(__global Dtype2* dst, | |
const __global Dtype2* src1, const __global Dtype2* src2, | |
const int_tp size, const int_tp ch_gr, const int_tp out_gr) { | |
int_tp gId = get_global_id(0); | |
int_tp out = get_global_id(1); | |
int_tp ch_offset = gId * ch_gr; | |
const __global Dtype* src1_ptr = (const __global Dtype*)(src1 + ch_offset); | |
const __global Dtype* src2_ptr = | |
(const __global Dtype*)(src2 + (out * size * ch_gr) + ch_offset); | |
Dtype4 cdotc4 = 0.f; | |
Dtype2 cdotc = 0.f; | |
Dtype4 s1, s2; | |
int_tp n = ch_gr >> 1; | |
int_tp r = ch_gr - (n << 1); | |
for (int_tp i = 0; i < n; ++i) { | |
s1 = vload4(i, src1_ptr); | |
s2 = vload4(i, src2_ptr); | |
cdotc4.xz += mad( s1.xz, s2.xz, s1.yw * s2.yw); | |
cdotc4.yw += mad(-s1.xz, s2.yw, s1.yw * s2.xz); | |
} | |
cdotc.x += dot(cdotc4.xz, (Dtype2)(1)); /* (Dtype2), not (float2): keeps the double build valid */ | |
cdotc.y += dot(cdotc4.yw, (Dtype2)(1)); | |
if (r == 1) { | |
const __global Dtype* src1_ptr2 = | |
(const __global Dtype*)(((const __global Dtype4*)(src1_ptr)) + n); | |
const __global Dtype* src2_ptr2 = | |
(const __global Dtype*)(((const __global Dtype4*)(src2_ptr)) + n); | |
Dtype2 t1 = vload2(0, src1_ptr2); | |
Dtype2 t2 = vload2(0, src2_ptr2); | |
cdotc.x += mad( t1.x, t2.x, t1.y * t2.y); | |
cdotc.y += mad(-t1.x, t2.y, t1.y * t2.x); | |
} | |
__global Dtype* dst_ptr = (__global Dtype*)(dst + (out * size) + gId); | |
vstore2(cdotc, 0, dst_ptr); | |
} | |
#endif | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(fillbuffer,Dtype)(const int_tp n, const char alpha, __global char* x, | |
const int_tp offx) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
x[index + offx] = alpha; | |
} | |
} | |
__kernel void TEMPLATE(fill,Dtype)(const int_tp n, const Dtype alpha, __global Dtype* x, | |
const int_tp offx) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
x[index + offx] = alpha; | |
} | |
} | |
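/* Host-side usage sketch (illustrative, not part of the original */ | |
/* source): enqueuing the fill kernel above via the standard OpenCL C */ | |
/* API. With Dtype=float the TEMPLATE macro names it "fill_float"; the */ | |
/* `program`, `queue` and `buf` handles are assumed to exist already. */ | |
/* | |
cl_kernel k = clCreateKernel(program, "fill_float", NULL); | |
cl_int n = 1024; float alpha = 1.0f; cl_int offx = 0; | |
clSetKernelArg(k, 0, sizeof(cl_int), &n); | |
clSetKernelArg(k, 1, sizeof(float), &alpha); | |
clSetKernelArg(k, 2, sizeof(cl_mem), &buf); | |
clSetKernelArg(k, 3, sizeof(cl_int), &offx); | |
size_t gws = 16384;  // grid-stride loop in the kernel covers any n | |
clEnqueueNDRangeKernel(queue, k, 1, NULL, &gws, NULL, 0, NULL, NULL); | |
*/ | |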
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
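/* im2col: each work-item fills the kernel_h*kernel_w column entries for */ | |
/* one output position (c_im, h_col, w_col). data_col is laid out as */ | |
/* (channels * kernel_h * kernel_w) rows of (height_col * width_col), */ | |
/* and taps that fall into the padding are written as 0. */ | |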
__kernel void TEMPLATE(im2col,Dtype)(const int_tp n, | |
__global const Dtype* data_im, | |
const int_tp data_im_off, | |
const int_tp height, const int_tp width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp pad_h, | |
const int_tp pad_w, const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp dilation_h, | |
const int_tp dilation_w, | |
const int_tp height_col, | |
const int_tp width_col, | |
__global Dtype* data_col, | |
const int_tp data_col_off) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp h_index = index / width_col; | |
const int_tp h_col = h_index % height_col; | |
const int_tp w_col = index % width_col; | |
const int_tp c_im = h_index / height_col; | |
const int_tp c_col = c_im * kernel_h * kernel_w; | |
const int_tp h_offset = h_col * stride_h - pad_h; | |
const int_tp w_offset = w_col * stride_w - pad_w; | |
__global Dtype* data_col_ptr = data_col + data_col_off; | |
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; | |
__global const Dtype* data_im_ptr = data_im + data_im_off; | |
data_im_ptr += (c_im * height + h_offset) * width + w_offset; | |
for (int_tp i = 0; i < kernel_h; ++i) { | |
for (int_tp j = 0; j < kernel_w; ++j) { | |
int_tp h_im = h_offset + i * dilation_h; | |
int_tp w_im = w_offset + j * dilation_w; | |
*data_col_ptr = | |
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? | |
data_im_ptr[i * dilation_h * width + j * dilation_w] : 0; | |
data_col_ptr += height_col * width_col; | |
} | |
} | |
} | |
} | |
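/* col2im: the inverse scatter done as a gather. For each image element */ | |
/* (c_im, h_im, w_im) the start/end bounds enumerate every column */ | |
/* position whose receptive field covers it (inverting */ | |
/* h_im = h_col*stride_h + h_k*dilation_h), and the matching entries */ | |
/* are summed, skipping offsets not divisible by the dilation. */ | |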
__kernel void TEMPLATE(col2im,Dtype)(const int_tp n, | |
__global const Dtype* data_col, | |
const int_tp data_col_off, | |
const int_tp height, const int_tp width, | |
const int_tp channels, | |
const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp pad_h, | |
const int_tp pad_w, const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp dilation_h, | |
const int_tp dilation_w, | |
const int_tp height_col, | |
const int_tp width_col, | |
__global Dtype* data_im, | |
const int_tp data_im_off) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
Dtype val = 0; | |
const int_tp w_im = index % width + pad_w; | |
const int_tp h_im = (index / width) % height + pad_h; | |
const int_tp c_im = index / (width * height); | |
int_tp kernel_extent_w = (kernel_w - 1) * dilation_w + 1; | |
int_tp kernel_extent_h = (kernel_h - 1) * dilation_h + 1; | |
// compute the start and end of the output | |
const int_tp w_col_start = | |
(w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; | |
const int_tp w_col_end = min(w_im / stride_w + 1, width_col); | |
const int_tp h_col_start = | |
(h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1; | |
const int_tp h_col_end = min(h_im / stride_h + 1, height_col); | |
// TODO: use LCM of stride and dilation to avoid unnecessary loops | |
for (int_tp h_col = h_col_start; h_col < h_col_end; h_col += 1) { | |
for (int_tp w_col = w_col_start; w_col < w_col_end; w_col += 1) { | |
int_tp h_k = (h_im - h_col * stride_h); | |
int_tp w_k = (w_im - w_col * stride_w); | |
if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { | |
h_k /= dilation_h; | |
w_k /= dilation_w; | |
int_tp data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * | |
height_col + h_col) * width_col + w_col; | |
val += data_col[data_col_off + data_col_index]; | |
} | |
} | |
} | |
data_im[data_im_off + index] = val; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
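/* N-D variants of im2col/col2im: the fixed d_*[6] scratch arrays cap */ | |
/* num_axes at 6 spatial axes, and the kernel/pad/stride/dilation */ | |
/* shapes are staged into local memory once per work-group up front. */ | |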
__kernel void TEMPLATE(im2col_nd, Dtype)(const int_tp n, const int_tp num_axes, | |
const int_tp channel_axis, | |
__global const Dtype* data_im, | |
const int_tp data_im_off, | |
__global const int_tp* im_shape, | |
__global const int_tp* col_shape, | |
__global const int_tp* kernel_shape, | |
__global const int_tp* pad, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global Dtype* data_col, | |
const int_tp data_col_off) { | |
int_tp d_temp[6]; | |
int_tp d_iter[6]; | |
int_tp i; | |
__global const int_tp* im_shape_ptr = im_shape + channel_axis; | |
__global const int_tp* col_shape_ptr = col_shape + channel_axis; | |
__local int_tp shared_dilation[6]; | |
__local int_tp shared_kernel_shape[6]; | |
__local int_tp shared_pad[6]; | |
__local int_tp shared_stride[6]; | |
__local int_tp shared_col_shape[6 + 1]; | |
__local int_tp shared_im_shape[6 + 1]; | |
for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) { | |
shared_dilation[li] = dilation[li]; | |
shared_kernel_shape[li] = kernel_shape[li]; | |
shared_pad[li] = pad[li]; | |
shared_stride[li] = stride[li]; | |
} | |
for (int li = get_local_id(0); li < num_axes + 1; li += get_local_size(0)) { | |
shared_col_shape[li] = col_shape_ptr[li]; | |
shared_im_shape[li] = im_shape_ptr[li]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
// Decompose the linear index: the loop below peels off the spatial | |
// indices into d_temp, leaving the input channel in channel_in. | |
int_tp channel_in = index; | |
int_tp channel_out = 1; | |
for (i = num_axes - 1; i >= 0; --i) { | |
d_temp[i] = channel_in % shared_col_shape[i + 1]; | |
channel_in /= shared_col_shape[i + 1]; | |
channel_out *= shared_kernel_shape[i]; | |
} | |
channel_out *= channel_in; | |
int_tp data_col_inc = 1; | |
for (i = 0; i < num_axes; ++i) { | |
channel_out *= shared_col_shape[i + 1]; | |
channel_out += d_temp[i]; | |
d_temp[i] = d_temp[i] * shared_stride[i] - shared_pad[i]; | |
channel_in *= shared_im_shape[i + 1]; | |
channel_in += d_temp[i]; | |
data_col_inc *= shared_col_shape[i + 1]; | |
d_iter[i] = 0; | |
} | |
__global Dtype* data_col_ptr = data_col + data_col_off + channel_out; | |
__global const Dtype* data_im_ptr = data_im + data_im_off + channel_in; | |
bool incremented; | |
do { | |
bool in_range = true; | |
for (i = 0; i < num_axes; ++i) { | |
const int_tp d_iter_im = d_iter[i] * shared_dilation[i] + d_temp[i]; | |
in_range &= d_iter_im >= 0 && d_iter_im < shared_im_shape[i + 1]; | |
if (!in_range) { | |
break; | |
} | |
} | |
if (in_range) { | |
int_tp data_im_offset = d_iter[0] * shared_dilation[0]; | |
for (i = 1; i < num_axes; ++i) { | |
data_im_offset *= shared_im_shape[i + 1]; | |
data_im_offset += d_iter[i] * shared_dilation[i]; | |
} | |
*data_col_ptr = data_im_ptr[data_im_offset]; | |
} else { | |
*data_col_ptr = 0; | |
} | |
data_col_ptr += data_col_inc; | |
incremented = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
const int_tp d_max = shared_kernel_shape[i]; | |
if (d_iter[i] == d_max - 1) { | |
d_iter[i] = 0; | |
} else { // d_iter[i] < d_max - 1 | |
++d_iter[i]; | |
incremented = true; | |
break; | |
} | |
} // for (int_tp i = num_axes - 1; i >= 0; --i) | |
} while (incremented); // do | |
} | |
} | |
__kernel void TEMPLATE(col2im_nd, Dtype)(const int_tp n, const int_tp num_axes, | |
const int_tp channel_axis, | |
__global const Dtype* data_col, | |
const int_tp data_col_off, | |
__global const int_tp* im_shape, | |
__global const int_tp* col_shape, | |
__global const int_tp* kernel_shape, | |
__global const int_tp* pad, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global Dtype* data_im, | |
const int_tp data_im_off) { | |
int_tp d_im[6]; | |
int_tp d_col_iter[6]; | |
int_tp d_col_start[6]; | |
int_tp d_col_end[6]; | |
__global const int_tp* im_shape_ptr = im_shape + channel_axis; | |
__global const int_tp* col_shape_ptr = col_shape + channel_axis; | |
__local int_tp shared_dilation[6]; | |
__local int_tp shared_kernel_shape[6]; | |
__local int_tp shared_pad[6]; | |
__local int_tp shared_stride[6]; | |
__local int_tp shared_col_shape[6 + 1]; | |
__local int_tp shared_im_shape[6 + 1]; | |
for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) { | |
shared_dilation[li] = dilation[li]; | |
shared_kernel_shape[li] = kernel_shape[li]; | |
shared_pad[li] = pad[li]; | |
shared_stride[li] = stride[li]; | |
} | |
for (int li = get_local_id(0); li < num_axes + 1; li += get_local_size(0)) { | |
shared_col_shape[li] = col_shape_ptr[li]; | |
shared_im_shape[li] = im_shape_ptr[li]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
// Decompose the linear index into the image channel (c_im) and the | |
// padded spatial coordinates (d_im) computed in the loops below. | |
int_tp c_im = index; | |
// Calculate d_im (image dimensions). | |
for (int_tp i = num_axes - 1; i >= 0; --i) { | |
d_im[i] = c_im % shared_im_shape[i + 1] + shared_pad[i]; | |
c_im /= shared_im_shape[i + 1]; | |
} | |
// Calculate col start/end indices. | |
bool done = false; | |
for (int_tp i = 0; i < num_axes; ++i) { | |
const int_tp kernel_extent = shared_dilation[i] | |
* (shared_kernel_shape[i] - 1) + 1; | |
d_col_start[i] = d_col_iter[i] = | |
(d_im[i] < kernel_extent) ? | |
0 : (d_im[i] - kernel_extent) / shared_stride[i] + 1; | |
d_col_end[i] = min(d_im[i] / shared_stride[i] + 1, | |
shared_col_shape[i + 1]); | |
if (d_col_start[i] >= d_col_end[i]) { | |
// Skip computation if the dimension is 0 at any spatial axis -- | |
// final val will be 0. | |
data_im[index] = 0; | |
done = true; | |
break; // for (int_tp i = 0; i < num_axes; ++i) | |
} | |
} | |
if (!done) { | |
// Loop over the col to compute the output val. | |
Dtype val = 0; | |
bool incremented = true; | |
bool skip = false; | |
do { | |
// Compute the final offset. | |
int_tp final_offset = 0; | |
int_tp kernel_shape_prod = 1; | |
int_tp kernel_index; | |
for (int_tp i = num_axes - 1; i >= 0; --i) { | |
kernel_index = d_im[i] - d_col_iter[i] * shared_stride[i]; | |
if (kernel_index % shared_dilation[i]) { | |
skip = true; | |
break; | |
} else { | |
kernel_index /= shared_dilation[i]; | |
final_offset += kernel_index * kernel_shape_prod; | |
kernel_shape_prod *= shared_kernel_shape[i]; | |
} | |
} | |
if (!skip) { | |
final_offset += kernel_shape_prod * c_im; | |
for (int_tp i = 0; i < num_axes; ++i) { | |
final_offset *= shared_col_shape[i + 1]; | |
final_offset += d_col_iter[i]; | |
} | |
val += data_col[data_col_off + final_offset]; | |
} | |
skip = false; | |
incremented = false; | |
for (int_tp i = num_axes - 1; i >= 0; --i) { | |
const int_tp d_max = d_col_end[i]; | |
if (d_col_iter[i] == d_max - 1) { | |
d_col_iter[i] = d_col_start[i]; | |
} else { // d_col_iter[i] < d_max - 1 | |
++d_col_iter[i]; | |
incremented = true; | |
break; // for (int_tp i = num_axes - 1; i >= 0; --i) | |
} | |
} // for (int_tp i = num_axes - 1; i >= 0; --i) | |
} while (incremented); | |
data_im[data_im_off + index] = val; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(lrn_compute_output,Dtype)(const int_tp nthreads, | |
__global const Dtype* in, | |
__global const Dtype* scale, | |
const Dtype negative_beta, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
out[index] = in[index] * pow(scale[index], negative_beta); | |
} | |
} | |
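/* Fills the LRN denominator with a running sum of squares over a */ | |
/* sliding window of `size` channels at fixed (n, h, w): */ | |
/*   scale[c] = k + (alpha/size) * sum over the window of in[c']^2 */ | |
/* The three while-loops are the ramp-up, steady-state (add head, drop */ | |
/* tail) and ramp-down phases of the window. */ | |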
__kernel void TEMPLATE(lrn_fill_scale,Dtype)(const int_tp nthreads, __global const Dtype* in, | |
const int_tp num, const int_tp channels, | |
const int_tp height, const int_tp width, const int_tp size, | |
const Dtype alpha_over_size, const Dtype k, | |
__global Dtype* const scale) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
// find out the local offset | |
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp n = index / width / height; | |
const int_tp offset = (n * channels * height + h) * width + w; | |
const int_tp step = height * width; | |
__global const Dtype* in_off = in + offset; | |
__global Dtype* scale_off = scale + offset; | |
int_tp head = 0; | |
const int_tp pre_pad = (size - 1) / 2; | |
const int_tp post_pad = size - pre_pad - 1; | |
Dtype accum_scale = 0; | |
// fill the scale at [n, :, h, w] | |
// accumulate values | |
while (head < post_pad && head < channels) { | |
accum_scale += in_off[head * step] * in_off[head * step]; | |
++head; | |
} | |
// both add and subtract | |
while (head < channels) { | |
accum_scale += in_off[head * step] * in_off[head * step]; | |
if (head - size >= 0) { | |
accum_scale -= in_off[(head - size) * step] | |
* in_off[(head - size) * step]; | |
} | |
scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; | |
++head; | |
} | |
// subtract only | |
while (head < channels + post_pad) { | |
if (head - size >= 0) { | |
accum_scale -= in_off[(head - size) * step] | |
* in_off[(head - size) * step]; | |
} | |
scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; | |
++head; | |
} | |
} | |
} | |
__kernel void TEMPLATE(lrn_compute_diff,Dtype)(const int_tp nthreads, | |
__global const Dtype* bottom_data, | |
__global const Dtype* top_data, | |
__global const Dtype* scale, | |
__global const Dtype* top_diff, const int_tp num, | |
const int_tp channels, const int_tp height, | |
const int_tp width, const int_tp size, | |
const Dtype negative_beta, | |
const Dtype cache_ratio, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
// find out the local offset | |
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp n = index / width / height; | |
const int_tp offset = (n * channels * height + h) * width + w; | |
const int_tp step = height * width; | |
__global const Dtype* bottom_off = bottom_data + offset; | |
__global const Dtype* top_off = top_data + offset; | |
__global const Dtype* scale_off = scale + offset; | |
__global const Dtype* top_diff_off = top_diff + offset; | |
__global Dtype* bottom_diff_off = bottom_diff + offset; | |
int_tp head = 0; | |
const int_tp pre_pad = size - (size + 1) / 2; | |
const int_tp post_pad = size - pre_pad - 1; | |
Dtype accum_ratio = 0; | |
// accumulate values | |
while (head < post_pad && head < channels) { | |
accum_ratio += top_diff_off[head * step] * top_off[head * step] | |
/ scale_off[head * step]; | |
++head; | |
} | |
// both add and subtract | |
while (head < channels) { | |
accum_ratio += top_diff_off[head * step] * top_off[head * step] | |
/ scale_off[head * step]; | |
if (head - size >= 0) { | |
accum_ratio -= top_diff_off[(head - size) * step] | |
* top_off[(head - size) * step] / scale_off[(head - size) * step]; | |
} | |
bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad) | |
* step] * pow(scale_off[(head - post_pad) * step], negative_beta) | |
- cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; | |
++head; | |
} | |
// subtract only | |
while (head < channels + post_pad) { | |
if (head - size >= 0) { | |
accum_ratio -= top_diff_off[(head - size) * step] | |
* top_off[(head - size) * step] / scale_off[(head - size) * step]; | |
} | |
bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad) | |
* step] * pow(scale_off[(head - post_pad) * step], negative_beta) | |
- cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; | |
++head; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
inline Dtype TEMPLATE(lstm_sigmoid,Dtype)(const Dtype x) { | |
return (Dtype)1 / ((Dtype)1 + exp(-x)); | |
} | |
inline Dtype TEMPLATE(lstm_tanh,Dtype)(const Dtype x) { | |
return (Dtype)2 * TEMPLATE(lstm_sigmoid,Dtype)((Dtype)2 * x) - (Dtype)1; | |
} | |
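/* LSTM kernels: X packs the four gates per step as [i, f, o, g], each */ | |
/* of width `dim`. The first three get a sigmoid, the candidate g gets */ | |
/* tanh (computed above via tanh(x) = 2*sigmoid(2x) - 1). The cell */ | |
/* update is c = cont * f * c_prev + i * g and h = o * tanh(c), with */ | |
/* `cont` as the sequence-continuation indicator that zeroes the carry. */ | |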
__kernel void TEMPLATE(lstm_acts_forward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* X, __global Dtype* X_acts) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp x_dim = 4 * dim; | |
const int_tp d = index % x_dim; | |
if (d < 3 * dim) { | |
X_acts[index] = TEMPLATE(lstm_sigmoid,Dtype)(X[index]); | |
} else { | |
X_acts[index] = TEMPLATE(lstm_tanh,Dtype)(X[index]); | |
} | |
} | |
} | |
__kernel void TEMPLATE(lstm_unit_forward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* C_prev, __global const Dtype* X, __global const Dtype* cont, | |
__global Dtype* C, __global Dtype* H) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp n = index / dim; | |
const int_tp d = index % dim; | |
__global const Dtype* X_offset = X + 4 * dim * n; | |
const Dtype i = X_offset[d]; | |
const Dtype f = X_offset[1 * dim + d]; | |
const Dtype o = X_offset[2 * dim + d]; | |
const Dtype g = X_offset[3 * dim + d]; | |
const Dtype c_prev = C_prev[index]; | |
const Dtype c = cont[n] * f * c_prev + i * g; | |
C[index] = c; | |
const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c); | |
H[index] = o * tanh_c; | |
} | |
} | |
__kernel void TEMPLATE(lstm_unit_backward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* C_prev, __global const Dtype* X, __global const Dtype* C, __global const Dtype* H, | |
__global const Dtype* cont, __global const Dtype* C_diff, __global const Dtype* H_diff, | |
__global Dtype* C_prev_diff, __global Dtype* X_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp n = index / dim; | |
const int_tp d = index % dim; | |
__global const Dtype* X_offset = X + 4 * dim * n; | |
const Dtype i = X_offset[d]; | |
const Dtype f = X_offset[1 * dim + d]; | |
const Dtype o = X_offset[2 * dim + d]; | |
const Dtype g = X_offset[3 * dim + d]; | |
const Dtype c_prev = C_prev[index]; | |
const Dtype c = C[index]; | |
const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c); | |
__global Dtype* c_prev_diff = C_prev_diff + index; | |
__global Dtype* X_diff_offset = X_diff + 4 * dim * n; | |
__global Dtype* i_diff = X_diff_offset + d; | |
__global Dtype* f_diff = X_diff_offset + 1 * dim + d; | |
__global Dtype* o_diff = X_diff_offset + 2 * dim + d; | |
__global Dtype* g_diff = X_diff_offset + 3 * dim + d; | |
const Dtype c_term_diff = | |
C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); | |
const Dtype cont_n = cont[n]; | |
*c_prev_diff = cont_n * c_term_diff * f; | |
*i_diff = c_term_diff * g; | |
*f_diff = cont_n * c_term_diff * c_prev; | |
*o_diff = H_diff[index] * tanh_c; | |
*g_diff = c_term_diff * i; | |
} | |
} | |
__kernel void TEMPLATE(lstm_acts_backward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* X_acts, __global const Dtype* X_acts_diff, __global Dtype* X_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp x_dim = 4 * dim; | |
const int_tp d = index % x_dim; | |
const Dtype X_act = X_acts[index]; | |
if (d < 3 * dim) { | |
X_diff[index] = X_acts_diff[index] * X_act * ((Dtype)1 - X_act); | |
} else { | |
X_diff[index] = X_acts_diff[index] * ((Dtype)1 - X_act * X_act); | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(mul,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, | |
__global Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = a[index + offa] * b[index + offb]; | |
} | |
} | |
__kernel void TEMPLATE(div,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, | |
__global Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = a[index + offa] / b[index + offb]; | |
} | |
} | |
__kernel void TEMPLATE(add_scalar,Dtype)(const int_tp N, const Dtype alpha, | |
__global Dtype* Y, | |
const int_tp offY) { | |
for (int_tp index = get_global_id(0); index < N; index += get_global_size(0)) { | |
Y[offY + index] += alpha; | |
} | |
} | |
__kernel void TEMPLATE(add,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global const Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = a[offa + index] + b[offb + index]; | |
} | |
} | |
__kernel void TEMPLATE(sub,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global const Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = a[offa + index] - b[offb + index]; | |
} | |
} | |
__kernel void TEMPLATE(abs,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = fabs((Dtype)(a[offa + index])); | |
} | |
} | |
__kernel void TEMPLATE(exp,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = exp(a[offa + index]); | |
} | |
} | |
__kernel void TEMPLATE(log,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = log((Dtype)(a[offa + index])); | |
} | |
} | |
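/* For alpha == 2.0 the base goes through fabs() first; squaring makes */ | |
/* the sign irrelevant, and this presumably works around pow() */ | |
/* implementations that mishandle negative bases. */ | |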
__kernel void TEMPLATE(powx,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, Dtype alpha, | |
__global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
if (alpha == 2.0) { | |
y[offy + index] = pow((Dtype)fabs(a[offa + index]), (Dtype)alpha); | |
} else { | |
y[offy + index] = pow((Dtype)a[offa + index], (Dtype)alpha); | |
} | |
} | |
} | |
__kernel void TEMPLATE(sign,Dtype)(const int_tp n, __global const Dtype* x, | |
const int_tp offx, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = (0.0 < x[index + offx]) | |
- (x[index + offx] < 0.0); | |
} | |
} | |
__kernel void TEMPLATE(sgnbit,Dtype)(const int_tp n, __global const Dtype* x, | |
const int_tp offx, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = signbit(x[index + offx]); | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
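/* Merge/copy kernels for combining two feature maps whose spatial */ | |
/* shapes may differ: pad[i] = (shape_b[i] - shape_a[i]) / 2 centers the */ | |
/* smaller map inside the larger one. The _stack variants concatenate */ | |
/* along the channel axis, the _add variants sum into one channel set, */ | |
/* and the forward/backward flags gate each copy, writing 0 when */ | |
/* propagation for that input is disabled. */ | |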
__kernel void TEMPLATE(merge_copy_forward_stack, Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global const Dtype* bottom_a, | |
const int_tp forward_a, | |
__global const Dtype* bottom_b, | |
const int_tp forward_b, | |
__global Dtype* top, | |
const int_tp num, | |
const int_tp channels_a, | |
const int_tp channels_b, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
int_tp batch_id = index / ((channels_a + channels_b) * size_a); | |
int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a) | |
/ (channels_a * size_a)) % 2; | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
if (bottom_id == 0) { | |
int_tp channel_id = (index / size_a) % channels_a; | |
int_tp aidx = batch_id * channels_a + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
top[index] = (forward_a == 1) ? bottom_a[aidx] : 0; | |
} else { | |
int_tp channel_id = (index / size_a) % channels_b; | |
int_tp bidx = (batch_id * channels_b + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
top[index] = (forward_b == 1) ? bottom_b[bidx] : 0; | |
} | |
} | |
} | |
__kernel void TEMPLATE(merge_copy_backward_stack,Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global Dtype* bottom_a, | |
const int_tp backward_a, | |
__global Dtype* bottom_b, | |
const int_tp backward_b, | |
__global const Dtype* top, | |
const int_tp num, | |
const int_tp channels_a, | |
const int_tp channels_b, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp batch_id = index / ((channels_a + channels_b) * size_a); | |
int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a) | |
/ (channels_a * size_a)) % 2; | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
if (bottom_id == 0) { | |
int_tp channel_id = (index / size_a) % channels_a; | |
int_tp aidx = batch_id * channels_a + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
bottom_a[aidx] = (backward_a == 1) ? top[index] : 0; | |
} else { | |
int_tp channel_id = (index / size_a) % channels_b; | |
int_tp bidx = (batch_id * channels_b + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
bottom_b[bidx] = (backward_b == 1) ? top[index] : 0; | |
} | |
} | |
} | |
__kernel void TEMPLATE(merge_copy_forward_add, Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global const Dtype* bottom_a, | |
const int_tp forward_a, | |
__global const Dtype* bottom_b, | |
const int_tp forward_b, | |
__global Dtype* top, | |
const int_tp num, | |
const int_tp channels, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp batch_id = index / (channels * size_a); | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
top[index] = 0; | |
int_tp channel_id = (index / size_a) % channels; | |
int_tp aidx = batch_id * channels + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
top[index] = forward_a ? top[index] + bottom_a[aidx] : top[index]; | |
int_tp bidx = (batch_id * channels + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
top[index] = forward_b ? top[index] + bottom_b[bidx] : top[index]; | |
} | |
} | |
__kernel void TEMPLATE(merge_copy_backward_add,Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global Dtype* bottom_a, | |
const int_tp backward_a, | |
__global Dtype* bottom_b, | |
const int_tp backward_b, | |
__global const Dtype* top, | |
const int_tp num, | |
const int_tp channels, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp batch_id = index / (channels * size_a); | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
int_tp channel_id = (index / size_a) % channels; | |
int_tp aidx = batch_id * channels + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
bottom_a[aidx] = backward_a ? top[index] : 0; | |
int_tp bidx = (batch_id * channels + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
bottom_b[bidx] = backward_b ? top[index] : 0; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(max_pool_forward,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* top_data, | |
const int use_mask, __global int_tp* mask, __global Dtype* top_mask) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
const int_tp hend = min(hstart + kernel_h, height); | |
const int_tp wend = min(wstart + kernel_w, width); | |
hstart = max(hstart, (int_tp)0); | |
wstart = max(wstart, (int_tp)0); | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
if (bottom_slice[h * width + w] > maxval) { | |
maxidx = h * width + w; | |
maxval = bottom_slice[maxidx]; | |
} | |
} | |
} | |
top_data[index] = maxval; | |
if (use_mask == 1) { | |
mask[index] = maxidx; | |
} else { | |
top_mask[index] = maxidx; | |
} | |
} | |
} | |
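/* Note: pool_size is taken from the padded window extents before */ | |
/* clamping, so padded positions count toward the divisor (the usual */ | |
/* Caffe average-pooling convention). */ | |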
__kernel void TEMPLATE(ave_pool_forward,Dtype)( | |
const int_tp nthreads, __global const Dtype* const bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, | |
const int_tp pad_w, __global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + kernel_h, height + pad_h); | |
int_tp wend = min(wstart + kernel_w, width + pad_w); | |
const int_tp pool_size = (hend - hstart) * (wend - wstart); | |
hstart = max(hstart, (int_tp)0); | |
wstart = max(wstart, (int_tp)0); | |
hend = min(hend, height); | |
wend = min(wend, width); | |
Dtype aveval = 0; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
aveval += bottom_slice[h * width + w]; | |
} | |
} | |
top_data[index] = aveval / pool_size; | |
} | |
} | |
} | |
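// Stochastic pooling, training pass. rand_idx[index] arrives holding a
// uniform sample in [0, 1); pass one sums the window, pass two walks the
// cumulative sum until it crosses rand_idx[index] * sum, which selects each
// position with probability proportional to its activation. rand_idx[index]
// is then overwritten with the chosen input index for the backward pass.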
__kernel void TEMPLATE(sto_pool_forward_train,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, | |
__global Dtype* rand_idx, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
const int_tp hstart = ph * stride_h; | |
const int_tp hend = min(hstart + kernel_h, height); | |
const int_tp wstart = pw * stride_w; | |
const int_tp wend = min(wstart + kernel_w, width); | |
Dtype cumsum = 0.; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
cumsum += bottom_slice[h * width + w]; | |
} | |
} | |
    const Dtype thres = rand_idx[index] * cumsum;
// Second pass: get value, and set index. | |
cumsum = 0; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
cumsum += bottom_slice[h * width + w]; | |
if (cumsum >= thres) { | |
rand_idx[index] = ((n * channels + c) * height + h) * width + w; | |
top_data[index] = bottom_slice[h * width + w]; | |
h = hend; | |
w = wend; | |
} | |
} | |
} | |
} | |
} | |
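// Stochastic pooling, test pass. Instead of sampling, compute the
// expectation of the sampled value: sum(x_i^2) / sum(x_i), i.e. each
// activation weighted by its selection probability x_i / sum(x_i).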
__kernel void TEMPLATE(sto_pool_forward_test,Dtype)( | |
const int_tp nthreads, __global const Dtype* const bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
const int_tp hstart = ph * stride_h; | |
const int_tp hend = min(hstart + kernel_h, height); | |
const int_tp wstart = pw * stride_w; | |
const int_tp wend = min(wstart + kernel_w, width); | |
    // Initialize cumsum to FLT_MIN (not 0) so the division below cannot be by zero.
Dtype cumsum = FLT_MIN; | |
Dtype cumvalues = 0.; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
cumsum += bottom_slice[h * width + w]; | |
cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; | |
} | |
} | |
top_data[index] = cumvalues / cumsum; | |
} | |
} | |
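// Max pooling, backward pass. Each bottom element scans every pooled window
// that could contain it and accumulates top_diff wherever the recorded
// argmax (mask or top_mask) equals its own linear position.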
__kernel void TEMPLATE(max_pool_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int use_mask, | |
__global const int_tp* mask, | |
__global const Dtype* top_mask, | |
const int_tp num, | |
const int_tp channels, | |
const int_tp height, | |
const int_tp width, | |
const int_tp pooled_height, | |
const int_tp pooled_width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, | |
const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
    // recover the local (w, h, c, n) coordinates from the linear index
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp c = (index / width / height) % channels; | |
const int_tp n = index / width / height / channels; | |
const int_tp phstart = | |
(h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; | |
const int_tp phend = min((h + pad_h) / stride_h + 1, pooled_height); | |
const int_tp pwstart = | |
(w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; | |
const int_tp pwend = min((w + pad_w) / stride_w + 1, pooled_width); | |
Dtype gradient = 0; | |
const int_tp offset = (n * channels + c) * pooled_height * pooled_width; | |
__global const Dtype* top_diff_slice = top_diff + offset; | |
if (use_mask == 1) { | |
__global const int_tp* mask_slice = mask + offset; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
if (mask_slice[ph * pooled_width + pw] == h * width + w) { | |
gradient += top_diff_slice[ph * pooled_width + pw]; | |
} | |
} | |
} | |
} else { | |
__global const Dtype* top_mask_slice = top_mask + offset; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
if (top_mask_slice[ph * pooled_width + pw] == h * width + w) { | |
gradient += top_diff_slice[ph * pooled_width + pw]; | |
} | |
} | |
} | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
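// Average pooling, backward pass. Each bottom element receives
// top_diff / pool_size from every window covering it, with pool_size
// recomputed exactly as in the forward pass (over the padded window).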
__kernel void TEMPLATE(ave_pool_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int_tp num, | |
const int_tp channels, | |
const int_tp height, | |
const int_tp width, | |
const int_tp pooled_height, | |
const int_tp pooled_width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, | |
const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
    // recover the local (w, h, c, n) coordinates from the linear index
const int_tp w = index % width + pad_w; | |
const int_tp h = (index / width) % height + pad_h; | |
const int_tp c = (index / width / height) % channels; | |
const int_tp n = index / width / height / channels; | |
const int_tp phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; | |
const int_tp phend = min(h / stride_h + 1, pooled_height); | |
const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; | |
const int_tp pwend = min(w / stride_w + 1, pooled_width); | |
Dtype gradient = 0.0; | |
__global const Dtype* const top_diff_slice = top_diff | |
+ (n * channels + c) * pooled_height * pooled_width; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
// figure out the pooling size | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + kernel_h, height + pad_h); | |
int_tp wend = min(wstart + kernel_w, width + pad_w); | |
int_tp pool_size = (hend - hstart) * (wend - wstart); | |
gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; | |
} | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
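// Stochastic pooling, backward pass. The gradient is routed solely to the
// input position that was sampled in the forward pass, as recorded in
// rand_idx.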
__kernel void TEMPLATE(sto_pool_backward,Dtype)( | |
const int_tp nthreads, __global const Dtype* rand_idx, | |
__global const Dtype* const top_diff, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h, | |
const int_tp stride_w, __global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
    // recover the local (w, h, c, n) coordinates from the linear index
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp c = (index / width / height) % channels; | |
const int_tp n = index / width / height / channels; | |
const int_tp phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; | |
const int_tp phend = min(h / stride_h + 1, pooled_height); | |
const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; | |
const int_tp pwend = min(w / stride_w + 1, pooled_width); | |
Dtype gradient = 0.0; | |
__global const Dtype* rand_idx_slice = rand_idx | |
+ (n * channels + c) * pooled_height * pooled_width; | |
__global const Dtype* top_diff_slice = top_diff | |
+ (n * channels + c) * pooled_height * pooled_width; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
gradient += top_diff_slice[ph * pooled_width + pw] | |
* (index == (int_tp) (rand_idx_slice[ph * pooled_width + pw])?1.0:0.0); | |
} | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
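// N-dimensional max pooling, forward pass (up to 6 spatial axes, per the
// fixed-size index arrays below). Per-axis window bounds come from stride,
// pad and the dilated extent ext_kernel_size; the do/while loop enumerates
// the window by stepping d_iter like an odometer, dilation[i] at a time.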
__kernel void TEMPLATE(max_pool_forward_nd, Dtype)(const int_tp n, | |
const int_tp num_axes, | |
__global const Dtype* bottom_data, | |
const int_tp channels, | |
__global const int_tp* size, | |
__global const int_tp* pooled_size, | |
__global const int_tp* kernel_size, | |
__global const int_tp* ext_kernel_size, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global const int_tp* pad, | |
__global Dtype* top_data, | |
const int use_mask, | |
__global int_tp* mask, __global Dtype* top_mask) { | |
int_tp d_idx[6]; | |
int_tp d_start[6]; | |
int_tp d_end[6]; | |
int_tp d_iter[6]; | |
int_tp i; | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
int_tp offset = 1; | |
int_tp num = index; | |
bool do_continue = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
d_idx[i] = num % pooled_size[i]; | |
d_start[i] = d_idx[i] * stride[i] - pad[i]; | |
d_end[i] = min(d_start[i] + ext_kernel_size[i], size[i]); | |
d_start[i] = max(d_start[i], (int_tp)0); | |
num /= pooled_size[i]; | |
offset *= size[i]; | |
d_iter[i] = d_start[i]; | |
if (d_start[i] >= d_end[i]) { | |
top_data[index] = -FLT_MAX; | |
if (use_mask) { | |
mask[index] = -1; | |
} else { | |
top_mask[index] = -1; | |
} | |
do_continue = true; | |
} | |
} | |
if(do_continue) { | |
continue; | |
} | |
int_tp chan = num % channels; | |
num /= channels; | |
offset *= (num * channels + chan); | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
int_tp final_offset = 0; | |
bool incremented; | |
do { | |
final_offset = offset; | |
int_tp size_prod = 1; | |
for (i = num_axes - 1; i >= 0; --i) { | |
final_offset += d_iter[i] * size_prod; | |
size_prod *= size[i]; | |
} | |
if (bottom_data[final_offset] > maxval) { | |
maxidx = final_offset; | |
maxval = bottom_data[maxidx]; | |
} | |
incremented = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
if (d_iter[i] >= d_end[i] - dilation[i]) { | |
d_iter[i] = d_start[i]; | |
} else { | |
d_iter[i] += dilation[i]; | |
incremented = true; | |
break; | |
} | |
} | |
} while (incremented); | |
top_data[index] = maxval; | |
if (use_mask == 1) { | |
mask[index] = maxidx; | |
} else { | |
top_mask[index] = maxidx; | |
} | |
} | |
} | |
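// N-dimensional max pooling, backward pass. Here the odometer runs over the
// candidate pooled windows, and gradients are gathered where the stored
// argmax matches this element's image offset.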
__kernel void TEMPLATE(max_pool_backward_nd, Dtype)(const int_tp n, | |
const int_tp num_axes, | |
__global const Dtype* top_diff, | |
const int use_mask, | |
__global const int_tp* mask, | |
__global const Dtype* top_mask, | |
const int_tp channels, | |
__global const int_tp* size, | |
__global const int_tp* pooled_size, | |
__global const int_tp* kernel_size, | |
__global const int_tp* ext_kernel_size, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global const int_tp* pad, | |
__global Dtype* bottom_diff) { | |
int_tp d_idx[6]; | |
int_tp d_start[6]; | |
int_tp d_end[6]; | |
int_tp d_iter[6]; | |
int_tp i; | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
    // recover the local coordinates from the linear index
int_tp offset = 1; | |
    int_tp num = index;
    bool do_continue = false;
for (i = num_axes - 1; i >= 0; --i) { | |
d_idx[i] = num % size[i]; | |
if (dilation[i] > 1) { | |
d_start[i] = | |
(d_idx[i] < ext_kernel_size[i]) ? | |
d_idx[i] % dilation[i] : (d_idx[i] - ext_kernel_size[i]) + 1; | |
d_end[i] = | |
(d_idx[i] >= pooled_size[i]) ? | |
(pooled_size[i] - 1) | |
- (pooled_size[i] - 1 - d_start[i]) % dilation[i] : | |
d_idx[i]; | |
} else { | |
d_start[i] = | |
(d_idx[i] + pad[i] < kernel_size[i]) ? | |
0 : (d_idx[i] + pad[i] - kernel_size[i]) / stride[i] + 1; | |
d_end[i] = min((int_tp) ((d_idx[i] + pad[i]) / stride[i] + 1), | |
(int_tp) (pooled_size[i])); | |
} | |
num /= size[i]; | |
offset *= pooled_size[i]; | |
d_iter[i] = d_start[i]; | |
      if (d_start[i] > d_end[i]) {
        bottom_diff[index] = 0;
        // Flag and skip below instead of returning: a return here would also
        // abandon the remaining indices of this work-item's grid-stride loop.
        do_continue = true;
      }
    }
    if (do_continue) {
      continue;
    }
int_tp chan = num % channels; | |
num /= channels; | |
offset *= (num * channels + chan); | |
Dtype gradient = 0; | |
int_tp final_offset = 0; | |
int_tp im_offset = 0; | |
bool incremented; | |
do { | |
final_offset = offset; | |
im_offset = 0; | |
int_tp size_prod = 1; | |
int_tp pooled_size_prod = 1; | |
for (i = num_axes - 1; i >= 0; --i) { | |
final_offset += d_iter[i] * pooled_size_prod; | |
im_offset += d_idx[i] * size_prod; | |
size_prod *= size[i]; | |
pooled_size_prod *= pooled_size[i]; | |
} | |
if (use_mask) { | |
if (mask[final_offset] == im_offset) { | |
gradient += top_diff[final_offset]; | |
} | |
} else { | |
if (top_mask[final_offset] == im_offset) { | |
gradient += top_diff[final_offset]; | |
} | |
} | |
incremented = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
if (d_iter[i] > d_end[i] - dilation[i]) { | |
d_iter[i] = d_start[i]; | |
} else { | |
d_iter[i] += dilation[i]; | |
incremented = true; | |
break; | |
} | |
} | |
} while (incremented); | |
bottom_diff[index] = gradient; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
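// The *_sk kernels below appear to be the strided-kernel (dilated) pooling
// variants: the window spans ext_kernel_{h,w} and is traversed with step
// dilation_{h,w} instead of 1.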
__kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int_tp nthreads, | |
__global Dtype* bottom_data, | |
const int_tp num, | |
const int_tp channels, | |
const int_tp height, | |
const int_tp width, | |
const int_tp pooled_height, | |
const int_tp pooled_width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, | |
const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, | |
const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp dilation_h, | |
const int_tp dilation_w, | |
const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* top_data, | |
const int use_mask, | |
__global int_tp* mask, | |
__global Dtype* top_mask) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + ext_kernel_h, height); | |
int_tp wend = min(wstart + ext_kernel_w, width); | |
hstart = max(hstart, (int_tp) 0); | |
wstart = max(wstart, (int_tp) 0); | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
__global Dtype* bottom_data_ptr = bottom_data | |
+ (n * channels + c) * height * width; | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
if (bottom_data_ptr[h * width + w] > maxval) { | |
maxidx = h * width + w; | |
maxval = bottom_data_ptr[maxidx]; | |
} | |
} | |
} | |
top_data[index] = maxval; | |
if (use_mask == 1) { | |
mask[index] = maxidx; | |
} else { | |
top_mask[index] = maxidx; | |
} | |
} | |
} | |
__kernel void TEMPLATE(max_pool_backward_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* top_diff, const int use_mask, | |
__global const int_tp* mask, __global const Dtype* top_mask, | |
const int_tp num, const int_tp channels, const int_tp height, | |
const int_tp width, const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
__global const int_tp* mask_ptr = mask; | |
__global const Dtype* top_diff_ptr = top_diff; | |
    // recover the local (w, h, c, n) coordinates from the linear index
int_tp w = index % width; | |
int_tp h = (index / width) % height; | |
int_tp c = (index / width / height) % channels; | |
int_tp n = index / width / height / channels; | |
int_tp pooled_height_1 = pooled_height - 1; | |
int_tp pooled_width_1 = pooled_width - 1; | |
int_tp phstart = | |
(h < ext_kernel_h) ? h % dilation_h : (h - ext_kernel_h) + 1; | |
int_tp phend = | |
(h >= pooled_height) ? | |
pooled_height_1 - (pooled_height_1 - phstart) % dilation_h : h; | |
int_tp pwstart = | |
(w < ext_kernel_w) ? w % dilation_w : (w - ext_kernel_w) + 1; | |
int_tp pwend = | |
(w >= pooled_width) ? | |
pooled_width_1 - (pooled_width_1 - pwstart) % dilation_w : w; | |
Dtype gradient = 0; | |
int_tp offset = (n * channels + c) * pooled_height * pooled_width; | |
top_diff_ptr += offset; | |
if (use_mask == 1) { | |
mask_ptr += offset; | |
for (int_tp ph = phstart; ph <= phend; ph += dilation_h) { | |
for (int_tp pw = pwstart; pw <= pwend; pw += dilation_w) { | |
if (mask_ptr[ph * pooled_width + pw] == h * width + w) { | |
gradient += top_diff_ptr[ph * pooled_width + pw]; | |
} | |
} | |
} | |
    } else {
      __global const Dtype* top_mask_ptr = top_mask + offset;
      for (int_tp ph = phstart; ph <= phend; ph += dilation_h) {
        for (int_tp pw = pwstart; pw <= pwend; pw += dilation_w) {
          // Offset top_mask like mask above; indexing the unshifted buffer
          // here was a bug.
          if (top_mask_ptr[ph * pooled_width + pw] == h * width + w) {
            gradient += top_diff_ptr[ph * pooled_width + pw];
          }
        }
      }
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
__kernel void TEMPLATE(ave_pool_forward_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + ext_kernel_h, height + pad_h); | |
int_tp wend = min(wstart + ext_kernel_w, width + pad_w); | |
hstart = max(hstart, (int_tp)0); | |
wstart = max(wstart, (int_tp)0); | |
hend = min(hend, height); | |
wend = min(wend, width); | |
Dtype aveval = 0; | |
__global const Dtype* bottom_data_ptr = bottom_data; | |
bottom_data_ptr += (n * channels + c) * height * width; | |
int_tp pool_size = 0; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
aveval += bottom_data_ptr[h * width + w]; | |
++pool_size; | |
} | |
} | |
top_data[index] = aveval / pool_size; | |
} | |
} | |
__kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, __global Dtype* rand_idx, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h; | |
int_tp hend = min(hstart + ext_kernel_h, height); | |
int_tp wstart = pw * stride_w; | |
int_tp wend = min(wstart + ext_kernel_w, width); | |
Dtype cumsum = 0.; | |
__global const Dtype* bottom_data_ptr = bottom_data; | |
bottom_data_ptr += (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
cumsum += bottom_data_ptr[h * width + w]; | |
} | |
} | |
    Dtype thres = rand_idx[index] * cumsum;
// Second pass: get value, and set index. | |
cumsum = 0; | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
cumsum += bottom_data_ptr[h * width + w]; | |
if (cumsum >= thres) { | |
rand_idx[index] = ((n * channels + c) * height + h) * width + w; | |
top_data[index] = bottom_data_ptr[h * width + w]; | |
h = hend; | |
w = wend; | |
} | |
} | |
} | |
} | |
} | |
__kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h; | |
int_tp hend = min(hstart + ext_kernel_h, height); | |
int_tp wstart = pw * stride_w; | |
int_tp wend = min(wstart + ext_kernel_w, width); | |
    // Initialize cumsum to FLT_MIN (not 0) so the division below cannot be by zero.
Dtype cumsum = FLT_MIN; | |
Dtype cumvalues = 0.; | |
__global const Dtype* bottom_data_ptr = bottom_data; | |
bottom_data_ptr += (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
cumsum += bottom_data_ptr[h * width + w]; | |
cumvalues += bottom_data_ptr[h * width + w] | |
* bottom_data_ptr[h * width + w]; | |
} | |
} | |
top_data[index] = cumvalues / cumsum; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
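// Slice layer. One kernel serves both directions: with forward == 1 it
// copies a slice out of the bottom blob, otherwise it scatters gradients
// back, using the same index mapping between whole and sliced layouts.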
__kernel void TEMPLATE(slice,Dtype)(const int_tp nthreads, | |
__global const Dtype* in_data, | |
const int forward, const int_tp num_slices, | |
const int_tp slice_size, | |
const int_tp bottom_slice_axis, | |
const int_tp top_slice_axis, | |
const int_tp offset_slice_axis, | |
__global Dtype* out_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp total_slice_size = slice_size * top_slice_axis; | |
const int_tp slice_num = index / total_slice_size; | |
const int_tp slice_index = index % total_slice_size; | |
const int_tp bottom_index = slice_index | |
+ (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size; | |
if (forward == 1) { | |
out_data[index] = in_data[bottom_index]; | |
} else { | |
out_data[bottom_index] = in_data[index]; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
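// Softmax loss, forward pass: per (sample, spatial position) the loss is
// -log(prob[label]), with the probability clamped to FLT_MIN so the log
// stays finite; counts marks contributing positions (ignored labels
// contribute 0 to both).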
__kernel void TEMPLATE(softmax_loss_forward,Dtype)( | |
int_tp n, __global const Dtype* prob_data, __global const Dtype* label, | |
__global Dtype* loss, | |
const int_tp num, const int_tp dim, const int_tp spatial_dim, | |
const int has_ignore_label_, const int_tp ignore_label_, | |
__global Dtype* counts) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
    const int_tp n = index / spatial_dim;  // shadows the kernel argument n; the loop bound above still binds to the argument
const int_tp s = index % spatial_dim; | |
const int_tp label_value = (int_tp) (label[n * spatial_dim + s]); | |
if (has_ignore_label_ == 1 && label_value == ignore_label_) { | |
loss[index] = 0; | |
counts[index] = 0; | |
} else { | |
loss[index] = -log((Dtype)( | |
max((Dtype) (prob_data[n * dim + label_value * spatial_dim + s]), | |
(Dtype) FLT_MIN))); | |
counts[index] = 1; | |
} | |
} | |
} | |
__kernel void TEMPLATE(softmax_loss_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top, | |
__global const Dtype* label, | |
__global Dtype* bottom_diff, | |
const int_tp num, | |
const int_tp dim, | |
const int_tp spatial_dim, | |
const int has_ignore_label_, | |
const int_tp ignore_label_, | |
__global Dtype* counts) { | |
const int_tp channels = dim / spatial_dim; | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
const int_tp n = index / spatial_dim; | |
const int_tp s = index % spatial_dim; | |
const int_tp label_value = (int_tp) (label[n * spatial_dim + s]); | |
if (has_ignore_label_ == 1 && label_value == ignore_label_) { | |
for (int_tp c = 0; c < channels; ++c) { | |
bottom_diff[n * dim + c * spatial_dim + s] = 0; | |
} | |
counts[index] = 0; | |
} else { | |
bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; | |
counts[index] = 1; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
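// Solver update kernels. Each reads the raw gradient from g and persistent
// state from h (plus h2/m/v where present), updates the state in place, and
// writes the final update step back into g for the caller to apply.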
__kernel void TEMPLATE(ada_delta_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
__global Dtype* h2, | |
Dtype momentum, | |
Dtype delta, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype hi = h[i] = momentum * h[i] + (1.0 - momentum) * gi * gi; | |
gi = gi * sqrt((h2[i] + delta) / (hi + delta)); | |
h2[i] = momentum * h2[i] + (1.0 - momentum) * gi * gi; | |
g[i] = local_rate * gi; | |
} | |
} | |
__kernel void TEMPLATE(ada_grad_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype delta, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype hi = h[i] = h[i] + gi * gi; | |
g[i] = local_rate * gi / (sqrt(hi) + delta); | |
} | |
} | |
__kernel void TEMPLATE(adam_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* m, | |
__global Dtype* v, | |
Dtype beta1, | |
Dtype beta2, | |
Dtype eps_hat, | |
Dtype corrected_local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype mi = m[i] = m[i] * beta1 + gi * (1 - beta1); | |
Dtype vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2); | |
g[i] = corrected_local_rate * mi / (sqrt(vi) + eps_hat); | |
} | |
} | |
__kernel void TEMPLATE(nesterov_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype momentum, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype hi = h[i]; | |
Dtype hi_new = h[i] = momentum * hi + local_rate * g[i]; | |
g[i] = (1 + momentum) * hi_new - momentum * hi; | |
} | |
} | |
__kernel void TEMPLATE(rms_prop_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype rms_decay, | |
Dtype delta, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype hi = h[i] = rms_decay * h[i] + (1 - rms_decay) * gi * gi; | |
g[i] = local_rate * g[i] / (sqrt(hi) + delta); | |
} | |
} | |
__kernel void TEMPLATE(sgd_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype momentum, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
g[i] = h[i] = momentum * h[i] + local_rate * g[i]; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
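// Tile layer: forward replicates the bottom blob num_tiles times along one
// axis; backward sums the num_tiles corresponding top_diff entries back
// into each bottom element.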
__kernel void TEMPLATE(tile,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, | |
const int_tp tile_size, const int_tp num_tiles, | |
const int_tp bottom_tile_axis, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp d = index % tile_size; | |
const int_tp b = (index / tile_size / num_tiles) % bottom_tile_axis; | |
const int_tp n = index / tile_size / num_tiles / bottom_tile_axis; | |
const int_tp bottom_index = (n * bottom_tile_axis + b) * tile_size + d; | |
top_data[index] = bottom_data[bottom_index]; | |
} | |
} | |
__kernel void TEMPLATE(tile_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int_tp tile_size, | |
const int_tp num_tiles, | |
const int_tp bottom_tile_axis, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp d = index % tile_size; | |
const int_tp b = (index / tile_size) % bottom_tile_axis; | |
const int_tp n = index / tile_size / bottom_tile_axis; | |
bottom_diff[index] = 0; | |
int_tp top_index = (n * num_tiles * bottom_tile_axis + b) * tile_size + d; | |
for (int_tp t = 0; t < num_tiles; ++t) { | |
bottom_diff[index] += top_diff[top_index]; | |
top_index += bottom_tile_axis * tile_size; | |
} | |
} | |
} | |
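// All kernels above were compiled with Dtype = float. When the device
// advertises fp64 (DOUBLE_SUPPORT_AVAILABLE), the same source is emitted
// again below with Dtype = double; TEMPLATE(name,Dtype) keeps the two sets
// of kernel names distinct.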
#ifdef DOUBLE_SUPPORT_AVAILABLE | |
#undef Dtype | |
#define Dtype double | |
#undef TYPE | |
#define TYPE TYPE_DOUBLE | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(relu_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global Dtype* out, | |
Dtype negative_slope) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope; | |
} | |
} | |
__kernel void TEMPLATE(relu_backward,Dtype)(const int_tp n, | |
__global const Dtype* in_diff, | |
__global const Dtype* in_data, | |
__global Dtype* out_diff, | |
Dtype negative_slope) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out_diff[index] = in_diff[index] | |
* ((in_data[index] > 0?1.0:0.0) + (in_data[index] <= 0?1.0:0.0) * negative_slope); | |
} | |
} | |
__kernel void TEMPLATE(tanh_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = tanh(in[index]); | |
} | |
} | |
__kernel void TEMPLATE(tanh_backward,Dtype)(const int_tp n, | |
__global const Dtype* in_diff, | |
__global const Dtype* out_data, | |
__global Dtype* out_diff) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
Dtype tanhx = out_data[index]; | |
out_diff[index] = in_diff[index] * (1 - tanhx * tanhx); | |
} | |
} | |
__kernel void TEMPLATE(sigmoid_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = 1.0 / (1.0 + exp(-in[index])); | |
} | |
} | |
__kernel void TEMPLATE(sigmoid_backward,Dtype)(const int_tp n, | |
__global const Dtype* in_diff, | |
__global const Dtype* out_data, | |
__global Dtype* out_diff) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
const Dtype sigmoid_x = out_data[index]; | |
out_diff[index] = in_diff[index] * sigmoid_x * (1 - sigmoid_x); | |
} | |
} | |
__kernel void TEMPLATE(threshold,Dtype)(const int_tp n, const Dtype threshold, | |
__global const Dtype* in, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = in[index] > threshold ? 1.0 : 0.0; | |
} | |
} | |
__kernel void TEMPLATE(prelu_forward,Dtype)(const int_tp n, const int_tp channels, | |
const int_tp dim, | |
__global const Dtype* in, | |
__global Dtype* out, | |
__global const Dtype* slope_data, | |
const int_tp div_factor) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
int_tp c = (index / dim) % channels / div_factor; | |
out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c]; | |
} | |
} | |
__kernel void TEMPLATE(prelu_backward,Dtype)(const int_tp n, const int_tp channels, | |
const int_tp dim, | |
__global const Dtype* in_diff, | |
__global const Dtype* in_data, | |
__global Dtype* out_diff, | |
__global const Dtype* slope_data, | |
const int_tp div_factor) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
int_tp c = (index / dim) % channels / div_factor; | |
out_diff[index] = in_diff[index] | |
* ((in_data[index] > 0?1.0:0.0) + (in_data[index] <= 0?1.0:0.0) * slope_data[c]); | |
} | |
} | |
__kernel void TEMPLATE(prelu_param_backward,Dtype)(const int_tp n, const int_tp rows, | |
const int_tp rowPitch, | |
__global const Dtype* in_diff, | |
__global const Dtype* in_data, | |
__global Dtype* out_diff) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out_diff[index] = in_diff[index] * in_data[index] * (in_data[index] <= 0?1.0:0.0); | |
for (int k = 1; k < rows; k++) { | |
out_diff[index] += in_diff[index + k * rowPitch] | |
* in_data[index + k * rowPitch] | |
* (in_data[index + k * rowPitch] <= 0?1.0:0.0); | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(gpu_set,Dtype)(const int_tp n, const Dtype alpha, __global Dtype* y) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index] = alpha; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(br_forward,Dtype)(const int_tp count, const int_tp inner_dim, | |
__global const Dtype* in, | |
__global const Dtype* permut, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / (inner_dim); | |
int_tp in_n = (int_tp) (permut[n]); | |
out[index] = in[in_n * (inner_dim) + index % (inner_dim)]; | |
} | |
} | |
__kernel void TEMPLATE(br_backward,Dtype)(const int_tp count, const int_tp inner_dim, | |
__global const Dtype* in, | |
__global const Dtype* top_indexes, | |
__global const Dtype* begins, | |
__global const Dtype* counts, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / (inner_dim); | |
out[index] = 0; | |
int_tp lower = (int_tp) (begins[n]); | |
int_tp upper = lower + (int_tp) (counts[n]); | |
for (int_tp i = lower; i < upper; ++i) { | |
int_tp in_n = (int_tp) (top_indexes[i]); | |
out[index] += in[in_n * (inner_dim) + index % (inner_dim)]; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(null_kernel,Dtype)(Dtype arg) {
  // Intentional no-op; presumably a placeholder used to keep the program
  // non-empty or to flush the queue.
  Dtype out = arg;
}
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
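// Bias/scale layers. The coefficient index is (index / inner_dim) % dim:
// each coefficient is broadcast over inner_dim contiguous elements and the
// pattern repeats every bias_dim (or scale_dim) groups.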
__kernel void TEMPLATE(bias_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const Dtype* bias, | |
const int_tp bias_dim, | |
const int_tp inner_dim, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp bias_index = (index / inner_dim) % bias_dim; | |
out[index] = in[index] + bias[bias_index]; | |
} | |
} | |
__kernel void TEMPLATE(scale_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const Dtype* scale, | |
const int_tp scale_dim, | |
const int_tp inner_dim, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp scale_index = (index / inner_dim) % scale_dim; | |
out[index] = in[index] * scale[scale_index]; | |
} | |
} | |
__kernel void TEMPLATE(scale_bias_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const Dtype* scale, | |
__global const Dtype* bias, | |
const int_tp scale_dim, | |
const int_tp inner_dim, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp scale_index = (index / inner_dim) % scale_dim; | |
out[index] = in[index] * scale[scale_index] + bias[scale_index]; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
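// BNLL: computes log(1 + exp(x)) stably via the identity
// log(1 + exp(x)) = x + log(1 + exp(-x)) for x > 0, so exp() never
// overflows; the backward pass likewise clamps its exponent at 50.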
__kernel void TEMPLATE(bnll_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
if (in[index] > 0.0f) { | |
out[index] = in[index] + log((Dtype) (1.0 + exp(-in[index]))); | |
} else { | |
out[index] = log((Dtype) (1.0 + exp(in[index]))); | |
} | |
} | |
} | |
__kernel void TEMPLATE(bnll_backward,Dtype)(const int_tp n, | |
__global const Dtype* in_diff, | |
__global const Dtype* in_data, | |
__global Dtype* out_diff) { | |
Dtype kBNLL_THRESHOLD = 50.; | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
Dtype expval = exp(min(in_data[index], kBNLL_THRESHOLD)); | |
out_diff[index] = in_diff[index] * expval / (expval + 1.); | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
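// Softmax building blocks. A numerically stable channel-axis softmax is
// assembled from these kernels: channel_max, subtract the max, exp,
// channel_sum, divide. kernel_channel_dot (per-position dot product across
// channels) is used by the backward pass.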
__kernel void TEMPLATE(kernel_channel_max,Dtype)(const int_tp num, const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* data, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < num * spatial_dim; index += | |
get_global_size(0)) { | |
int_tp n = index / spatial_dim; | |
int_tp s = index % spatial_dim; | |
    Dtype maxval = -FLT_MAX;
    for (int_tp c = 0; c < channels; ++c) {
      maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
} | |
out[index] = maxval; | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_subtract,Dtype)(const int_tp count, const int_tp num, | |
const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* channel_max, | |
__global Dtype* data) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / channels / spatial_dim; | |
int_tp s = index % spatial_dim; | |
data[index] -= channel_max[n * spatial_dim + s]; | |
} | |
} | |
__kernel void TEMPLATE(kernel_exp,Dtype)(const int_tp count, __global const Dtype* data, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
out[index] = exp(data[index]); | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_sum,Dtype)(const int_tp num, const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* data, | |
__global Dtype* channel_sum) { | |
for (int_tp index = get_global_id(0); index < num * spatial_dim; index += | |
get_global_size(0)) { | |
int_tp n = index / spatial_dim; | |
int_tp s = index % spatial_dim; | |
Dtype sum = 0; | |
for (int_tp c = 0; c < channels; ++c) { | |
sum += data[(n * channels + c) * spatial_dim + s]; | |
} | |
channel_sum[index] = sum; | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_div,Dtype)(const int_tp count, const int_tp num, | |
const int_tp channels, const int_tp spatial_dim, | |
__global const Dtype* channel_sum, | |
__global Dtype* data) { | |
for (int_tp index = get_global_id(0); index < count; | |
index += get_global_size(0)) { | |
int_tp n = index / channels / spatial_dim; | |
int_tp s = index % spatial_dim; | |
data[index] /= channel_sum[n * spatial_dim + s]; | |
} | |
} | |
__kernel void TEMPLATE(kernel_channel_dot,Dtype)(const int_tp num, const int_tp channels, | |
const int_tp spatial_dim, | |
__global const Dtype* data_1, | |
__global const Dtype* data_2, | |
__global Dtype* channel_dot) { | |
for (int_tp index = get_global_id(0); index < num * spatial_dim; index += | |
get_global_size(0)) { | |
int_tp n = index / spatial_dim; | |
int_tp s = index % spatial_dim; | |
Dtype dot = 0; | |
for (int_tp c = 0; c < channels; ++c) { | |
dot += (data_1[(n * channels + c) * spatial_dim + s] | |
* data_2[(n * channels + c) * spatial_dim + s]); | |
} | |
channel_dot[index] = dot; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(concat,Dtype)(const int_tp nthreads, __global const Dtype* in_data, | |
const int forward, const int_tp num_concats, | |
const int_tp concat_size, | |
const int_tp top_concat_axis, | |
const int_tp bottom_concat_axis, | |
const int_tp offset_concat_axis, | |
__global Dtype* out_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp total_concat_size = concat_size * bottom_concat_axis; | |
const int_tp concat_num = index / total_concat_size; | |
const int_tp concat_index = index % total_concat_size; | |
const int_tp top_index = concat_index | |
+ (concat_num * top_concat_axis + offset_concat_axis) * concat_size; | |
if (forward == 1) { | |
out_data[top_index] = in_data[index]; | |
} else { | |
out_data[index] = in_data[top_index]; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
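// Contrastive loss, backward pass: similar pairs (y != 0) get alpha * diff;
// dissimilar pairs get -alpha * (margin - d) / d * diff while still inside
// the margin, else 0. The legacy variant applies the margin to the squared
// distance and uses a constant -alpha instead.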
__kernel void TEMPLATE(cll_backward,Dtype)(const int_tp count, const int_tp channels, | |
const Dtype margin, const Dtype alpha, __global const Dtype* y, | |
__global const Dtype* diff, __global const Dtype* dist_sq, | |
__global Dtype *bottom_diff) { | |
for (int_tp i = get_global_id(0); i < count; | |
i += get_global_size(0)) { | |
int_tp n = i / channels; // the num index, to access y and dist_sq | |
if (trunc(y[n]) != 0.) { // similar pairs | |
bottom_diff[i] = alpha * diff[i]; | |
} else { // dissimilar pairs | |
Dtype mdist = 0.; | |
Dtype beta = 0.; | |
Dtype dist = sqrt(dist_sq[n]); | |
mdist = (margin - dist); | |
beta = -alpha * mdist / (dist + 1e-4) * diff[i]; | |
if (mdist > 0.) { | |
bottom_diff[i] = beta; | |
} else { | |
bottom_diff[i] = 0; | |
} | |
} | |
} | |
} | |
__kernel void TEMPLATE(cll_backward_legacy,Dtype)(const int count, const int channels, | |
const Dtype margin, const Dtype alpha, __global Dtype* y, | |
__global Dtype* diff, __global Dtype* dist_sq, | |
__global Dtype* bottom_diff) { | |
for (int_tp i = get_global_id(0); i < count; | |
i += get_global_size(0)) { | |
int n = i / channels; // the num index, to access y and dist_sq | |
if (trunc(y[n]) != 0.) { // similar pairs | |
bottom_diff[i] = alpha * diff[i]; | |
} else { // dissimilar pairs | |
Dtype mdist = 0.; | |
Dtype beta = 0.; | |
mdist = (margin - dist_sq[n]); | |
beta = -alpha; | |
if (mdist > 0.) { | |
bottom_diff[i] = beta; | |
} else { | |
bottom_diff[i] = 0; | |
} | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(conv_layer_spatial_phony,Dtype)(Dtype arg) { | |
Dtype out = arg; | |
} | |
#define __CAT(x, y) x##y | |
#define CAT(x, y) __CAT(x, y) | |
#define LOOP1(VAR, STMT) (STMT); (VAR)++; | |
#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++; | |
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT)) | |
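// LOOP(N, VAR, STMT) token-pastes to LOOPN, which expands STMT exactly N
// times and increments VAR after each expansion: a compile-time unroll for
// literal N in 1..16.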
#ifdef MULTI | |
__kernel void CFMulti(__global Dtype* image_data, int_tp image_offset, | |
__global Dtype* kernel_data, int_tp kernel_offset, | |
__global Dtype* bias,const int_tp bias_offset, | |
__global Dtype* convolved_image,const int_tp convolved_image_offset, | |
const ushort WIDTH, | |
const ushort HEIGHT, | |
const ushort OUTPUT_W, | |
const ushort OUTPUT_H) { | |
const int_tp outputX = get_global_id(0); | |
const int_tp outputY = get_global_id(1); | |
const int_tp kernelNum = get_global_id(2)*ZPAR; | |
if(outputX < OUTPUT_W && outputY < OUTPUT_H) | |
{ | |
Dtype sum[ZPAR]; | |
Dtype4 vectorSum[ZPAR]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
{ | |
sum[kern] = 0.0f; | |
      vectorSum[kern] = (Dtype4)(0.0f, 0.0f, 0.0f, 0.0f);  // needs the (Dtype4) cast: a bare parenthesized list is a comma expression
} | |
const int_tp currentKernelOffset = kernel_offset + kernelNum*KERNEL_H*KERNEL_W*CHANNELS; | |
const int_tp biasIndex=bias_offset + kernelNum; | |
const int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W; | |
const int_tp imageSize = WIDTH*HEIGHT; | |
const int_tp float4Reads = KERNEL_W / 4; | |
const int_tp floatReads = KERNEL_W % 4; | |
Dtype4 imageCache; | |
__global Dtype* image_dataPtrFloat = (image_data + (image_offset + local_image_offset)); | |
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset)); | |
for(int_tp c = 0; c < CHANNELS; c++) | |
{ | |
for(int_tp y = 0; y < KERNEL_H; y++) | |
{ | |
for(int_tp x=0; x< float4Reads; x++) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[x]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
{ | |
vectorSum[kern] += imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[x]; | |
} | |
} | |
if(floatReads == 1) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
vectorSum[kern].s0 += ( imageCache * ( (__global Dtype4*) &(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]) )[float4Reads] ).s0; | |
} | |
else if(floatReads == 2) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
vectorSum[kern].s01 += (imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[float4Reads]).s01; | |
} | |
else if(floatReads == 3) | |
{ | |
imageCache = ((__global Dtype4*)image_dataPtrFloat)[float4Reads]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
vectorSum[kern].s012 += (imageCache*((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNEL_H*KERNEL_W*CHANNELS]))[float4Reads]).s012; | |
} | |
image_dataPtrFloat += WIDTH; | |
kernel_dataPtrFloat += KERNEL_W; | |
} | |
image_dataPtrFloat += imageSize - WIDTH*KERNEL_H; | |
} | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
sum[kern] = vectorSum[kern].x + vectorSum[kern].y + vectorSum[kern].z + vectorSum[kern].w; | |
if(APPLY_BIAS == 1) | |
{ | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
if(kernelNum+kern < OUTPUT_Z) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX] = | |
sum[kern] + bias[biasIndex +kern]; | |
} | |
else | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
if(kernelNum+kern < OUTPUT_Z) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX] = sum[kern]; | |
} | |
} | |
#endif | |
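// The _11_11_4 specialization below looks tuned for 11x11 filters with
// stride 4 (an AlexNet-style first layer): each work-item accumulates four
// adjacent x-outputs (XPAR) from a 24-wide image strip, consuming each
// 11-tap filter row as 8 + 3 lanes.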
#ifdef MULTI_11 | |
__kernel void CFMulti_11_11_4(__global Dtype* image_data, int_tp image_offset, | |
__global Dtype* kernel_data, int_tp kernel_offset, | |
__global Dtype* bias,const int_tp bias_offset, | |
__global Dtype* convolved_image,const int_tp convolved_image_offset, | |
const ushort WIDTH, | |
const ushort HEIGHT, | |
const ushort OUTPUT_W, | |
const ushort OUTPUT_H) { | |
int_tp outputX = get_global_id(0)*XPAR; | |
int_tp outputY = get_global_id(1)*YPAR; | |
int_tp kernelNum = get_global_id(2)*ZPAR; | |
if(outputX < OUTPUT_W && outputY < OUTPUT_H) | |
{ | |
Dtype sum[XPAR*YPAR*ZPAR]; | |
for(int_tp kern =0; kern < XPAR*YPAR*ZPAR; kern++) | |
{ | |
sum[kern] = 0.0f; | |
} | |
int_tp currentKernelOffset = kernel_offset + kernelNum*KERNELSIZE*CHANNELS; | |
int_tp biasIndex=bias_offset + kernelNum; | |
int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W; | |
int_tp imageSize = WIDTH*HEIGHT; | |
int_tp index; | |
__global Dtype* image_dataPtrFloat = (image_data + (image_offset + local_image_offset)); | |
__global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset)); | |
Dtype16 imageCache; | |
Dtype8 imageCacheR; | |
Dtype8 kernelCache; | |
Dtype4 kernelCacheR; | |
for(int_tp c = 0; c < CHANNELS; c++) | |
{ | |
for(int_tp y = 0; y < 11; y++) | |
{ | |
imageCache = ((__global Dtype16*)image_dataPtrFloat)[0]; | |
imageCacheR =((__global Dtype8*)image_dataPtrFloat)[2]; | |
for(int_tp kern =0; kern < ZPAR; kern++) | |
{ | |
kernelCache = ((__global Dtype8*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[0]; | |
kernelCacheR = ((__global Dtype4*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[2]; | |
index = kern*XPAR; | |
sum[index + 0] += dot(imageCache.S0123,kernelCache.S0123); | |
sum[index + 1] += dot(imageCache.S4567,kernelCache.S0123); | |
sum[index + 2] += dot(imageCache.S89AB,kernelCache.S0123); | |
sum[index + 3] += dot(imageCache.SCDEF,kernelCache.S0123); | |
sum[index + 0] += dot(imageCache.S4567,kernelCache.S4567); | |
sum[index + 1] += dot(imageCache.S89AB,kernelCache.S4567); | |
sum[index + 2] += dot(imageCache.SCDEF,kernelCache.S4567); | |
sum[index + 3] += dot(imageCacheR.S0123,kernelCache.S4567); | |
sum[index + 0] += dot(imageCache.S89A,kernelCacheR.S012); | |
sum[index + 1] += dot(imageCache.SCDE,kernelCacheR.S012); | |
sum[index + 2] += dot(imageCacheR.S012,kernelCacheR.S012); | |
sum[index + 3] += dot(imageCacheR.S456,kernelCacheR.S012); | |
} | |
image_dataPtrFloat += WIDTH; | |
kernel_dataPtrFloat += KERNEL_W; | |
} | |
image_dataPtrFloat += imageSize - WIDTH*KERNEL_H; | |
} | |
if(APPLY_BIAS == 1) | |
{ | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
{ | |
for(int_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX + wi] = | |
sum[kern*XPAR + wi] + bias[biasIndex +kern]; | |
} | |
} | |
else | |
for(int_tp kern = 0; kern < ZPAR; kern++) | |
for(int_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + outputY*OUTPUT_W + outputX + wi] = sum[kern*XPAR + wi]; | |
} | |
} | |
#endif | |
#ifdef MULTI_GEN | |
__kernel void CFMulti_6(__global const Dtype* restrict image_data, const int_tp image_offset, | |
__global const Dtype* restrict kernel_data, const int_tp kernel_offset, | |
__global const Dtype* restrict bias,const int_tp bias_offset, | |
__global Dtype* restrict convolved_image,const int_tp convolved_image_offset, | |
const ushort WIDTH, | |
const ushort HEIGHT, | |
const ushort OUTPUT_W, | |
const ushort OUTPUT_H) { | |
const int_tp outputX = get_global_id(0)*XPAR; | |
const int_tp outputY = get_global_id(1)*YPAR; | |
const int_tp kernelNum = get_global_id(2)*ZPAR; | |
if(outputX < OUTPUT_W && outputY < OUTPUT_H) | |
{ | |
Dtype sum[XPAR*YPAR*ZPAR]; | |
for(uint_tp kern = 0; kern < XPAR*YPAR*ZPAR; kern++) | |
sum[kern] = 0.0f; | |
const int_tp currentKernelOffset = kernel_offset + kernelNum*KERNELSIZE*CHANNELS; | |
const int_tp biasIndex=bias_offset + kernelNum; | |
const int_tp local_image_offset = outputY*STRIDE_H*WIDTH + outputX*STRIDE_W; | |
const int_tp imageSize = WIDTH*HEIGHT; | |
int_tp index; | |
__global const Dtype* image_dataPtrFloat[2]; | |
image_dataPtrFloat[0] = (image_data + (image_offset + local_image_offset)); | |
image_dataPtrFloat[1] = image_dataPtrFloat[0]; | |
__global const Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset)); | |
DTImage imageCache[YPAR]; | |
DTKernel kernelCache; | |
Dtype4 temp; | |
for(uint_tp c = 0; c < CHANNELS; c++) | |
{ | |
imageCache[0] = ((__global DTImage*)image_dataPtrFloat[1])[0]; | |
for(uint_tp preload = 1; preload < YPAR; preload++) | |
{ | |
image_dataPtrFloat[1] += WIDTH; | |
imageCache[preload] = ((__global DTImage*)image_dataPtrFloat[1])[0]; | |
} | |
int_tp y =0; | |
LOOP(KERNEL_H, y, | |
{ | |
int_tp kern=0; | |
LOOP(ZPAR, kern, | |
{ | |
kernelCache = ((__global DTKernel*)&(kernel_dataPtrFloat[kern*KERNELSIZE*CHANNELS]))[0]; | |
index = kern*XPAR*YPAR; | |
for(uint_tp y_par = 0; y_par < YPAR; y_par++) | |
{ | |
temp = floatDotV4(imageCache[y_par],kernelCache); | |
sum[index + y_par*XPAR + 0] += temp.s0; | |
sum[index + y_par*XPAR + 1] += temp.s1; | |
sum[index + y_par*XPAR + 2] += temp.s2; | |
sum[index + y_par*XPAR + 3] += temp.s3; | |
} | |
}); | |
kernel_dataPtrFloat += KERNEL_W; | |
for(uint_tp rotateData = 0; rotateData < YPAR - 1; rotateData++) | |
imageCache[rotateData] = imageCache[rotateData + 1]; | |
image_dataPtrFloat[1] += WIDTH; | |
imageCache[YPAR - 1] = ((__global DTImage*)image_dataPtrFloat[1])[0]; | |
}); | |
image_dataPtrFloat[0] += imageSize; | |
image_dataPtrFloat[1] = image_dataPtrFloat[0]; | |
} | |
if(APPLY_BIAS == 1) | |
{ | |
for(uint_tp kern = 0; kern < ZPAR; kern++) | |
{ | |
for(uint_tp hi =0; hi < YPAR; hi++) | |
for(uint_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W && outputY + hi < OUTPUT_H) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + (outputY +hi)*OUTPUT_W + outputX + wi] = | |
sum[kern*XPAR*YPAR + XPAR*hi + wi] + bias[biasIndex +kern]; | |
} | |
} | |
else | |
for(uint_tp kern = 0; kern < ZPAR; kern++) | |
for(uint_tp hi =0; hi < YPAR; hi++) | |
for(uint_tp wi =0; wi < XPAR; wi++) | |
if(kernelNum+kern < OUTPUT_Z && outputX + wi < OUTPUT_W && outputY + hi < OUTPUT_H) | |
convolved_image[convolved_image_offset + (kernelNum+kern)*OUTPUT_H*OUTPUT_W + (outputY + hi)*OUTPUT_W + outputX + wi] = sum[kern*XPAR*YPAR +XPAR*hi +wi]; | |
} | |
} | |
#endif | |
// Begin IDLF kernels below here.
#ifdef IDLF | |
#define activation_function(x) (x) | |
#if 0 | |
#define _IW INPUT_WIDTH | |
#define _IH INPUT_HEIGHT | |
#define _OW OUTPUT_WIDTH | |
#define _OH OUTPUT_HEIGHT | |
#endif | |
#define _ID INPUT_DEPTH | |
#define _OD NUM_FILTERS | |
#define FILTER_DEPTH INPUT_DEPTH | |
#define NUM_INPUT INPUT_DEPTH | |
#define NUM_OUTPUT NUM_FILTERS | |
#define KERNEL FILTER_WIDTH | |
// convolution stride, same for x and y | |
#define K_STRIDE STRIDEX | |
#ifndef IWPAD | |
#define IWPAD 0 | |
#endif | |
#ifndef IHPAD | |
#define IHPAD 0 | |
#endif | |
#define OUT_BLOCK_SIZE (OUT_BLOCK_WIDTH*OUT_BLOCK_HEIGHT) | |
#ifndef MASTER_OUT_BLOCK_WIDTH | |
#define MASTER_OUT_BLOCK_WIDTH OUT_BLOCK_WIDTH | |
#endif | |
#ifndef MASTER_OUT_BLOCK_HEIGHT | |
#define MASTER_OUT_BLOCK_HEIGHT OUT_BLOCK_HEIGHT | |
#endif | |
// Each work-item computes a 4x6 region of one output map. | |
// Each work-group (which will be mapped to 1 SIMD16 EU thread) will compute 16 different feature maps, but each feature map is for the same 4x6 region of the input image.
// NDRange: (_OW+pad)/ OUT_BLOCK_WIDTH, (_OH+pad)/OUT_BLOCK_HEIGHT, _OD/OUT_BLOCK_DEPTH | |
//#define SIMD_SIZE 16 | |
// NOTE: this reqd_work_group_size does not guarantee that SIMD16 mode will be used, the compiler could choose to use two SIMD8 threads, and if that happens the code will break. | |
#ifdef SIMD16 | |
#define TILE_X ((OUT_BLOCK_WIDTH - 1) * STRIDEX + KERNEL) | |
#define TILE_Y ((OUT_BLOCK_HEIGHT - 1) * STRIDEY + KERNEL) | |
#if (TILE_X % 4) != 0 | |
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE))) | |
kernel void | |
convolve_simd16( // __global float *inputs, __global float* weights, __global float* outputs | |
__global float* inputs_base, | |
filter_qualifier float* weights_base, | |
__global float* biases_base, | |
__global float* outputs_base, | |
const ushort _IW, | |
const ushort _IH, | |
const ushort _OW, | |
const ushort _OH) | |
{ | |
__global float* outputs = outputs_base; | |
__global float* inputs = inputs_base; | |
filter_qualifier float* weights = weights_base; | |
__global float* biases = biases_base; | |
uint_tp oc = get_global_id(0) * MASTER_OUT_BLOCK_WIDTH; // oc = Output Column | |
uint_tp or = get_global_id(1) * MASTER_OUT_BLOCK_HEIGHT;// or = Output Row | |
uint_tp fm = get_global_id(2);// fm = Feature Map = od = Output Depth | |
uint_tp fmg = get_group_id(2); | |
uint_tp lid = get_local_id(2); | |
float in[IN_BUFFER_SIZE];// load 11x16 block of input data, really only need 11x15 for 4x6 outputs, but keep it simple. | |
//float out[24]; // 4x6 block of outputs that is SIMD_SIZE deep (along the Feature Map dimension). | |
float out[OUT_BLOCK_SIZE]; | |
uint_tp in_addr; | |
// find the weight address of the given neuron (lid is the index)
uint_tp weight_addr = (fmg % (_OD/SIMD_SIZE)) * INPUT_DEPTH * KERNEL * KERNEL * SIMD_SIZE + lid; | |
for(int_tp i=0;i<OUT_BLOCK_SIZE;i++) { | |
out[i]=0.0f; | |
} | |
uint_tp num_in_batch = fm / _OD; | |
uint_tp input_batch_offset = num_in_batch * (_IH + IHPAD) * (_IW + IWPAD) * TOTAL_INPUT_DEPTH_SIZE; | |
for(int_tp kd = 0; kd < _ID; kd++) | |
{ | |
in_addr = input_batch_offset + (kd + INPUT_START_Z) * (_IH + IHPAD) * (_IW + IWPAD) + (or*K_STRIDE + INPUT_START_Y) * (_IW + IWPAD) + (oc*K_STRIDE + INPUT_START_X) + lid; | |
// read 11x16 input block into registers. | |
for(uint_tp reg = 0; reg < IN_BUFFER_SIZE; reg++) { | |
in[reg] = inputs[in_addr]; // each work-item reads one element (16 per row across the sub-group)
in_addr += (_IW + IWPAD);// move to next row down | |
} | |
// WEIGHT_PREF can only be 4 or 8; other values are not supported.
#define WEIGHT_PREF 8 | |
union { | |
float w[WEIGHT_PREF]; | |
uint8 ui8; | |
} weight_buf; | |
int_tp w_idx=0; | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
uint_tp orig_weight_addr = weight_addr; | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; | |
int_tp kr = 0; // kr = Kernel Row | |
LOOP(KERNEL, kr,// LOOP is a macro that unrolls the loop. | |
{ | |
int_tp kc = 0; // kc = Kernel Column | |
LOOP(KERNEL, kc, | |
{ | |
for(int_tp br=0; br < OUT_BLOCK_HEIGHT; br++) { | |
for(int_tp bc=0; bc < OUT_BLOCK_WIDTH; bc++) { | |
float input = intel_sub_group_shuffle( in[br * K_STRIDE + kr], bc * K_STRIDE + kc); | |
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]); | |
} | |
} | |
// We assume KERNEL_W is equal to KERNEL_H here. | |
if ((w_idx + 1) % WEIGHT_PREF == 0 | |
#if KERNEL*KERNEL % 8 != 0 | |
&& ((w_idx + 1) <= (KERNEL * KERNEL - WEIGHT_PREF)) | |
#endif | |
) { | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details. | |
} | |
#if KERNEL*KERNEL % 8 == 0 | |
// no remainder handling needed when KERNEL*KERNEL is a multiple of 8
#else | |
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL * KERNEL - WEIGHT_PREF))) | |
#if KERNEL*KERNEL % 8 == 1 | |
weight_buf.w[0] = weights[weight_addr]; | |
#elif KERNEL*KERNEL % 4 == 0 | |
weight_buf.ui8.s0123 = intel_sub_group_block_read4((__global uint *)&weights[weight_addr]); | |
#else | |
// should never get here if kernel_w equals kernel_h; just in case.
#error unsupported kernel size. | |
#endif | |
#endif | |
++w_idx; | |
}); | |
}); | |
weight_addr = orig_weight_addr + KERNEL * KERNEL * SIMD_SIZE; | |
} | |
#ifdef IMAGE_AS_OUTPUT | |
// TODO: no ULT for that one yet! | |
uint_tp out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD)) * (_OW + OWPAD) * (_OH + OHPAD);// out_addr indexes into start of 16 feature maps. | |
#else | |
// we need this address calculation for outputs because we support views and batching | |
uint_tp out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD) ) * (_OW + OWPAD) * (_OH + OHPAD); | |
#endif | |
out_addr += or * (_OW + OWPAD) + oc; // offset of the output block this work-item computes
// we need this address calculation for biases because we support views and batching | |
float bias = biases[(fm) % _OD ]; | |
#ifndef WRITE_PADDED_VALUES | |
if(get_global_id(0) != (get_global_size(0)-1) && | |
get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
#endif | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
// this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer. | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c])); | |
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
#ifndef WRITE_PADDED_VALUES | |
} else if ( get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c])); | |
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
} | |
else if ( get_global_id(0) != (get_global_size(0)-1) ) | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c])); | |
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
} | |
else | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
#ifdef IMAGE_AS_OUTPUT | |
write_imagef(outputs,(int2)(out_addr + r * (_OW + OWPAD) + c,num_in_batch),activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]));
#else | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
#endif | |
} | |
} | |
} | |
#endif //#ifndef WRITE_PADDED_VALUES | |
} | |
#endif | |
#if TILE_X % 4 == 0 | |
#define TILE_Y_STRIDE (64 / TILE_X) | |
#define INVEC_NUM ((TILE_Y + TILE_Y_STRIDE - 1) / TILE_Y_STRIDE) | |
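// Note: one float4 load per lane fetches 16 lanes * 4 floats = 64 input values
// across the sub-group, i.e. TILE_Y_STRIDE = 64 / TILE_X tile rows per load;
// INVEC_NUM is the number of such loads needed to cover all TILE_Y rows.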
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE))) | |
kernel void | |
convolve_simd16( // __global float *inputs, __global float* weights, __global float* outputs | |
__global float* inputs_base, | |
filter_qualifier float* weights_base, | |
__global float* biases_base, | |
__global float* outputs_base, | |
const ushort _IW, | |
const ushort _IH, | |
const ushort _OW, | |
const ushort _OH) | |
{ | |
__global float* outputs = outputs_base; | |
__global float* inputs = inputs_base; | |
filter_qualifier float* weights = weights_base; | |
__global float* biases = biases_base; | |
uint_tp oc = get_global_id(0) * MASTER_OUT_BLOCK_WIDTH; // oc = Output Column | |
uint_tp or = get_global_id(1) * MASTER_OUT_BLOCK_HEIGHT;// or = Output Row | |
uint_tp fm = get_global_id(2);// fm = Feature Map = od = Output Depth | |
uint_tp fmg = get_group_id(2); | |
uint_tp lid = get_local_id(2); | |
float out[OUT_BLOCK_SIZE]; | |
uint_tp in_addr; | |
// find the weight address of the given neuron (lid is the index)
uint_tp weight_addr = (fmg % (_OD/SIMD_SIZE)) * INPUT_DEPTH * KERNEL * KERNEL * SIMD_SIZE + lid; | |
for(int_tp i=0;i<OUT_BLOCK_SIZE;i++) { | |
out[i]=0.0f; | |
} | |
uint_tp num_in_batch = ( fm ) / _OD; | |
uint_tp input_batch_offset = num_in_batch * (_IH + IHPAD) * (_IW + IWPAD) * TOTAL_INPUT_DEPTH_SIZE; | |
in_addr = input_batch_offset + INPUT_START_Z * (_IH + IHPAD) * (_IW + IWPAD) + (or*STRIDEY + INPUT_START_Y) * (_IW + IWPAD) + (oc*STRIDEX + INPUT_START_X) | |
+ ( lid / ( TILE_X / 4 ) ) * (_IW + IWPAD) * STRIDEY // y tile offset | |
+ ( lid % ( TILE_X / 4 ) ) * 4 * STRIDEX; // x tile offset | |
for(int_tp kd = 0; kd < _ID; kd++) | |
{ | |
union { | |
float4 in_vec[INVEC_NUM]; | |
float in_array[INVEC_NUM * 4]; | |
} in_buf; | |
uint_tp in_offset = in_addr; | |
int_tp reg = 0; | |
#if INVEC_NUM == 1 | |
LOOP(1, reg, | |
#elif INVEC_NUM == 2 | |
LOOP(2, reg, | |
#elif INVEC_NUM == 3 | |
LOOP(3, reg, | |
#elif INVEC_NUM == 4 | |
LOOP(4, reg, | |
#else | |
#error INVEC_NUM too large.
#endif | |
{ | |
in_buf.in_vec[reg] = *(global float4*)(inputs + in_offset); // each work-item reads a float4 (64 elements across the sub-group)
in_offset += (_IW + IWPAD) * TILE_Y_STRIDE; | |
}); | |
in_addr += (_IH + IHPAD) * (_IW + IWPAD); | |
// WEIGHT_PREF can only be 4 or 8; other values are not supported.
#define WEIGHT_PREF 8 | |
union { | |
float w[WEIGHT_PREF]; | |
uint8 ui8; | |
} weight_buf; | |
int_tp w_idx=0; | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
uint_tp orig_weight_addr = weight_addr; | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; | |
#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4)) | |
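// Decode sketch for BLOCK_IN: the TILE_X-wide, row-major input tile is stored
// as float4s spread across the 16 lanes, so element n = row * TILE_X + col
// lives in register n / 64, lane (n % 64) / 4, float slot n % 4. E.g. with
// TILE_X = 16, n = 70 (row 4, col 6) resolves to register 1, lane 1, slot 2;
// BLOCK_IN(n) broadcasts that float from its owning lane to the whole sub-group.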
int_tp kr = 0; // kr = Kernel Row | |
LOOP(KERNEL, kr,// LOOP is a macro that unrolls the loop. | |
{ | |
int_tp kc = 0; // kc = Kernel Column | |
LOOP(KERNEL, kc, | |
{ | |
for(int_tp br=0; br < OUT_BLOCK_HEIGHT; br++) { | |
for(int_tp bc=0; bc < OUT_BLOCK_WIDTH; bc++) { | |
float input = BLOCK_IN((br * STRIDEY + kr) * TILE_X + bc * STRIDEX + kc);//intel_sub_group_shuffle( in[br * K_STRIDE + kr], bc * K_STRIDE + kc); | |
out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]); | |
} | |
} | |
// We assume KERNEL_W is equal to KERNEL_H here. | |
if ((w_idx + 1) % WEIGHT_PREF == 0 | |
#if KERNEL*KERNEL % 8 != 0 | |
&& ((w_idx + 1) <= (KERNEL * KERNEL - WEIGHT_PREF)) | |
#endif | |
) { | |
weight_buf.ui8 = intel_sub_group_block_read8((__global uint *)&weights[weight_addr]); | |
weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details. | |
} | |
#if KERNEL*KERNEL % 8 == 0 | |
// no remainder handling needed when KERNEL*KERNEL is a multiple of 8
#else | |
else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL * KERNEL - WEIGHT_PREF))) | |
#if KERNEL*KERNEL % 8 == 1 | |
weight_buf.w[0] = weights[weight_addr]; | |
#elif KERNEL*KERNEL % 4 == 0 | |
weight_buf.ui8.s0123 = intel_sub_group_block_read4((__global uint *)&weights[weight_addr]); | |
#else | |
// should never get here if kernel_w equals kernel_h; just in case.
#error unsupported kernel size. | |
#endif | |
#endif | |
++w_idx; | |
}); | |
}); | |
weight_addr = orig_weight_addr + KERNEL * KERNEL * SIMD_SIZE; | |
} | |
// we need this address calculation for outputs because we support views and batching | |
uint_tp out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + (fm % _OD) ) * (_OW + OWPAD) * (_OH + OHPAD); | |
out_addr += or * (_OW + OWPAD) + oc; // offset of the output block this work-item computes
// we need this address calculation for biases because we support views and batching | |
float bias = biases[(fm) % _OD ]; | |
#ifndef WRITE_PADDED_VALUES | |
if(get_global_id(0) != (get_global_size(0)-1) && | |
get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
#endif | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
// this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer. | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
#ifndef WRITE_PADDED_VALUES | |
} else if ( get_global_id(1) != (get_global_size(1)-1) ) | |
{ | |
for(uint_tp r = 0; r < OUT_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
} | |
else if ( get_global_id(0) != (get_global_size(0)-1) ) | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < OUT_BLOCK_WIDTH; c++) { | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
} | |
else | |
{ | |
for(uint_tp r = 0; r < LAST_BLOCK_HEIGHT; r++) { | |
for(uint_tp c = 0; c < LAST_BLOCK_WIDTH; c++) { | |
outputs[out_addr + r * (_OW + OWPAD) + c] = activation_function(bias + out[r * OUT_BLOCK_WIDTH + c]); | |
} | |
} | |
} | |
#endif //#ifndef WRITE_PADDED_VALUES | |
} | |
#endif // TILE_X % 4 == 0
#endif | |
#endif | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(copyImage, Dtype) | |
(__global Dtype* image_data, | |
int_tp image_offset, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp adjustedHeight, const int_tp adjustedWidth, | |
const int_tp pad_h, const int_tp pad_w, | |
__global Dtype* output_image, | |
const int_tp output_offset, | |
const int_tp batch_size) { | |
uint_tp sX = get_global_id(0); | |
uint_tp sY = get_global_id(1); | |
uint_tp sZ = get_global_id(2); | |
int_tp in_y = sY - pad_h; | |
int_tp in_x = sX - pad_w; | |
int_tp batch_offset = 0; | |
int_tp adjusted_batch_offset = 0; | |
for(uint_tp batch_idx = 0; batch_idx < batch_size; batch_idx++) { | |
int_tp dst_offset = adjusted_batch_offset + output_offset + sZ*adjustedHeight*adjustedWidth + sY*adjustedWidth +sX; | |
int_tp src_offset = batch_offset + image_offset + sZ*height*width + in_y*width + in_x; | |
if((in_y >= 0 && in_y < height && in_x >= 0 && in_x < width)) | |
output_image[dst_offset] = image_data[src_offset]; | |
else | |
output_image[dst_offset] = 0; | |
batch_offset += height * width * channels; | |
adjusted_batch_offset += adjustedHeight * adjustedWidth * channels; | |
} | |
} | |
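// Note: copyImage copies each channel into a zero-padded adjustedHeight x
// adjustedWidth destination (borders of pad_h/pad_w become 0), once per image
// in the batch.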
__kernel void TEMPLATE(copyWeightsSwizzled, Dtype) | |
(__global Dtype* weightIn, | |
__global Dtype* weightOut, | |
const int_tp kernel_w, | |
const int_tp kernel_h, | |
const int_tp channels, | |
const int_tp outputs, | |
const int_tp swizzleFactor) { | |
uint_tp sX = get_global_id(0); | |
// Decompose the flat id into the original (OIHW) location and the swizzled output location below.
int_tp outputSublayer = channels / swizzleFactor; | |
int_tp outputSublayerIndex = channels % swizzleFactor; | |
int_tp filter = sX / (kernel_w*kernel_h*channels); | |
int_tp kernel_X = sX % kernel_w; | |
int_tp kernel_Y = (sX / kernel_w) % kernel_h; | |
int_tp kernel_C = (sX / (kernel_w * kernel_h)) % channels; | |
int_tp FP = filter / swizzleFactor; | |
int_tp F1 = filter % swizzleFactor; | |
weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1] | |
= weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X]; | |
} | |
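/* Layout sketch for copyWeightsSwizzled: OIHW weights are regrouped so that
 * swizzleFactor (S) consecutive output filters form the innermost dimension:
 *   out[filter / S][c][y][x][filter % S] = in[filter][c][y][x]
 * e.g. with S = 16, filter 35 lands in group FP = 2 at slot F1 = 3, so one
 * SIMD16 sub-group block read fetches the same (c, y, x) weight for 16 filters. */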
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(crop_copy, Dtype)(const int_tp n, const int_tp height, | |
const int_tp width, | |
const int_tp src_outer_stride, | |
const int_tp src_inner_stride, | |
const int_tp dest_outer_stride, | |
const int_tp dest_inner_stride, | |
__global const Dtype* src, | |
const int_tp src_off, | |
__global Dtype* dest, | |
const int_tp dest_off) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
int_tp src_start = index / height * src_outer_stride | |
+ index % height * src_inner_stride; | |
int_tp dest_start = index / height * dest_outer_stride | |
+ index % height * dest_inner_stride; | |
for (int_tp i = 0; i < width; ++i) { | |
dest[dest_off + dest_start + i] = src[src_off + src_start + i]; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(dropout_forward,Dtype)(const int_tp n, | |
__global const Dtype* in, | |
__global const uint_tp* mask, | |
const uint_tp threshold, | |
const Dtype scale, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = in[index] * ((mask[index] > threshold)?1.0:0.0) * scale; | |
} | |
} | |
__kernel void TEMPLATE(dropout_backward,Dtype)( | |
const int_tp n, __global const Dtype* in_diff, | |
__global const uint_tp* mask, const uint_tp threshold, | |
const Dtype scale, | |
__global Dtype* out_diff) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out_diff[index] = in_diff[index] * ((mask[index] > threshold)?1.0:0.0) * scale; | |
} | |
} | |
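/* Note: mask[] holds uniform random unsigned integers and an activation is kept
 * when mask[index] > threshold. With drop ratio r the host presumably sets
 * threshold ~= r * UINT_MAX and scale = 1 / (1 - r) (inverted dropout), so no
 * extra scaling is needed at test time. */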
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(eltwise_max_forward,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data_a, | |
__global const Dtype* bottom_data_b, const int_tp blob_idx, | |
__global Dtype* top_data, | |
__global int_tp* mask) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
if (bottom_data_a[index] > bottom_data_b[index]) { | |
// only update for very first bottom_data blob (blob_idx == 0) | |
if (blob_idx == 0) { | |
maxval = bottom_data_a[index]; | |
top_data[index] = maxval; | |
maxidx = blob_idx; | |
mask[index] = maxidx; | |
} | |
} else { | |
maxval = bottom_data_b[index]; | |
top_data[index] = maxval; | |
maxidx = blob_idx + 1; | |
mask[index] = maxidx; | |
} | |
} | |
} | |
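/* Note: this kernel appears to be launched once per input blob beyond the
 * first, with top_data/mask carrying the running maximum; when the running
 * maximum (bottom_data_a) wins on a later call (blob_idx > 0), no write is
 * needed since top_data already holds that value. */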
__kernel void TEMPLATE(eltwise_max_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int_tp blob_idx, | |
__global const int_tp* mask, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
Dtype gradient = 0; | |
if (mask[index] == blob_idx) { | |
gradient += top_diff[index]; | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(elu_forward,Dtype)(const int_tp n, __global const Dtype* in,
__global Dtype* out, | |
Dtype alpha) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out[index] = in[index] > 0 ? in[index] : alpha * (exp(in[index]) - 1.0); | |
} | |
} | |
__kernel void TEMPLATE(elu_backward,Dtype)(const int_tp n, __global const Dtype* in_diff,
__global const Dtype* out_data, | |
__global const Dtype* in_data, | |
__global Dtype* out_diff, | |
Dtype alpha) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
out_diff[index] = | |
in_data[index] > 0 ? | |
in_diff[index] : in_diff[index] * (out_data[index] + alpha); | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(embed_forward,Dtype)(const int_tp nthreads, | |
__global const Dtype* bottom_data, | |
__global const Dtype* weight, | |
const int_tp M, const int_tp N, | |
const int_tp K, | |
__global Dtype* top_data) { | |
for (int_tp top_index = get_global_id(0); top_index < nthreads; | |
top_index += get_global_size(0)) { | |
const int_tp n = top_index / N; | |
const int_tp d = top_index % N; | |
const int_tp index = (int_tp)(bottom_data[n]); | |
const int_tp weight_index = index * N + d; | |
top_data[top_index] = weight[weight_index]; | |
} | |
} | |
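// In effect: top_data[n, d] = weight[(int_tp)bottom_data[n], d] -- an embedding
// lookup that copies the N-wide weight row selected by bottom_data[n] into
// output row n.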
// atomic_add from: http://suhorukov.blogspot.com/2011/12/opencl-11-atomic-operations-on-floating.html | |
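// Sketch of the technique: emulate a floating-point atomic add with a
// compare-and-swap retry loop -- read the current value, compute the desired
// sum, then attempt to publish it with atomic_cmpxchg on the raw bit pattern
// (via the union type-pun); if another work-item changed *source in the
// meantime the swap fails and the loop retries with the fresh value.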
#if (TYPE == TYPE_FLOAT) | |
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) { | |
union { | |
uint_tp intVal; | |
Dtype floatVal; | |
} newVal; | |
union { | |
uint_tp intVal; | |
Dtype floatVal; | |
} prevVal; | |
do { | |
prevVal.floatVal = *source; | |
newVal.floatVal = prevVal.floatVal + operand; | |
} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); | |
} | |
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, | |
__global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K, | |
__global Dtype* weight_diff) { | |
for (int_tp top_index = get_global_id(0); top_index < nthreads; | |
top_index += get_global_size(0)) { | |
const int_tp n = top_index / N; | |
const int_tp d = top_index % N; | |
const int_tp index = (int_tp)(bottom_data[n]); | |
const int_tp weight_index = index * N + d; | |
TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index)); | |
} | |
} | |
#endif | |
#if (TYPE == TYPE_DOUBLE) | |
#ifdef ATOMICS_64_AVAILABLE | |
inline void TEMPLATE(atomic_add,Dtype)(volatile __global Dtype *source, const Dtype operand) { | |
union { | |
unsigned long intVal; | |
Dtype floatVal; | |
} newVal; | |
union { | |
unsigned long intVal; | |
Dtype floatVal; | |
} prevVal; | |
do { | |
prevVal.floatVal = *source; | |
newVal.floatVal = prevVal.floatVal + operand; | |
} while (atom_cmpxchg((volatile __global unsigned long *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); | |
} | |
__kernel void TEMPLATE(embed_backward,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, | |
__global const Dtype* top_diff, const int_tp M, const int_tp N, const int_tp K, | |
__global Dtype* weight_diff) { | |
for (int_tp top_index = get_global_id(0); top_index < nthreads; | |
top_index += get_global_size(0)) { | |
const int_tp n = top_index / N; | |
const int_tp d = top_index % N; | |
const int_tp index = (int_tp)(bottom_data[n]); | |
const int_tp weight_index = index * N + d; | |
TEMPLATE(atomic_add,Dtype)((weight_diff + weight_index), *(top_diff + top_index)); | |
} | |
} | |
#endif | |
#endif | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(fillbuffer,Dtype)(const int_tp n, const char alpha, __global char* x, | |
const int_tp offx) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
x[index + offx] = alpha; | |
} | |
} | |
__kernel void TEMPLATE(fill,Dtype)(const int_tp n, const Dtype alpha, __global Dtype* x, | |
const int_tp offx) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
x[index + offx] = alpha; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(im2col,Dtype)(const int_tp n, | |
__global const Dtype* data_im, | |
const int_tp data_im_off, | |
const int_tp height, const int_tp width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp pad_h, | |
const int_tp pad_w, const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp dilation_h, | |
const int_tp dilation_w, | |
const int_tp height_col, | |
const int_tp width_col, | |
__global Dtype* data_col, | |
const int_tp data_col_off) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
const int_tp h_index = index / width_col; | |
const int_tp h_col = h_index % height_col; | |
const int_tp w_col = index % width_col; | |
const int_tp c_im = h_index / height_col; | |
const int_tp c_col = c_im * kernel_h * kernel_w; | |
const int_tp h_offset = h_col * stride_h - pad_h; | |
const int_tp w_offset = w_col * stride_w - pad_w; | |
__global Dtype* data_col_ptr = data_col + data_col_off; | |
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; | |
__global const Dtype* data_im_ptr = data_im + data_im_off; | |
data_im_ptr += (c_im * height + h_offset) * width + w_offset; | |
for (int_tp i = 0; i < kernel_h; ++i) { | |
for (int_tp j = 0; j < kernel_w; ++j) { | |
int_tp h_im = h_offset + i * dilation_h; | |
int_tp w_im = w_offset + j * dilation_w; | |
*data_col_ptr = | |
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? | |
data_im_ptr[i * dilation_h * width + j * dilation_w] : 0; | |
data_col_ptr += height_col * width_col; | |
} | |
} | |
} | |
} | |
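/* Worked example: with a 3x3 kernel, stride 1, no padding or dilation on a
 * 1x5x5 image, height_col = width_col = (5 - 3) / 1 + 1 = 3 and data_col is
 * (1*3*3) x (3*3): column (h_col, w_col) holds the 9 input pixels of that
 * receptive field, so the convolution reduces to one GEMM of the filter matrix
 * against data_col. */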
__kernel void TEMPLATE(col2im,Dtype)(const int_tp n, | |
__global const Dtype* data_col, | |
const int_tp data_col_off, | |
const int_tp height, const int_tp width, | |
const int_tp channels, | |
const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp pad_h, | |
const int_tp pad_w, const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp dilation_h, | |
const int_tp dilation_w, | |
const int_tp height_col, | |
const int_tp width_col, | |
__global Dtype* data_im, | |
const int_tp data_im_off) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
Dtype val = 0; | |
const int_tp w_im = index % width + pad_w; | |
const int_tp h_im = (index / width) % height + pad_h; | |
const int_tp c_im = index / (width * height); | |
int_tp kernel_extent_w = (kernel_w - 1) * dilation_w + 1; | |
int_tp kernel_extent_h = (kernel_h - 1) * dilation_h + 1; | |
// compute the start and end of the output | |
const int_tp w_col_start = | |
(w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; | |
const int_tp w_col_end = min(w_im / stride_w + 1, width_col); | |
const int_tp h_col_start = | |
(h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1; | |
const int_tp h_col_end = min(h_im / stride_h + 1, height_col); | |
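// Bounds sketch: output column w_col covers input columns
// [w_col * stride_w, w_col * stride_w + kernel_extent_w), so w_im is reachable
// iff (w_im - kernel_extent_w) / stride_w + 1 <= w_col <= w_im / stride_w,
// clamped to the valid range; the h bounds follow the same argument.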
// TODO: use LCM of stride and dilation to avoid unnecessary loops | |
for (int_tp h_col = h_col_start; h_col < h_col_end; h_col += 1) { | |
for (int_tp w_col = w_col_start; w_col < w_col_end; w_col += 1) { | |
int_tp h_k = (h_im - h_col * stride_h); | |
int_tp w_k = (w_im - w_col * stride_w); | |
if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { | |
h_k /= dilation_h; | |
w_k /= dilation_w; | |
int_tp data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * | |
height_col + h_col) * width_col + w_col; | |
val += data_col[data_col_off + data_col_index]; | |
} | |
} | |
} | |
data_im[data_im_off + index] = val; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(im2col_nd, Dtype)(const int_tp n, const int_tp num_axes, | |
const int_tp channel_axis, | |
__global const Dtype* data_im, | |
const int_tp data_im_off, | |
__global const int_tp* im_shape, | |
__global const int_tp* col_shape, | |
__global const int_tp* kernel_shape, | |
__global const int_tp* pad, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global Dtype* data_col, | |
const int_tp data_col_off) { | |
int_tp d_temp[6]; | |
int_tp d_iter[6]; | |
int_tp i; | |
__global const int_tp* im_shape_ptr = im_shape + channel_axis; | |
__global const int_tp* col_shape_ptr = col_shape + channel_axis; | |
__local int_tp shared_dilation[6]; | |
__local int_tp shared_kernel_shape[6]; | |
__local int_tp shared_pad[6]; | |
__local int_tp shared_stride[6]; | |
__local int_tp shared_col_shape[6 + 1]; | |
__local int_tp shared_im_shape[6 + 1]; | |
for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) { | |
shared_dilation[li] = dilation[li]; | |
shared_kernel_shape[li] = kernel_shape[li]; | |
shared_pad[li] = pad[li]; | |
shared_stride[li] = stride[li]; | |
} | |
for (int li = get_local_id(0); li < num_axes + 1; li += get_local_size(0)) { | |
shared_col_shape[li] = col_shape_ptr[li]; | |
shared_im_shape[li] = im_shape_ptr[li]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
// channel_in starts as the flattened column index; the loop below peels off
// the spatial indices into d_temp, leaving the input channel.
int_tp channel_in = index; | |
int_tp channel_out = 1; | |
for (i = num_axes - 1; i >= 0; --i) { | |
d_temp[i] = channel_in % shared_col_shape[i + 1]; | |
channel_in /= shared_col_shape[i + 1]; | |
channel_out *= shared_kernel_shape[i]; | |
} | |
channel_out *= channel_in; | |
int_tp data_col_inc = 1; | |
for (i = 0; i < num_axes; ++i) { | |
channel_out *= shared_col_shape[i + 1]; | |
channel_out += d_temp[i]; | |
d_temp[i] = d_temp[i] * shared_stride[i] - shared_pad[i]; | |
channel_in *= shared_im_shape[i + 1]; | |
channel_in += d_temp[i]; | |
data_col_inc *= shared_col_shape[i + 1]; | |
d_iter[i] = 0; | |
} | |
__global Dtype* data_col_ptr = data_col + data_col_off + channel_out; | |
__global const Dtype* data_im_ptr = data_im + data_im_off + channel_in; | |
bool incremented; | |
do { | |
bool in_range = true; | |
for (i = 0; i < num_axes; ++i) { | |
const int_tp d_iter_im = d_iter[i] * shared_dilation[i] + d_temp[i]; | |
in_range &= d_iter_im >= 0 && d_iter_im < shared_im_shape[i + 1]; | |
if (!in_range) { | |
break; | |
} | |
} | |
if (in_range) { | |
int_tp data_im_offset = d_iter[0] * shared_dilation[0]; | |
for (i = 1; i < num_axes; ++i) { | |
data_im_offset *= shared_im_shape[i + 1]; | |
data_im_offset += d_iter[i] * shared_dilation[i]; | |
} | |
*data_col_ptr = data_im_ptr[data_im_offset]; | |
} else { | |
*data_col_ptr = 0; | |
} | |
data_col_ptr += data_col_inc; | |
incremented = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
const int_tp d_max = shared_kernel_shape[i]; | |
if (d_iter[i] == d_max - 1) { | |
d_iter[i] = 0; | |
} else { // d_iter[i] < d_max - 1 | |
++d_iter[i]; | |
incremented = true; | |
break; | |
} | |
} // for (int_tp i = num_axes - 1; i >= 0; --i) | |
} while (incremented); // do | |
} | |
} | |
__kernel void TEMPLATE(col2im_nd, Dtype)(const int_tp n, const int_tp num_axes, | |
const int_tp channel_axis, | |
__global const Dtype* data_col, | |
const int_tp data_col_off, | |
__global const int_tp* im_shape, | |
__global const int_tp* col_shape, | |
__global const int_tp* kernel_shape, | |
__global const int_tp* pad, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global Dtype* data_im, | |
const int_tp data_im_off) { | |
int_tp d_im[6]; | |
int_tp d_col_iter[6]; | |
int_tp d_col_start[6]; | |
int_tp d_col_end[6]; | |
__global const int_tp* im_shape_ptr = im_shape + channel_axis; | |
__global const int_tp* col_shape_ptr = col_shape + channel_axis; | |
__local int_tp shared_dilation[6]; | |
__local int_tp shared_kernel_shape[6]; | |
__local int_tp shared_pad[6]; | |
__local int_tp shared_stride[6]; | |
__local int_tp shared_col_shape[6 + 1]; | |
__local int_tp shared_im_shape[6 + 1]; | |
for (int li = get_local_id(0); li < num_axes; li += get_local_size(0)) { | |
shared_dilation[li] = dilation[li]; | |
shared_kernel_shape[li] = kernel_shape[li]; | |
shared_pad[li] = pad[li]; | |
shared_stride[li] = stride[li]; | |
} | |
for (int li = get_local_id(0); li < num_axes + 1; li += get_local_size(0)) { | |
shared_col_shape[li] = col_shape_ptr[li]; | |
shared_im_shape[li] = im_shape_ptr[li]; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
// c_im starts as the flattened image index; the loop below peels off the
// spatial indices into d_im, leaving the image channel.
int_tp c_im = index; | |
// Calculate d_im (image dimensions). | |
for (int_tp i = num_axes - 1; i >= 0; --i) { | |
d_im[i] = c_im % shared_im_shape[i + 1] + shared_pad[i]; | |
c_im /= shared_im_shape[i + 1]; | |
} | |
// Calculate col start/end indices. | |
bool done = false; | |
for (int_tp i = 0; i < num_axes; ++i) { | |
const int_tp kernel_extent = shared_dilation[i] | |
* (shared_kernel_shape[i] - 1) + 1; | |
d_col_start[i] = d_col_iter[i] = | |
(d_im[i] < kernel_extent) ? | |
0 : (d_im[i] - kernel_extent) / shared_stride[i] + 1; | |
d_col_end[i] = min(d_im[i] / shared_stride[i] + 1, | |
shared_col_shape[i + 1]); | |
if (d_col_start[i] >= d_col_end[i]) { | |
// Skip computation if the dimension is 0 at any spatial axis -- | |
// final val will be 0. | |
data_im[index] = 0; | |
done = true; | |
break; // for (int_tp i = 0; i < num_axes; ++i) | |
} | |
} | |
if (!done) { | |
// Loop over the col to compute the output val. | |
Dtype val = 0; | |
bool incremented = true; | |
bool skip = false; | |
do { | |
// Compute the final offset. | |
int_tp final_offset = 0; | |
int_tp kernel_shape_prod = 1; | |
int_tp kernel_index; | |
for (int_tp i = num_axes - 1; i >= 0; --i) { | |
kernel_index = d_im[i] - d_col_iter[i] * shared_stride[i]; | |
if (kernel_index % shared_dilation[i]) { | |
skip = true; | |
break; | |
} else { | |
kernel_index /= shared_dilation[i]; | |
final_offset += kernel_index * kernel_shape_prod; | |
kernel_shape_prod *= shared_kernel_shape[i]; | |
} | |
} | |
if (!skip) { | |
final_offset += kernel_shape_prod * c_im; | |
for (int_tp i = 0; i < num_axes; ++i) { | |
final_offset *= shared_col_shape[i + 1]; | |
final_offset += d_col_iter[i]; | |
} | |
val += data_col[data_col_off + final_offset]; | |
} | |
skip = false; | |
incremented = false; | |
for (int_tp i = num_axes - 1; i >= 0; --i) { | |
const int_tp d_max = d_col_end[i]; | |
if (d_col_iter[i] == d_max - 1) { | |
d_col_iter[i] = d_col_start[i]; | |
} else { // d_col_iter[i] < d_max - 1 | |
++d_col_iter[i]; | |
incremented = true; | |
break; // for (int_tp i = num_axes - 1; i >= 0; --i) | |
} | |
} // for (int_tp i = num_axes - 1; i >= 0; --i) | |
} while (incremented); | |
data_im[data_im_off + index] = val; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(lrn_compute_output,Dtype)(const int_tp nthreads, | |
__global const Dtype* in, | |
__global const Dtype* scale, | |
const Dtype negative_beta, | |
__global Dtype* out) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
out[index] = in[index] * pow(scale[index], negative_beta); | |
} | |
} | |
__kernel void TEMPLATE(lrn_fill_scale,Dtype)(const int_tp nthreads, __global const Dtype* in, | |
const int_tp num, const int_tp channels, | |
const int_tp height, const int_tp width, const int_tp size, | |
const Dtype alpha_over_size, const Dtype k, | |
__global Dtype* const scale) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
// find out the local offset | |
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp n = index / width / height; | |
const int_tp offset = (n * channels * height + h) * width + w; | |
const int_tp step = height * width; | |
__global const Dtype* in_off = in + offset; | |
__global Dtype* scale_off = scale + offset; | |
int_tp head = 0; | |
const int_tp pre_pad = (size - 1) / 2; | |
const int_tp post_pad = size - pre_pad - 1; | |
Dtype accum_scale = 0; | |
// fill the scale at [n, :, h, w] | |
// accumulate values | |
while (head < post_pad && head < channels) { | |
accum_scale += in_off[head * step] * in_off[head * step]; | |
++head; | |
} | |
// both add and subtract | |
while (head < channels) { | |
accum_scale += in_off[head * step] * in_off[head * step]; | |
if (head - size >= 0) { | |
accum_scale -= in_off[(head - size) * step] | |
* in_off[(head - size) * step]; | |
} | |
scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; | |
++head; | |
} | |
// subtract only | |
while (head < channels + post_pad) { | |
if (head - size >= 0) { | |
accum_scale -= in_off[(head - size) * step] | |
* in_off[(head - size) * step]; | |
} | |
scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; | |
++head; | |
} | |
} | |
} | |
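// In effect, for each (n, h, w) this fills, along the channel axis c:
//   scale[c] = k + alpha_over_size * sum over the size-wide window of in^2
// using a sliding running sum (add the entering channel, subtract the leaving
// one) instead of recomputing the window per output.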
__kernel void TEMPLATE(lrn_compute_diff,Dtype)(const int_tp nthreads, | |
__global const Dtype* bottom_data, | |
__global const Dtype* top_data, | |
__global const Dtype* scale, | |
__global const Dtype* top_diff, const int_tp num, | |
const int_tp channels, const int_tp height, | |
const int_tp width, const int_tp size, | |
const Dtype negative_beta, | |
const Dtype cache_ratio, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
// find out the local offset | |
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp n = index / width / height; | |
const int_tp offset = (n * channels * height + h) * width + w; | |
const int_tp step = height * width; | |
__global const Dtype* bottom_off = bottom_data + offset; | |
__global const Dtype* top_off = top_data + offset; | |
__global const Dtype* scale_off = scale + offset; | |
__global const Dtype* top_diff_off = top_diff + offset; | |
__global Dtype* bottom_diff_off = bottom_diff + offset; | |
int_tp head = 0; | |
const int_tp pre_pad = size - (size + 1) / 2; | |
const int_tp post_pad = size - pre_pad - 1; | |
Dtype accum_ratio = 0; | |
// accumulate values | |
while (head < post_pad && head < channels) { | |
accum_ratio += top_diff_off[head * step] * top_off[head * step] | |
/ scale_off[head * step]; | |
++head; | |
} | |
// both add and subtract | |
while (head < channels) { | |
accum_ratio += top_diff_off[head * step] * top_off[head * step] | |
/ scale_off[head * step]; | |
if (head - size >= 0) { | |
accum_ratio -= top_diff_off[(head - size) * step] | |
* top_off[(head - size) * step] / scale_off[(head - size) * step]; | |
} | |
bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad) | |
* step] * pow(scale_off[(head - post_pad) * step], negative_beta) | |
- cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; | |
++head; | |
} | |
// subtract only | |
while (head < channels + post_pad) { | |
if (head - size >= 0) { | |
accum_ratio -= top_diff_off[(head - size) * step] | |
* top_off[(head - size) * step] / scale_off[(head - size) * step]; | |
} | |
bottom_diff_off[(head - post_pad) * step] = top_diff_off[(head - post_pad) | |
* step] * pow(scale_off[(head - post_pad) * step], negative_beta) | |
- cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; | |
++head; | |
} | |
} | |
} | |
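// The backward pass implements, per channel c with window W(c):
//   bottom_diff[c] = top_diff[c] * pow(scale[c], -beta)
//                    - cache_ratio * bottom[c] * sum_{c' in W(c)} top_diff[c'] * top[c'] / scale[c']
// where cache_ratio is precomputed on the host (presumably 2 * alpha * beta / size).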
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
inline Dtype TEMPLATE(lstm_sigmoid,Dtype)(const Dtype x) { | |
return (Dtype)1 / ((Dtype)1 + exp(-x)); | |
} | |
inline Dtype TEMPLATE(lstm_tanh,Dtype)(const Dtype x) { | |
return (Dtype)2 * TEMPLATE(lstm_sigmoid,Dtype)((Dtype)2 * x) - (Dtype)1; | |
} | |
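// Note: uses the identity tanh(x) = 2 * sigmoid(2x) - 1, so both activations
// share the single exp()-based sigmoid above.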
__kernel void TEMPLATE(lstm_acts_forward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* X, __global Dtype* X_acts) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp x_dim = 4 * dim; | |
const int_tp d = index % x_dim; | |
if (d < 3 * dim) { | |
X_acts[index] = TEMPLATE(lstm_sigmoid,Dtype)(X[index]); | |
} else { | |
X_acts[index] = TEMPLATE(lstm_tanh,Dtype)(X[index]); | |
} | |
} | |
} | |
__kernel void TEMPLATE(lstm_unit_forward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* C_prev, __global const Dtype* X, __global const Dtype* cont, | |
__global Dtype* C, __global Dtype* H) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp n = index / dim; | |
const int_tp d = index % dim; | |
__global const Dtype* X_offset = X + 4 * dim * n; | |
const Dtype i = X_offset[d]; | |
const Dtype f = X_offset[1 * dim + d]; | |
const Dtype o = X_offset[2 * dim + d]; | |
const Dtype g = X_offset[3 * dim + d]; | |
const Dtype c_prev = C_prev[index]; | |
const Dtype c = cont[n] * f * c_prev + i * g; | |
C[index] = c; | |
const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c); | |
H[index] = o * tanh_c; | |
} | |
} | |
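// Per element, with gates i, f, o and candidate g read from X:
//   c_t = cont * f * c_prev + i * g,   h_t = o * tanh(c_t)
// where cont[n] (0 or 1) gates the carried cell state across sequence boundaries.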
__kernel void TEMPLATE(lstm_unit_backward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* C_prev, __global const Dtype* X, __global const Dtype* C, __global const Dtype* H, | |
__global const Dtype* cont, __global const Dtype* C_diff, __global const Dtype* H_diff, | |
__global Dtype* C_prev_diff, __global Dtype* X_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp n = index / dim; | |
const int_tp d = index % dim; | |
__global const Dtype* X_offset = X + 4 * dim * n; | |
const Dtype i = X_offset[d]; | |
const Dtype f = X_offset[1 * dim + d]; | |
const Dtype o = X_offset[2 * dim + d]; | |
const Dtype g = X_offset[3 * dim + d]; | |
const Dtype c_prev = C_prev[index]; | |
const Dtype c = C[index]; | |
const Dtype tanh_c = TEMPLATE(lstm_tanh,Dtype)(c); | |
__global Dtype* c_prev_diff = C_prev_diff + index; | |
__global Dtype* X_diff_offset = X_diff + 4 * dim * n; | |
__global Dtype* i_diff = X_diff_offset + d; | |
__global Dtype* f_diff = X_diff_offset + 1 * dim + d; | |
__global Dtype* o_diff = X_diff_offset + 2 * dim + d; | |
__global Dtype* g_diff = X_diff_offset + 3 * dim + d; | |
const Dtype c_term_diff = | |
C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); | |
const Dtype cont_n = cont[n]; | |
*c_prev_diff = cont_n * c_term_diff * f; | |
*i_diff = c_term_diff * g; | |
*f_diff = cont_n * c_term_diff * c_prev; | |
*o_diff = H_diff[index] * tanh_c; | |
*g_diff = c_term_diff * i; | |
} | |
} | |
__kernel void TEMPLATE(lstm_acts_backward,Dtype)(const int_tp nthreads, const int_tp dim, | |
__global const Dtype* X_acts, __global const Dtype* X_acts_diff, __global Dtype* X_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp x_dim = 4 * dim; | |
const int_tp d = index % x_dim; | |
const Dtype X_act = X_acts[index]; | |
if (d < 3 * dim) { | |
X_diff[index] = X_acts_diff[index] * X_act * ((Dtype)1 - X_act); | |
} else { | |
X_diff[index] = X_acts_diff[index] * ((Dtype)1 - X_act * X_act); | |
} | |
} | |
} | |
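// Note: the two branches apply the gate derivative identities
// sigmoid'(x) = s * (1 - s) for the first 3 * dim entries (gates i, f, o) and
// tanh'(x) = 1 - t * t for the last dim entries (candidate g).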
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(mul,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, | |
__global Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = a[index + offa] * b[index + offb]; | |
} | |
} | |
__kernel void TEMPLATE(div,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, | |
__global Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = a[index + offa] / b[index + offb]; | |
} | |
} | |
__kernel void TEMPLATE(add_scalar,Dtype)(const int_tp N, const Dtype alpha, | |
__global Dtype* Y, | |
const int_tp offY) { | |
for (int_tp index = get_global_id(0); index < N; index += get_global_size(0)) { | |
Y[offY + index] += alpha; | |
} | |
} | |
__kernel void TEMPLATE(add,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global const Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = a[offa + index] + b[offb + index]; | |
} | |
} | |
__kernel void TEMPLATE(sub,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global const Dtype* b, | |
const int_tp offb, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = a[offa + index] - b[offb + index]; | |
} | |
} | |
__kernel void TEMPLATE(abs,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = fabs((Dtype)(a[offa + index])); | |
} | |
} | |
__kernel void TEMPLATE(exp,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = exp(a[offa + index]); | |
} | |
} | |
__kernel void TEMPLATE(log,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[offy + index] = log((Dtype)(a[offa + index])); | |
} | |
} | |
__kernel void TEMPLATE(powx,Dtype)(const int_tp n, __global const Dtype* a, | |
const int_tp offa, Dtype alpha, | |
__global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
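// Note: for alpha == 2 the base is passed through fabs() first; since
// x^2 == |x|^2 this is mathematically equivalent, presumably a workaround for
// pow() with negative bases on some OpenCL implementations.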
if(alpha == 2.0) { | |
y[offy + index] = pow((Dtype)fabs(a[offa + index]), (Dtype)alpha); | |
} else { | |
y[offy + index] = pow((Dtype)a[offa + index], (Dtype)alpha); | |
} | |
} | |
} | |
__kernel void TEMPLATE(sign,Dtype)(const int_tp n, __global const Dtype* x, | |
const int_tp offx, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = (0.0 < x[index + offx]) | |
- (x[index + offx] < 0.0); | |
} | |
} | |
__kernel void TEMPLATE(sgnbit,Dtype)(const int_tp n, __global const Dtype* x, | |
const int_tp offx, __global Dtype* y, | |
const int_tp offy) { | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
y[index + offy] = signbit(x[index + offx]); | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(merge_copy_forward_stack, Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global const Dtype* bottom_a, | |
const int_tp forward_a, | |
__global const Dtype* bottom_b, | |
const int_tp forward_b, | |
__global Dtype* top, | |
const int_tp num, | |
const int_tp channels_a, | |
const int_tp channels_b, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
int_tp batch_id = index / ((channels_a + channels_b) * size_a); | |
int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a) | |
/ (channels_a * size_a)) % 2; | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
if (bottom_id == 0) { | |
int_tp channel_id = (index / size_a) % channels_a; | |
int_tp aidx = batch_id * channels_a + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
top[index] = (forward_a == 1) ? bottom_a[aidx] : 0; | |
} else { | |
int_tp channel_id = (index / size_a) % channels_b; | |
int_tp bidx = (batch_id * channels_b + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
top[index] = (forward_b == 1) ? bottom_b[bidx] : 0; | |
} | |
} | |
} | |
__kernel void TEMPLATE(merge_copy_backward_stack,Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global Dtype* bottom_a, | |
const int_tp backward_a, | |
__global Dtype* bottom_b, | |
const int_tp backward_b, | |
__global const Dtype* top, | |
const int_tp num, | |
const int_tp channels_a, | |
const int_tp channels_b, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp batch_id = index / ((channels_a + channels_b) * size_a); | |
int_tp bottom_id = ((index - batch_id * (channels_a + channels_b) * size_a) | |
/ (channels_a * size_a)) % 2; | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
if (bottom_id == 0) { | |
int_tp channel_id = (index / size_a) % channels_a; | |
int_tp aidx = batch_id * channels_a + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
bottom_a[aidx] = (backward_a == 1) ? top[index] : 0; | |
} else { | |
int_tp channel_id = (index / size_a) % channels_b; | |
int_tp bidx = (batch_id * channels_b + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
bottom_b[bidx] = (backward_b == 1) ? top[index] : 0; | |
} | |
} | |
} | |
__kernel void TEMPLATE(merge_copy_forward_add, Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global const Dtype* bottom_a, | |
const int_tp forward_a, | |
__global const Dtype* bottom_b, | |
const int_tp forward_b, | |
__global Dtype* top, | |
const int_tp num, | |
const int_tp channels, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp batch_id = index / (channels * size_a); | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
top[index] = 0; | |
int_tp channel_id = (index / size_a) % channels; | |
int_tp aidx = batch_id * channels + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
top[index] = forward_a ? top[index] + bottom_a[aidx] : top[index]; | |
int_tp bidx = (batch_id * channels + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
top[index] = forward_b ? top[index] + bottom_b[bidx] : top[index]; | |
} | |
} | |
__kernel void TEMPLATE(merge_copy_backward_add,Dtype)(const int_tp nthreads, | |
const int_tp dims, | |
__global Dtype* bottom_a, | |
const int_tp backward_a, | |
__global Dtype* bottom_b, | |
const int_tp backward_b, | |
__global const Dtype* top, | |
const int_tp num, | |
const int_tp channels, | |
__global const int_tp* shape_a, | |
__global const int_tp* shape_b) { | |
int_tp pad[6]; | |
int_tp tmp_idx[6]; | |
int_tp size_a = 1; | |
int_tp size_b = 1; | |
for (int_tp i = 0; i < dims; ++i) { | |
pad[i] = (shape_b[i] - shape_a[i]) / 2; | |
size_a *= shape_a[i]; | |
size_b *= shape_b[i]; | |
} | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp batch_id = index / (channels * size_a); | |
int_tp counter = index; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
tmp_idx[i] = counter % shape_a[i]; | |
counter /= shape_a[i]; | |
} | |
int_tp channel_id = (index / size_a) % channels; | |
int_tp aidx = batch_id * channels + channel_id; | |
for (int_tp i = 0; i < dims; ++i) { | |
aidx *= shape_a[i]; | |
aidx += tmp_idx[i]; | |
} | |
bottom_a[aidx] = backward_a ? top[index] : 0; | |
int_tp bidx = (batch_id * channels + channel_id) * size_b; | |
int_tp btemp = 1; | |
for (int_tp i = dims - 1; i >= 0; --i) { | |
bidx += btemp * (tmp_idx[i] + pad[i]); | |
btemp *= shape_b[i]; | |
} | |
bottom_b[bidx] = backward_b ? top[index] : 0; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
__kernel void TEMPLATE(max_pool_forward,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* top_data, | |
const int use_mask, __global int_tp* mask, __global Dtype* top_mask) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
const int_tp hend = min(hstart + kernel_h, height); | |
const int_tp wend = min(wstart + kernel_w, width); | |
hstart = max(hstart, (int_tp)0); | |
wstart = max(wstart, (int_tp)0); | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
if (bottom_slice[h * width + w] > maxval) { | |
maxidx = h * width + w; | |
maxval = bottom_slice[maxidx]; | |
} | |
} | |
} | |
top_data[index] = maxval; | |
if (use_mask == 1) { | |
mask[index] = maxidx; | |
} else { | |
top_mask[index] = maxidx; | |
} | |
} | |
} | |
__kernel void TEMPLATE(ave_pool_forward,Dtype)( | |
const int_tp nthreads, __global const Dtype* const bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, const int_tp pad_h, | |
const int_tp pad_w, __global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
{ | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + kernel_h, height + pad_h); | |
int_tp wend = min(wstart + kernel_w, width + pad_w); | |
const int_tp pool_size = (hend - hstart) * (wend - wstart); | |
hstart = max(hstart, (int_tp)0); | |
wstart = max(wstart, (int_tp)0); | |
hend = min(hend, height); | |
wend = min(wend, width); | |
Dtype aveval = 0; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
aveval += bottom_slice[h * width + w]; | |
} | |
} | |
top_data[index] = aveval / pool_size; | |
} | |
} | |
} | |
__kernel void TEMPLATE(sto_pool_forward_train,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, | |
__global Dtype* rand_idx, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
const int_tp hstart = ph * stride_h; | |
const int_tp hend = min(hstart + kernel_h, height); | |
const int_tp wstart = pw * stride_w; | |
const int_tp wend = min(wstart + kernel_w, width); | |
Dtype cumsum = 0.; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
cumsum += bottom_slice[h * width + w]; | |
} | |
} | |
const float thres = rand_idx[index] * cumsum; | |
// Second pass: get value, and set index. | |
cumsum = 0; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
cumsum += bottom_slice[h * width + w]; | |
if (cumsum >= thres) { | |
rand_idx[index] = ((n * channels + c) * height + h) * width + w; | |
top_data[index] = bottom_slice[h * width + w]; | |
h = hend; | |
w = wend; | |
} | |
} | |
} | |
} | |
} | |
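// Stochastic pooling (training): each output samples one input of its window
// with probability proportional to the activation -- rand_idx in [0, 1) scales
// the window sum into a threshold, the first position whose running sum crosses
// it wins (inverse-CDF sampling), and its index is stored back into rand_idx
// for the backward pass.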
__kernel void TEMPLATE(sto_pool_forward_test,Dtype)( | |
const int_tp nthreads, __global const Dtype* const bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, const int_tp kernel_h, | |
const int_tp kernel_w, const int_tp stride_h, const int_tp stride_w, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp pw = index % pooled_width; | |
const int_tp ph = (index / pooled_width) % pooled_height; | |
const int_tp c = (index / pooled_width / pooled_height) % channels; | |
const int_tp n = index / pooled_width / pooled_height / channels; | |
const int_tp hstart = ph * stride_h; | |
const int_tp hend = min(hstart + kernel_h, height); | |
const int_tp wstart = pw * stride_w; | |
const int_tp wend = min(wstart + kernel_w, width); | |
    // Seed cumsum with FLT_MIN (rather than 0) to avoid divide-by-zero
Dtype cumsum = FLT_MIN; | |
Dtype cumvalues = 0.; | |
__global const Dtype* bottom_slice = bottom_data | |
+ (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
cumsum += bottom_slice[h * width + w]; | |
cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; | |
} | |
} | |
top_data[index] = cumvalues / cumsum; | |
} | |
} | |
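// Max pooling backward: each bottom element accumulates top_diff from every
// pooled output whose recorded argmax (mask, or top_mask when use_mask != 1)
// points back at it.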
__kernel void TEMPLATE(max_pool_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int use_mask, | |
__global const int_tp* mask, | |
__global const Dtype* top_mask, | |
const int_tp num, | |
const int_tp channels, | |
const int_tp height, | |
const int_tp width, | |
const int_tp pooled_height, | |
const int_tp pooled_width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, | |
const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
    // Recover the local (n, c, h, w) coordinates of this bottom element.
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp c = (index / width / height) % channels; | |
const int_tp n = index / width / height / channels; | |
const int_tp phstart = | |
(h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; | |
const int_tp phend = min((h + pad_h) / stride_h + 1, pooled_height); | |
const int_tp pwstart = | |
(w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; | |
const int_tp pwend = min((w + pad_w) / stride_w + 1, pooled_width); | |
Dtype gradient = 0; | |
const int_tp offset = (n * channels + c) * pooled_height * pooled_width; | |
__global const Dtype* top_diff_slice = top_diff + offset; | |
if (use_mask == 1) { | |
__global const int_tp* mask_slice = mask + offset; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
if (mask_slice[ph * pooled_width + pw] == h * width + w) { | |
gradient += top_diff_slice[ph * pooled_width + pw]; | |
} | |
} | |
} | |
} else { | |
__global const Dtype* top_mask_slice = top_mask + offset; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
if (top_mask_slice[ph * pooled_width + pw] == h * width + w) { | |
gradient += top_diff_slice[ph * pooled_width + pw]; | |
} | |
} | |
} | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
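// Average pooling backward: each bottom element receives top_diff / pool_size
// from every pooled window that covers it, mirroring the forward averaging.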
__kernel void TEMPLATE(ave_pool_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int_tp num, | |
const int_tp channels, | |
const int_tp height, | |
const int_tp width, | |
const int_tp pooled_height, | |
const int_tp pooled_width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, | |
const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
    // Recover the local (n, c, h, w) coordinates of this bottom element.
const int_tp w = index % width + pad_w; | |
const int_tp h = (index / width) % height + pad_h; | |
const int_tp c = (index / width / height) % channels; | |
const int_tp n = index / width / height / channels; | |
const int_tp phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; | |
const int_tp phend = min(h / stride_h + 1, pooled_height); | |
const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; | |
const int_tp pwend = min(w / stride_w + 1, pooled_width); | |
Dtype gradient = 0.0; | |
__global const Dtype* const top_diff_slice = top_diff | |
+ (n * channels + c) * pooled_height * pooled_width; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
// figure out the pooling size | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + kernel_h, height + pad_h); | |
int_tp wend = min(wstart + kernel_w, width + pad_w); | |
int_tp pool_size = (hend - hstart) * (wend - wstart); | |
gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; | |
} | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
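// Stochastic pooling backward: the gradient flows only to the element whose
// flat index was recorded in rand_idx during the training forward pass.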
__kernel void TEMPLATE(sto_pool_backward,Dtype)( | |
const int_tp nthreads, __global const Dtype* rand_idx, | |
__global const Dtype* const top_diff, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp stride_h, | |
const int_tp stride_w, __global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
    // Recover the local (n, c, h, w) coordinates of this bottom element.
const int_tp w = index % width; | |
const int_tp h = (index / width) % height; | |
const int_tp c = (index / width / height) % channels; | |
const int_tp n = index / width / height / channels; | |
const int_tp phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; | |
const int_tp phend = min(h / stride_h + 1, pooled_height); | |
const int_tp pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; | |
const int_tp pwend = min(w / stride_w + 1, pooled_width); | |
Dtype gradient = 0.0; | |
__global const Dtype* rand_idx_slice = rand_idx | |
+ (n * channels + c) * pooled_height * pooled_width; | |
__global const Dtype* top_diff_slice = top_diff | |
+ (n * channels + c) * pooled_height * pooled_width; | |
for (int_tp ph = phstart; ph < phend; ++ph) { | |
for (int_tp pw = pwstart; pw < pwend; ++pw) { | |
gradient += top_diff_slice[ph * pooled_width + pw] | |
* (index == (int_tp) (rand_idx_slice[ph * pooled_width + pw])?1.0:0.0); | |
} | |
} | |
bottom_diff[index] = gradient; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
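// N-dimensional max pooling with dilation. The fixed-size d_* scratch arrays
// cap the number of spatial axes at 6; shape, stride, dilation and pad are
// passed as per-axis arrays instead of scalar h/w pairs.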
__kernel void TEMPLATE(max_pool_forward_nd, Dtype)(const int_tp n, | |
const int_tp num_axes, | |
__global const Dtype* bottom_data, | |
const int_tp channels, | |
__global const int_tp* size, | |
__global const int_tp* pooled_size, | |
__global const int_tp* kernel_size, | |
__global const int_tp* ext_kernel_size, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global const int_tp* pad, | |
__global Dtype* top_data, | |
const int use_mask, | |
__global int_tp* mask, __global Dtype* top_mask) { | |
int_tp d_idx[6]; | |
int_tp d_start[6]; | |
int_tp d_end[6]; | |
int_tp d_iter[6]; | |
int_tp i; | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
int_tp offset = 1; | |
int_tp num = index; | |
bool do_continue = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
d_idx[i] = num % pooled_size[i]; | |
d_start[i] = d_idx[i] * stride[i] - pad[i]; | |
d_end[i] = min(d_start[i] + ext_kernel_size[i], size[i]); | |
d_start[i] = max(d_start[i], (int_tp)0); | |
num /= pooled_size[i]; | |
offset *= size[i]; | |
d_iter[i] = d_start[i]; | |
if (d_start[i] >= d_end[i]) { | |
top_data[index] = -FLT_MAX; | |
if (use_mask) { | |
mask[index] = -1; | |
} else { | |
top_mask[index] = -1; | |
} | |
do_continue = true; | |
} | |
} | |
    if (do_continue) {
continue; | |
} | |
int_tp chan = num % channels; | |
num /= channels; | |
offset *= (num * channels + chan); | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
int_tp final_offset = 0; | |
bool incremented; | |
do { | |
final_offset = offset; | |
int_tp size_prod = 1; | |
for (i = num_axes - 1; i >= 0; --i) { | |
final_offset += d_iter[i] * size_prod; | |
size_prod *= size[i]; | |
} | |
if (bottom_data[final_offset] > maxval) { | |
maxidx = final_offset; | |
maxval = bottom_data[maxidx]; | |
} | |
incremented = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
if (d_iter[i] >= d_end[i] - dilation[i]) { | |
d_iter[i] = d_start[i]; | |
} else { | |
d_iter[i] += dilation[i]; | |
incremented = true; | |
break; | |
} | |
} | |
} while (incremented); | |
top_data[index] = maxval; | |
if (use_mask == 1) { | |
mask[index] = maxidx; | |
} else { | |
top_mask[index] = maxidx; | |
} | |
} | |
} | |
__kernel void TEMPLATE(max_pool_backward_nd, Dtype)(const int_tp n, | |
const int_tp num_axes, | |
__global const Dtype* top_diff, | |
const int use_mask, | |
__global const int_tp* mask, | |
__global const Dtype* top_mask, | |
const int_tp channels, | |
__global const int_tp* size, | |
__global const int_tp* pooled_size, | |
__global const int_tp* kernel_size, | |
__global const int_tp* ext_kernel_size, | |
__global const int_tp* stride, | |
__global const int_tp* dilation, | |
__global const int_tp* pad, | |
__global Dtype* bottom_diff) { | |
int_tp d_idx[6]; | |
int_tp d_start[6]; | |
int_tp d_end[6]; | |
int_tp d_iter[6]; | |
int_tp i; | |
for (int_tp index = get_global_id(0); index < n; index += get_global_size(0)) { | |
    // Recover the per-axis coordinates of this bottom element.
int_tp offset = 1; | |
    int_tp num = index;
    bool do_continue = false;
for (i = num_axes - 1; i >= 0; --i) { | |
d_idx[i] = num % size[i]; | |
if (dilation[i] > 1) { | |
d_start[i] = | |
(d_idx[i] < ext_kernel_size[i]) ? | |
d_idx[i] % dilation[i] : (d_idx[i] - ext_kernel_size[i]) + 1; | |
d_end[i] = | |
(d_idx[i] >= pooled_size[i]) ? | |
(pooled_size[i] - 1) | |
- (pooled_size[i] - 1 - d_start[i]) % dilation[i] : | |
d_idx[i]; | |
} else { | |
d_start[i] = | |
(d_idx[i] + pad[i] < kernel_size[i]) ? | |
0 : (d_idx[i] + pad[i] - kernel_size[i]) / stride[i] + 1; | |
d_end[i] = min((int_tp) ((d_idx[i] + pad[i]) / stride[i] + 1), | |
(int_tp) (pooled_size[i])); | |
} | |
num /= size[i]; | |
offset *= pooled_size[i]; | |
d_iter[i] = d_start[i]; | |
      if (d_start[i] > d_end[i]) {
        do_continue = true;
      }
    }
    // Skip this index (its gradient is 0) without abandoning the grid-stride
    // loop; an early return here would drop the remaining indices assigned
    // to this work-item.
    if (do_continue) {
      bottom_diff[index] = 0;
      continue;
    }
int_tp chan = num % channels; | |
num /= channels; | |
offset *= (num * channels + chan); | |
Dtype gradient = 0; | |
int_tp final_offset = 0; | |
int_tp im_offset = 0; | |
bool incremented; | |
do { | |
final_offset = offset; | |
im_offset = 0; | |
int_tp size_prod = 1; | |
int_tp pooled_size_prod = 1; | |
for (i = num_axes - 1; i >= 0; --i) { | |
final_offset += d_iter[i] * pooled_size_prod; | |
im_offset += d_idx[i] * size_prod; | |
size_prod *= size[i]; | |
pooled_size_prod *= pooled_size[i]; | |
} | |
if (use_mask) { | |
if (mask[final_offset] == im_offset) { | |
gradient += top_diff[final_offset]; | |
} | |
} else { | |
if (top_mask[final_offset] == im_offset) { | |
gradient += top_diff[final_offset]; | |
} | |
} | |
incremented = false; | |
for (i = num_axes - 1; i >= 0; --i) { | |
if (d_iter[i] > d_end[i] - dilation[i]) { | |
d_iter[i] = d_start[i]; | |
} else { | |
d_iter[i] += dilation[i]; | |
incremented = true; | |
break; | |
} | |
} | |
} while (incremented); | |
bottom_diff[index] = gradient; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
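// The *_sk kernels are the strided-kernel (dilated) pooling variants: each
// window spans ext_kernel_h x ext_kernel_w input positions but is sampled
// only every dilation_h / dilation_w elements.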
__kernel void TEMPLATE(max_pool_forward_sk,Dtype)(const int_tp nthreads, | |
__global Dtype* bottom_data, | |
const int_tp num, | |
const int_tp channels, | |
const int_tp height, | |
const int_tp width, | |
const int_tp pooled_height, | |
const int_tp pooled_width, | |
const int_tp kernel_h, | |
const int_tp kernel_w, | |
const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, | |
const int_tp stride_h, | |
const int_tp stride_w, | |
const int_tp dilation_h, | |
const int_tp dilation_w, | |
const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* top_data, | |
const int use_mask, | |
__global int_tp* mask, | |
__global Dtype* top_mask) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + ext_kernel_h, height); | |
int_tp wend = min(wstart + ext_kernel_w, width); | |
hstart = max(hstart, (int_tp) 0); | |
wstart = max(wstart, (int_tp) 0); | |
Dtype maxval = -FLT_MAX; | |
int_tp maxidx = -1; | |
__global Dtype* bottom_data_ptr = bottom_data | |
+ (n * channels + c) * height * width; | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
if (bottom_data_ptr[h * width + w] > maxval) { | |
maxidx = h * width + w; | |
maxval = bottom_data_ptr[maxidx]; | |
} | |
} | |
} | |
top_data[index] = maxval; | |
if (use_mask == 1) { | |
mask[index] = maxidx; | |
} else { | |
top_mask[index] = maxidx; | |
} | |
} | |
} | |
__kernel void TEMPLATE(max_pool_backward_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* top_diff, const int use_mask, | |
__global const int_tp* mask, __global const Dtype* top_mask, | |
const int_tp num, const int_tp channels, const int_tp height, | |
const int_tp width, const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
__global const int_tp* mask_ptr = mask; | |
__global const Dtype* top_diff_ptr = top_diff; | |
    // Recover the local (n, c, h, w) coordinates of this bottom element.
int_tp w = index % width; | |
int_tp h = (index / width) % height; | |
int_tp c = (index / width / height) % channels; | |
int_tp n = index / width / height / channels; | |
int_tp pooled_height_1 = pooled_height - 1; | |
int_tp pooled_width_1 = pooled_width - 1; | |
int_tp phstart = | |
(h < ext_kernel_h) ? h % dilation_h : (h - ext_kernel_h) + 1; | |
int_tp phend = | |
(h >= pooled_height) ? | |
pooled_height_1 - (pooled_height_1 - phstart) % dilation_h : h; | |
int_tp pwstart = | |
(w < ext_kernel_w) ? w % dilation_w : (w - ext_kernel_w) + 1; | |
int_tp pwend = | |
(w >= pooled_width) ? | |
pooled_width_1 - (pooled_width_1 - pwstart) % dilation_w : w; | |
Dtype gradient = 0; | |
int_tp offset = (n * channels + c) * pooled_height * pooled_width; | |
top_diff_ptr += offset; | |
if (use_mask == 1) { | |
mask_ptr += offset; | |
for (int_tp ph = phstart; ph <= phend; ph += dilation_h) { | |
for (int_tp pw = pwstart; pw <= pwend; pw += dilation_w) { | |
if (mask_ptr[ph * pooled_width + pw] == h * width + w) { | |
gradient += top_diff_ptr[ph * pooled_width + pw]; | |
} | |
} | |
} | |
    } else {
      // Offset top_mask to this (n, c) slice, mirroring the mask branch above
      // and the 2D max_pool_backward kernel.
      __global const Dtype* top_mask_ptr = top_mask + offset;
      for (int_tp ph = phstart; ph <= phend; ph += dilation_h) {
        for (int_tp pw = pwstart; pw <= pwend; pw += dilation_w) {
          if (top_mask_ptr[ph * pooled_width + pw] == h * width + w) {
            gradient += top_diff_ptr[ph * pooled_width + pw];
          }
        }
      }
    }
bottom_diff[index] = gradient; | |
} | |
} | |
__kernel void TEMPLATE(ave_pool_forward_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, const int_tp pad_h, | |
const int_tp pad_w, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h - pad_h; | |
int_tp wstart = pw * stride_w - pad_w; | |
int_tp hend = min(hstart + ext_kernel_h, height + pad_h); | |
int_tp wend = min(wstart + ext_kernel_w, width + pad_w); | |
hstart = max(hstart, (int_tp)0); | |
wstart = max(wstart, (int_tp)0); | |
hend = min(hend, height); | |
wend = min(wend, width); | |
Dtype aveval = 0; | |
__global const Dtype* bottom_data_ptr = bottom_data; | |
bottom_data_ptr += (n * channels + c) * height * width; | |
int_tp pool_size = 0; | |
for (int_tp h = hstart; h < hend; ++h) { | |
for (int_tp w = wstart; w < wend; ++w) { | |
aveval += bottom_data_ptr[h * width + w]; | |
++pool_size; | |
} | |
} | |
top_data[index] = aveval / pool_size; | |
} | |
} | |
__kernel void TEMPLATE(sto_pool_forward_train_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, __global Dtype* rand_idx, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h; | |
int_tp hend = min(hstart + ext_kernel_h, height); | |
int_tp wstart = pw * stride_w; | |
int_tp wend = min(wstart + ext_kernel_w, width); | |
Dtype cumsum = 0.; | |
__global const Dtype* bottom_data_ptr = bottom_data; | |
bottom_data_ptr += (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
cumsum += bottom_data_ptr[h * width + w]; | |
} | |
} | |
float thres = rand_idx[index] * cumsum; | |
// Second pass: get value, and set index. | |
cumsum = 0; | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
cumsum += bottom_data_ptr[h * width + w]; | |
if (cumsum >= thres) { | |
rand_idx[index] = ((n * channels + c) * height + h) * width + w; | |
top_data[index] = bottom_data_ptr[h * width + w]; | |
h = hend; | |
w = wend; | |
} | |
} | |
} | |
} | |
} | |
__kernel void TEMPLATE(sto_pool_forward_test_sk,Dtype)( | |
const int_tp nthreads, __global const Dtype* bottom_data, const int_tp num, | |
const int_tp channels, const int_tp height, const int_tp width, | |
const int_tp pooled_height, const int_tp pooled_width, | |
const int_tp kernel_h, const int_tp kernel_w, const int_tp ext_kernel_h, | |
const int_tp ext_kernel_w, const int_tp stride_h, const int_tp stride_w, | |
const int_tp dilation_h, const int_tp dilation_w, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
int_tp pw = index % pooled_width; | |
int_tp ph = (index / pooled_width) % pooled_height; | |
int_tp c = (index / pooled_width / pooled_height) % channels; | |
int_tp n = index / pooled_width / pooled_height / channels; | |
int_tp hstart = ph * stride_h; | |
int_tp hend = min(hstart + ext_kernel_h, height); | |
int_tp wstart = pw * stride_w; | |
int_tp wend = min(wstart + ext_kernel_w, width); | |
    // Seed cumsum with FLT_MIN (rather than 0) to avoid divide-by-zero
Dtype cumsum = FLT_MIN; | |
Dtype cumvalues = 0.; | |
__global const Dtype* bottom_data_ptr = bottom_data; | |
bottom_data_ptr += (n * channels + c) * height * width; | |
// First pass: get sum | |
for (int_tp h = hstart; h < hend; h += dilation_h) { | |
for (int_tp w = wstart; w < wend; w += dilation_w) { | |
cumsum += bottom_data_ptr[h * width + w]; | |
cumvalues += bottom_data_ptr[h * width + w] | |
* bottom_data_ptr[h * width + w]; | |
} | |
} | |
top_data[index] = cumvalues / cumsum; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
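// Slice: with forward == 1, copies one bottom slice along the slice axis into
// a top blob; otherwise scatters top gradients back into the bottom blob.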
__kernel void TEMPLATE(slice,Dtype)(const int_tp nthreads, | |
__global const Dtype* in_data, | |
const int forward, const int_tp num_slices, | |
const int_tp slice_size, | |
const int_tp bottom_slice_axis, | |
const int_tp top_slice_axis, | |
const int_tp offset_slice_axis, | |
__global Dtype* out_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp total_slice_size = slice_size * top_slice_axis; | |
const int_tp slice_num = index / total_slice_size; | |
const int_tp slice_index = index % total_slice_size; | |
const int_tp bottom_index = slice_index | |
+ (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size; | |
if (forward == 1) { | |
out_data[index] = in_data[bottom_index]; | |
} else { | |
out_data[bottom_index] = in_data[index]; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
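// Softmax loss forward: per-position negative log-likelihood,
// loss = -log(max(prob[label], FLT_MIN)); counts records how many positions
// contribute (ignored labels yield 0 loss and 0 count) for normalization.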
__kernel void TEMPLATE(softmax_loss_forward,Dtype)( | |
int_tp n, __global const Dtype* prob_data, __global const Dtype* label, | |
__global Dtype* loss, | |
const int_tp num, const int_tp dim, const int_tp spatial_dim, | |
const int has_ignore_label_, const int_tp ignore_label_, | |
__global Dtype* counts) { | |
for (int_tp index = get_global_id(0); index < n; | |
index += get_global_size(0)) { | |
    const int_tp n_idx = index / spatial_dim;  // Avoid shadowing parameter n.
    const int_tp s = index % spatial_dim;
    const int_tp label_value = (int_tp) (label[n_idx * spatial_dim + s]);
    if (has_ignore_label_ == 1 && label_value == ignore_label_) {
      loss[index] = 0;
      counts[index] = 0;
    } else {
      loss[index] = -log((Dtype)(
          max((Dtype) (prob_data[n_idx * dim + label_value * spatial_dim + s]),
              (Dtype) FLT_MIN)));
counts[index] = 1; | |
} | |
} | |
} | |
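// Softmax loss backward: expects the host to have pre-filled bottom_diff with
// the softmax probabilities (as Caffe does before launching this kernel), so
// subtracting 1 at the label channel yields d(loss)/d(input) = prob - 1 at
// the true class.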
__kernel void TEMPLATE(softmax_loss_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top, | |
__global const Dtype* label, | |
__global Dtype* bottom_diff, | |
const int_tp num, | |
const int_tp dim, | |
const int_tp spatial_dim, | |
const int has_ignore_label_, | |
const int_tp ignore_label_, | |
__global Dtype* counts) { | |
const int_tp channels = dim / spatial_dim; | |
for (int_tp index = get_global_id(0); index < nthreads; index += | |
get_global_size(0)) { | |
const int_tp n = index / spatial_dim; | |
const int_tp s = index % spatial_dim; | |
const int_tp label_value = (int_tp) (label[n * spatial_dim + s]); | |
if (has_ignore_label_ == 1 && label_value == ignore_label_) { | |
for (int_tp c = 0; c < channels; ++c) { | |
bottom_diff[n * dim + c * spatial_dim + s] = 0; | |
} | |
counts[index] = 0; | |
} else { | |
bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; | |
counts[index] = 1; | |
} | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
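// Solver update kernels. Common convention: on entry g holds the raw
// gradient, on exit the final step the host applies to the weights; h, h2,
// m and v are persistent per-parameter history buffers. AdaDelta below keeps
// h = E[g^2] and h2 = E[dx^2] and scales the gradient by
// sqrt((h2 + delta) / (h + delta)) before applying the learning rate.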
__kernel void TEMPLATE(ada_delta_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
__global Dtype* h2, | |
Dtype momentum, | |
Dtype delta, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype hi = h[i] = momentum * h[i] + (1.0 - momentum) * gi * gi; | |
gi = gi * sqrt((h2[i] + delta) / (hi + delta)); | |
h2[i] = momentum * h2[i] + (1.0 - momentum) * gi * gi; | |
g[i] = local_rate * gi; | |
} | |
} | |
__kernel void TEMPLATE(ada_grad_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype delta, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype hi = h[i] = h[i] + gi * gi; | |
g[i] = local_rate * gi / (sqrt(hi) + delta); | |
} | |
} | |
__kernel void TEMPLATE(adam_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* m, | |
__global Dtype* v, | |
Dtype beta1, | |
Dtype beta2, | |
Dtype eps_hat, | |
Dtype corrected_local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype mi = m[i] = m[i] * beta1 + gi * (1 - beta1); | |
Dtype vi = v[i] = v[i] * beta2 + gi * gi * (1 - beta2); | |
g[i] = corrected_local_rate * mi / (sqrt(vi) + eps_hat); | |
} | |
} | |
__kernel void TEMPLATE(nesterov_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype momentum, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype hi = h[i]; | |
Dtype hi_new = h[i] = momentum * hi + local_rate * g[i]; | |
g[i] = (1 + momentum) * hi_new - momentum * hi; | |
} | |
} | |
__kernel void TEMPLATE(rms_prop_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype rms_decay, | |
Dtype delta, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
Dtype gi = g[i]; | |
Dtype hi = h[i] = rms_decay * h[i] + (1 - rms_decay) * gi * gi; | |
g[i] = local_rate * g[i] / (sqrt(hi) + delta); | |
} | |
} | |
__kernel void TEMPLATE(sgd_update,Dtype)(int_tp N, __global Dtype* g, | |
__global Dtype* h, | |
Dtype momentum, | |
Dtype local_rate) { | |
for (int_tp i = get_global_id(0); i < N; i += get_global_size(0)) { | |
g[i] = h[i] = momentum * h[i] + local_rate * g[i]; | |
} | |
} | |
#ifndef __OPENCL_VERSION__ | |
#include "header.cl" | |
#endif | |
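// Tile: forward replicates the bottom blob num_tiles times along the tile
// axis; tile_backward sums the gradient over all copies of each element.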
__kernel void TEMPLATE(tile,Dtype)(const int_tp nthreads, __global const Dtype* bottom_data, | |
const int_tp tile_size, const int_tp num_tiles, | |
const int_tp bottom_tile_axis, | |
__global Dtype* top_data) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp d = index % tile_size; | |
const int_tp b = (index / tile_size / num_tiles) % bottom_tile_axis; | |
const int_tp n = index / tile_size / num_tiles / bottom_tile_axis; | |
const int_tp bottom_index = (n * bottom_tile_axis + b) * tile_size + d; | |
top_data[index] = bottom_data[bottom_index]; | |
} | |
} | |
__kernel void TEMPLATE(tile_backward,Dtype)(const int_tp nthreads, | |
__global const Dtype* top_diff, | |
const int_tp tile_size, | |
const int_tp num_tiles, | |
const int_tp bottom_tile_axis, | |
__global Dtype* bottom_diff) { | |
for (int_tp index = get_global_id(0); index < nthreads; | |
index += get_global_size(0)) { | |
const int_tp d = index % tile_size; | |
const int_tp b = (index / tile_size) % bottom_tile_axis; | |
const int_tp n = index / tile_size / bottom_tile_axis; | |
bottom_diff[index] = 0; | |
int_tp top_index = (n * num_tiles * bottom_tile_axis + b) * tile_size + d; | |
for (int_tp t = 0; t < num_tiles; ++t) { | |
bottom_diff[index] += top_diff[top_index]; | |
top_index += bottom_tile_axis * tile_size; | |
} | |
} | |
} | |
#endif // DOUBLE_SUPPORT_AVAILABLE | |
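For reference, a minimal host-side sketch of launching one of the kernels above, assuming a cl_program already built from this source plus existing queue, g_buf and h_buf objects (all hypothetical names). The kernel name "sgd_update_float" follows the TEMPLATE(name,Dtype) expansion, and int_tp is a 32-bit int in this build; error handling is elided.

#include <CL/cl.h>

/* Hypothetical helper: enqueue sgd_update_float over n parameters. */
static void run_sgd_update(cl_program program, cl_command_queue queue,
                           cl_mem g_buf, cl_mem h_buf, cl_int n,
                           float momentum, float local_rate) {
  cl_int err;
  cl_kernel kernel = clCreateKernel(program, "sgd_update_float", &err);
  clSetKernelArg(kernel, 0, sizeof(cl_int), &n);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &g_buf);
  clSetKernelArg(kernel, 2, sizeof(cl_mem), &h_buf);
  clSetKernelArg(kernel, 3, sizeof(float), &momentum);
  clSetKernelArg(kernel, 4, sizeof(float), &local_rate);
  /* The kernel body is a grid-stride loop, so any global size >= 1 covers
   * all n elements; a multiple of a typical work-group size is chosen. */
  size_t global_size = 16384;
  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL,
                         0, NULL, NULL);
  clReleaseKernel(kernel);
}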
Note: Google Test filter = *OpenCLKernelCompileTest* | |
[==========] Running 2 tests from 2 test cases. | |
[----------] Global test environment set-up. | |
[----------] 1 test from OpenCLKernelCompileTest/0, where TypeParam = float | |
[ RUN ] OpenCLKernelCompileTest/0.TestCompile | |
Kernel bundle: activation: OK | |
Kernel bundle: auxiliary: OK | |
Kernel bundle: batch_reindex: OK | |
Kernel bundle: benchmark: OK | |
Kernel bundle: bias: OK | |
Kernel bundle: bnll: OK | |
Kernel bundle: channel: OK | |
Kernel bundle: concat: OK | |
Kernel bundle: contrastive_loss: OK | |
Kernel bundle: conv_layer_spatial: OK | |
Kernel bundle: conv_spatial_helper: OK | |
Kernel bundle: crop: OK | |
Kernel bundle: dropout: OK | |
Kernel bundle: eltwise: OK | |
Kernel bundle: elu: OK | |
BUILD LOG: | |
(92:0) : error : atomic function "atomic_cmpxchg" not supported | |
Kernel bundle: fft: OK | |
Kernel bundle: fillbuffer: OK | |
Kernel bundle: im2col: OK | |
Kernel bundle: im2col_nd: OK | |
Kernel bundle: lrn: OK | |
Kernel bundle: lstm_unit: OK | |
Kernel bundle: math: OK | |
Kernel bundle: mergecrop: OK | |
Kernel bundle: pooling: OK | |
Kernel bundle: pooling_nd: OK | |
Kernel bundle: pooling_sk: OK | |
Kernel bundle: slice: OK | |
Kernel bundle: softmax_loss: OK | |
Kernel bundle: solvers: OK | |
Kernel bundle: tile: OK | |
src/caffe/test/test_ocl_kernel_compile.cpp:79: Failure | |
Value of: failure | |
Actual: true | |
Expected: false | |
[ FAILED ] OpenCLKernelCompileTest/0.TestCompile, where TypeParam = float (775 ms) | |
[----------] 1 test from OpenCLKernelCompileTest/0 (775 ms total) | |
[----------] 1 test from OpenCLKernelCompileTest/1, where TypeParam = double | |
[ RUN ] OpenCLKernelCompileTest/1.TestCompile | |
[ OK ] OpenCLKernelCompileTest/1.TestCompile (0 ms) | |
[----------] 1 test from OpenCLKernelCompileTest/1 (1 ms total) | |
[----------] Global test environment tear-down | |
[==========] 2 tests from 2 test cases ran. (777 ms total) | |
[ PASSED ] 1 test. | |
[ FAILED ] 1 test, listed below: | |
[ FAILED ] OpenCLKernelCompileTest/0.TestCompile, where TypeParam = float | |
1 FAILED TEST |
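The single failure above is the bundle between elu and fft (its name is not echoed when the build fails); the build log points at atomic_cmpxchg, which this device's compiler rejects. A hedged sketch of one possible workaround, not part of the original source: these kernels typically emulate atomic float accumulation through atomic_cmpxchg, so gating the emulation behind the base-atomics extension and falling back to a plain add lets the bundle compile on such devices.

#if defined(cl_khr_global_int32_base_atomics)
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
// Emulate atomic float add via 32-bit compare-and-swap on the bit pattern.
inline void atomic_add_float(volatile __global float* addr, float val) {
  union { unsigned int u; float f; } old_val, new_val;
  do {
    old_val.f = *addr;
    new_val.f = old_val.f + val;
  } while (atomic_cmpxchg((volatile __global unsigned int*) addr,
                          old_val.u, new_val.u) != old_val.u);
}
#else
// Fallback when the device lacks base atomics: plain add, races possible.
inline void atomic_add_float(volatile __global float* addr, float val) {
  *addr += val;
}
#endif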