# An implementation of GPU-based bilinear upsampling, including its gradient.
# WARNING: untested code ahead!
# The code is a translation of the following files:
# https://github.com/pytorch/pytorch/blob/master/caffe2/operators/upsample_op.cu
# https://github.com/pytorch/pytorch/blob/master/caffe2/core/common_gpu.h
# Open issues:
# 1) type stability?
# 2) licensing?
# Note on indexing: the reference code indexes row-major NCHW arrays. This port
# assumes Flux-style WHCN (width, height, channel, batch) CuArrays, whose
# column-major memory layout matches, so `idx` can be used unchanged.
using CUDAnative                    # @cuda, blockIdx, blockDim, threadIdx, gridDim, ...
using CUDAnative: atomic_add!, ldg  # imported explicitly in case they are not exported
using CuArrays

const CUDA_NUM_THREADS = 128
const MAXIMUM_NUM_BLOCKS = 4096

@inline function GET_BLOCKS(N::Integer)
    # Use at least 1 block, since CUDA does not allow an empty grid
    return max(min((N + CUDA_NUM_THREADS - 1) ÷ CUDA_NUM_THREADS, MAXIMUM_NUM_BLOCKS), 1)
end
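
# A few sanity checks for the launch helper (illustrative additions, not part
# of the original translation); the values follow from the constants above:
@assert GET_BLOCKS(1) == 1        # never launch an empty grid
@assert GET_BLOCKS(128) == 1      # one full block of CUDA_NUM_THREADS
@assert GET_BLOCKS(129) == 2      # partial blocks round up
@assert GET_BLOCKS(10^9) == 4096  # capped at MAXIMUM_NUM_BLOCKS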
# Map 0-based (n, c, y, x) NCHW coordinates to a 1-based linear index.
@inline function idx(
    n::Integer,
    num_channels::Integer,
    c::Integer,
    height::Integer,
    width::Integer,
    y::Integer,
    x::Integer)
    return ((n * num_channels + c) * height + y) * width + x + 1
end
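
# Illustrative check (not part of the original translation): `idx` computes a
# row-major NCHW linear index, which coincides with Julia's column-major linear
# index for an array with dimensions (width, height, channels, batch):
let A = rand(Float32, 4, 3, 2, 1)  # WHCN with w=4, h=3, c=2, n=1
    lin = LinearIndices(A)
    # 0-based (n=0, c=1, y=2, x=3) maps to the 1-based entry (x+1, y+1, c+1, n+1)
    @assert idx(0, 2, 1, 3, 4, 2, 3) == lin[3 + 1, 2 + 1, 1 + 1, 0 + 1]
end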
function UpsampleBilinearKernel(
    num_batch,
    num_channels,
    input_height,
    input_width,
    output_height,
    output_width,
    height_scale,  # unused; kept for parity with the reference signature
    width_scale,   # unused; kept for parity with the reference signature
    X,
    Y)
    out_size = output_height * output_width
    # CUDA 1D grid-stride loop; `index` is 1-based, the decomposition below
    # works on the 0-based value, as in the CUDA source
    for index in ((blockIdx().x-1) * blockDim().x + threadIdx().x) : (blockDim().x * gridDim().x) : out_size
        indexTemp = index - 1
        out_x = indexTemp % output_width
        indexTemp = indexTemp ÷ output_width
        out_y = indexTemp % output_height
        rheight = output_height > 1 ? (input_height - 1f0) / (output_height - 1f0) : 0f0
        rwidth = output_width > 1 ? (input_width - 1f0) / (output_width - 1f0) : 0f0
        # Compute Y axis lambdas
        h1r = rheight * out_y
        h1 = unsafe_trunc(Int32, h1r)  # the C code casts to int; trunc == floor for h1r >= 0
        h1p = (h1 < input_height - 1) ? 1 : 0
        h1lambda = h1r - h1
        h0lambda = 1f0 - h1lambda
        # Compute X axis lambdas
        w1r = rwidth * out_x
        w1 = unsafe_trunc(Int32, w1r)
        w1p = (w1 < input_width - 1) ? 1 : 0
        w1lambda = w1r - w1
        w0lambda = 1f0 - w1lambda
        for n in 0:num_batch-1
            for c in 0:num_channels-1
                X0 = X[idx(n, num_channels, c, input_height, input_width, h1,       w1)]
                X1 = X[idx(n, num_channels, c, input_height, input_width, h1,       w1 + w1p)]
                X2 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1)]
                X3 = X[idx(n, num_channels, c, input_height, input_width, h1 + h1p, w1 + w1p)]
                Y[idx(n, num_channels, c, output_height, output_width, out_y, out_x)] =
                    h0lambda * (w0lambda * X0 + w1lambda * X1) +
                    h1lambda * (w0lambda * X2 + w1lambda * X3)
            end # channels
        end # batch
    end # 1D kernel loop
    return nothing
end
function upsample_bilinear(x, height_scale, width_scale)
    # x is assumed to be WHCN; its column-major memory order matches the
    # kernel's row-major NCHW indexing
    w, h, c, n = Int32.(size(x))
    out_h = round(Int32, height_scale * h)
    out_w = round(Int32, width_scale * w)
    out_size = out_h * out_w
    nblocks = GET_BLOCKS(out_size)
    out = CuArray{Float32}(undef, out_w, out_h, c, n)
    CuArrays.@sync @cuda blocks=nblocks threads=CUDA_NUM_THREADS UpsampleBilinearKernel(n, c, h, w, out_h, out_w, height_scale, width_scale, x, out)
    return out
end
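
# Hypothetical usage of the forward pass (needs a CUDA device; sizes are only
# an example): 2x upsampling of a 4x3 single-channel image. With this
# align-corners style interpolation the corner pixels are reproduced exactly.
x = CuArray(rand(Float32, 4, 3, 1, 1))  # WHCN layout
y = upsample_bilinear(x, 2f0, 2f0)
@assert size(y) == (8, 6, 1, 1)
@assert Array(y)[1, 1, 1, 1] ≈ Array(x)[1, 1, 1, 1]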
# input is dY, output is dX
function UpsampleBilinearGradientKernel(
    input_size,
    num_channels,
    input_height,
    input_width,
    output_height,
    output_width,
    height_scale,  # unused; kept for parity with the reference signature
    width_scale,   # unused; kept for parity with the reference signature
    dY,
    dX)
    # CUDA 1D grid-stride loop (CUDA_1D_KERNEL_LOOP in the reference code);
    # `index` is 1-based, the decomposition below works on the 0-based value
    for index in ((blockIdx().x - 1) * blockDim().x + threadIdx().x) : (blockDim().x * gridDim().x) : input_size
        indexTemp = index - 1
        in_x = indexTemp % input_width
        indexTemp ÷= input_width
        in_y = indexTemp % input_height
        indexTemp ÷= input_height
        c = indexTemp % num_channels
        indexTemp ÷= num_channels
        n = indexTemp
        rheight = output_height > 1 ? (output_height - 1f0) / (input_height - 1f0) : 0f0
        rwidth = output_width > 1 ? (output_width - 1f0) / (input_width - 1f0) : 0f0
        # Compute Y axis lambdas
        h1r = rheight * in_y
        h1 = unsafe_trunc(Int32, h1r)  # the C code casts to int; trunc == floor for h1r >= 0
        h1p = (h1 < output_height - 1) ? 1 : 0
        h1lambda = h1r - h1
        h0lambda = 1f0 - h1lambda
        # Compute X axis lambdas
        w1r = rwidth * in_x
        w1 = unsafe_trunc(Int32, w1r)
        w1p = (w1 < output_width - 1) ? 1 : 0
        w1lambda = w1r - w1
        w0lambda = 1f0 - w1lambda
        # read dY through the read-only cache (__ldg in the CUDA source)
        dYi = ldg(dY, index)
        # scatter the gradient to the four source pixels; atomics are needed
        # because neighbouring dY elements write to overlapping dX locations
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1,       w1)),       h0lambda * w0lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1,       w1 + w1p)), h0lambda * w1lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1)),       h1lambda * w0lambda * dYi)
        atomic_add!(pointer(dX, idx(n, num_channels, c, output_height, output_width, h1 + h1p, w1 + w1p)), h1lambda * w1lambda * dYi)
    end
    return nothing
end
function upsample_bilinear_gradient(dy, x)
    # dy: gradient w.r.t. the upsampled output, x: the forward input (both WHCN)
    w, h, c, n = Int32.(size(dy))
    input_size = length(dy)
    out_w = Int32(size(x, 1))
    out_h = Int32(size(x, 2))
    height_scale = Float32(h / out_h)  # forward scale factors, > 1 for upsampling
    width_scale = Float32(w / out_w)
    nblocks = GET_BLOCKS(input_size)   # the kernel iterates over dY, i.e. input_size elements
    # dX is accumulated via atomic_add!, so it must start out zeroed
    dx = CuArrays.zeros(Float32, out_w, out_h, c, n)
    CuArrays.@sync @cuda blocks=nblocks threads=CUDA_NUM_THREADS UpsampleBilinearGradientKernel(input_size, c, h, w, out_h, out_w, height_scale, width_scale, dy, dx)
    return dx
end
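
# Hypothetical gradient check (needs a CUDA device; reuses `x` and `y` from the
# forward example above). Each dY element is scattered to dX with four bilinear
# weights that sum to one, so the total gradient mass should be conserved:
dy = CuArray(rand(Float32, size(y)...))  # gradient w.r.t. the upsampled output
dx = upsample_bilinear_gradient(dy, x)
@assert size(dx) == size(x)
@assert isapprox(sum(Array(dx)), sum(Array(dy)); rtol = 1f-4)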