Julia CUDA 1d nn style large batch convolution
using CUDAnative, CuArrays

# Print an expression and its value from device code, then return the value.
macro cushow(ex)
    val = gensym("val")
    s = string(ex)
    quote
        $val = $(esc(ex))
        CUDAnative.@cuprintln($(Expr(:string, s, " = ", val)))
        $val
    end
end
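A usage sketch (not part of the gist; debug_kernel! is a hypothetical name): @cushow is meant to be used inside device code, where it echoes the expression text alongside its value and returns the value so it can be used inline.

function debug_kernel!()
    i = @cushow threadIdx().x  # device prints: threadIdx().x = 1
    return nothing
end
# launch with: @cuda threads=1 debug_kernel!()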
using ArgCheck

# Validate the shape conventions shared by all functions below:
#   out, inp: (width, channel, batch)
#   kernel:   (width, channel_out, channel_in)
function conv1d_checks(out, kernel, inp)
    @argcheck ndims(out) == 3
    @argcheck ndims(inp) == 3
    @argcheck ndims(kernel) == 3
    @argcheck size(out, 3) == size(inp, 3)      # batch sizes match
    @argcheck size(kernel, 2) == size(out, 2)   # output channels match
    @argcheck size(kernel, 3) == size(inp, 2)   # input channels match
    # "valid" convolution, i.e. no padding
    @argcheck size(out, 1) == size(inp, 1) - size(kernel, 1) + 1
end
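As a concrete check (numbers taken from the benchmark below): with inp of size (100, 32, 10^5) and ker of size (30, 32, 32), the only output size these checks accept is (100 - 30 + 1, 32, 10^5) = (71, 32, 10^5).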
# Each thread handles a slice of the batch dimension via a grid-stride loop;
# within one batch element it computes every (width, channel) output entry.
function conv1d_kernel!(out, ker, inp)
    dw = 1  # width
    dc = 2  # channel
    db = 3  # batch
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    @inbounds for ib in index:stride:size(inp, db)
        # @cushow ib
        # @cushow threadIdx().x
        # @cushow blockIdx().x
        for iwout in 1:size(out, dw)
            for icout in 1:size(out, dc)
                acc = zero(eltype(out))
                for icin in 1:size(inp, dc)
                    for iwker in 1:size(ker, dw)
                        iwin = iwker + iwout - 1
                        # TODO kernel format ker[iwker, icin, icout]
                        acc += ker[iwker, icout, icin] * inp[iwin, icin, ib]
                    end
                end
                out[iwout, icout, ib] = acc
            end
        end
    end
    return nothing
end
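For reference, here is a plain-loop CPU version that mirrors conv1d_kernel! index-for-index (an added sketch, not part of the original gist; conv1d_cpu! is an assumed name). It is useful for validating GPU results on small inputs.

function conv1d_cpu!(out, ker, inp)
    conv1d_checks(out, ker, inp)
    for ib in 1:size(inp, 3), iwout in 1:size(out, 1), icout in 1:size(out, 2)
        acc = zero(eltype(out))
        for icin in 1:size(inp, 2), iwker in 1:size(ker, 1)
            acc += ker[iwker, icout, icin] * inp[iwker + iwout - 1, icin, ib]
        end
        out[iwout, icout, ib] = acc
    end
    return out
end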
using Test

@testset "size(ker) = (w, 1, 1)" begin
    ker = cu(reshape(Float32[1,2,3], (3,1,1)))
    inp = cu(reshape(Float32[0,1,0,0], (4,1,1)))
    expected = cu(reshape(Float32[2, 1], (2, 1, 1)))
    out = similar(expected)
    conv1d_checks(out, ker, inp)
    @cuda conv1d_kernel!(out, ker, inp)  # fills out in place
    @test out == expected
end

@testset "size(ker) = (1, *, *)" begin
    # With kernel width 1, the convolution reduces to a matrix-vector
    # product A * v at each width position.
    dfrom = 3
    dto = 2
    v = randn(Float32, dfrom)
    A = randn(Float32, dto, dfrom)
    Av = A * v
    inp = cu(reshape(v, 1, dfrom, 1))
    ker = cu(reshape(A, 1, dto, dfrom))
    out = similar(inp, eltype(inp), (1, dto, 1))
    conv1d_checks(out, ker, inp)
    @cuda blocks=4 conv1d_kernel!(out, ker, inp)
    @test out ≈ cu(reshape(Av, (1, dto, 1)))
end

@testset "identity kernel" begin
    inp = cu(randn(Float32, 3, 2, 10))
    out = similar(inp)
    ker = cu(reshape([1f0 0; 0 1], (1,2,2)))  # width-1 identity over channels
    conv1d_checks(out, ker, inp)
    @cuda threads=2 blocks=2 conv1d_kernel!(out, ker, inp)
    @test out ≈ inp
end
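Given the conv1d_cpu! sketch above, a cross-check of the GPU kernel against the CPU loops could look like this (again an addition, not from the gist):

@testset "GPU matches CPU reference" begin
    inp_h = randn(Float32, 10, 3, 5)
    ker_h = randn(Float32, 4, 2, 3)
    out_h = zeros(Float32, 7, 2, 5)   # 10 - 4 + 1 = 7 output positions
    conv1d_cpu!(out_h, ker_h, inp_h)
    out_d = cu(zeros(Float32, 7, 2, 5))
    @cuda threads=2 blocks=2 conv1d_kernel!(out_d, cu(ker_h), cu(inp_h))
    @test Array(out_d) ≈ out_h
end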
# Benchmark setup: a large batch of 1d multi-channel signals.
nb = 10^5      # batch size
ncin = 32      # input channels
ncout = 32     # output channels
nwin = 100     # input width
nwker = 30     # kernel width
nwout = nwin - nwker + 1
inp = cu(randn(Float32, nwin, ncin, nb))
ker = cu(randn(Float32, nwker, ncout, ncin))
out = cu(randn(Float32, nwout, ncout, nb))
conv1d_checks(out, ker, inp);
# Count the multiply-add operations: each output position corresponds to an
# (ncout, ncin * nwker) matrix acting on the flattened input window.
function conv1d_flopcount(out, ker, inp)
    conv1d_checks(out, ker, inp)
    nwin, ncin, nb = size(inp)
    nwker, ncout_ker, ncin_ker = size(ker)
    nwout, ncout, nb_out = size(out)
    @assert nb_out == nb
    @assert ncout_ker == ncout
    @assert ncin_ker == ncin
    matsize = (ncout, ncin * nwker)
    flops_matmul = matsize[1] * matsize[2]
    nwout * flops_matmul * nb
end
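For the sizes above this works out to 32 * (32 * 30) = 30720 multiply-adds per output position, times nwout = 71 positions and nb = 10^5 batch elements, i.e. about 2.18e11 operations per kernel launch.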
@show conv1d_flopcount(out, ker, inp) |> Float64

@time begin
    CuArrays.@sync begin
        @cuda threads=32 blocks=256 conv1d_kernel!(out, ker, inp)
    end
end
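One way to turn the timing into a throughput figure (a sketch, not part of the gist, using Base's @elapsed with CuArrays.@sync; the extra warm-up launch keeps compilation out of the measurement):

@cuda threads=32 blocks=256 conv1d_kernel!(out, ker, inp)  # warm-up / compile
t = @elapsed CuArrays.@sync @cuda threads=32 blocks=256 conv1d_kernel!(out, ker, inp)
@show conv1d_flopcount(out, ker, inp) / t / 1e9  # billions of multiply-adds per second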