@jw3126
Created April 27, 2020 13:48
Julia CUDA 1D NN-style large-batch convolution
using CUDAnative, CuArrays

# Device-side `@show`: print an expression and its value from inside a kernel.
macro cushow(ex)
    val = gensym("val")
    s = string(ex)
    quote
        $val = $(esc(ex))
        CUDAnative.@cuprintln($(Expr(:string, s, " = ", val)))
        $val
    end
end
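# Usage sketch (my addition, not part of the original gist): `@cushow` behaves
# like `@show`, but on the device. The commented-out calls inside
# `conv1d_kernel!` below use it the same way; a throwaway kernel illustrates it:
function cushow_demo!()
    @cushow threadIdx().x
    return nothing
end
# @cuda threads=2 cushow_demo!()   # prints "threadIdx().x = 1", "threadIdx().x = 2"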
using ArgCheck

function conv1d_checks(out, kernel, inp)
    dw = 1     # width
    dc = 2     # channel
    dbatch = 3 # batch
    # out, inp: w, c, nbatch
    # kernel: w, c_to, c_from
    @argcheck ndims(out) == 3
    @argcheck ndims(inp) == 3
    @argcheck ndims(kernel) == 3
    @argcheck size(out, 3) == size(inp, 3)
    @argcheck size(kernel, 2) == size(out, 2)
    @argcheck size(kernel, 3) == size(inp, 2)
    @argcheck size(out, 1) == size(inp, 1) - size(kernel, 1) + 1
end
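# Shape sketch (illustrative sizes of my own choosing): this is a "valid"
# convolution, so the output width is size(inp, 1) - size(kernel, 1) + 1.
conv1d_checks(zeros(Float32, 3, 7, 4),   # out:    nwout = 3, ncout = 7, nbatch = 4
              zeros(Float32, 3, 7, 2),   # kernel: nwker = 3, ncout = 7, ncin  = 2
              zeros(Float32, 5, 2, 4))   # inp:    nwin  = 5, ncin  = 2, nbatch = 4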
function conv1d_kernel!(out, ker, inp)
    dw = 1 # width
    dc = 2 # channel
    db = 3 # batch
    # Grid-stride loop: each thread handles a subset of the batch dimension.
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    @inbounds for ib in index:stride:size(inp, db)
        # @cushow ib
        # @cushow threadIdx().x
        # @cushow blockIdx().x
        for iwout in 1:size(out, dw)
            for icout in 1:size(out, dc)
                acc = zero(eltype(out))
                for icin in 1:size(inp, dc)
                    for iwker in 1:size(ker, dw)
                        iwin = iwker + iwout - 1
                        # TODO kernel format ker[iwker, icin, icout]
                        acc += ker[iwker, icout, icin] * inp[iwin, icin, ib]
                    end
                end
                out[iwout, icout, ib] = acc
            end
        end
    end
    return nothing
end
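# CPU reference implementation (my own sketch, not part of the original gist),
# using the same (w, c_to, c_from) kernel layout; handy for cross-checking the
# GPU kernel on small problems.
function conv1d_cpu(ker, inp)
    nwout = size(inp, 1) - size(ker, 1) + 1
    out = zeros(promote_type(eltype(ker), eltype(inp)), nwout, size(ker, 2), size(inp, 3))
    for ib in axes(out, 3), icout in axes(out, 2), iwout in axes(out, 1)
        acc = zero(eltype(out))
        for icin in axes(inp, 2), iwker in axes(ker, 1)
            acc += ker[iwker, icout, icin] * inp[iwker + iwout - 1, icin, ib]
        end
        out[iwout, icout, ib] = acc
    end
    return out
end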
using Test

@testset "size(ker) = (w, 1, 1)" begin
    ker = cu(reshape(Float32[1, 2, 3], (3, 1, 1)))
    inp = cu(reshape(Float32[0, 1, 0, 0], (4, 1, 1)))
    expected = cu(reshape(Float32[2, 1], (2, 1, 1)))
    out = similar(expected)
    conv1d_checks(out, ker, inp)
    @cuda conv1d_kernel!(out, ker, inp) # fills `out` in place
    @test out == expected
end
@testset "size(ker) = (1, *, *)" begin
    dfrom = 3
    dto = 2
    v = randn(Float32, dfrom)
    A = randn(Float32, dto, dfrom)
    Av = A * v
    inp = cu(reshape(v, 1, dfrom, 1))
    ker = cu(reshape(A, 1, dto, dfrom))
    out = similar(inp, eltype(inp), (1, dto, 1))
    conv1d_checks(out, ker, inp)
    @cuda blocks=4 conv1d_kernel!(out, ker, inp)
    @test out ≈ cu(reshape(Av, (1, dto, 1)))
end
@testset "identity kernel" begin
    inp = cu(randn(Float32, 3, 2, 10))
    out = similar(inp)
    ker = cu(reshape([1f0 0; 0 1], (1, 2, 2)))
    conv1d_checks(out, ker, inp)
    @cuda threads=2 blocks=2 conv1d_kernel!(out, ker, inp)
    @test out ≈ inp
end
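# Cross-check against the CPU reference defined above (my addition): run the
# GPU kernel on random data and compare with `conv1d_cpu` after copying back.
@testset "matches CPU reference" begin
    inp = randn(Float32, 10, 3, 7)
    ker = randn(Float32, 4, 5, 3)
    out = cu(zeros(Float32, 7, 5, 7))
    conv1d_checks(out, ker, inp)
    @cuda threads=4 blocks=2 conv1d_kernel!(out, cu(ker), cu(inp))
    @test Array(out) ≈ conv1d_cpu(ker, inp)
end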
# Benchmark problem: large batch, moderate width and channel counts.
nb = 10^5
ncin = 32
ncout = 32
nwin = 100
nwker = 30
nwout = nwin - nwker + 1
inp = cu(randn(Float32, nwin, ncin, nb))
ker = cu(randn(Float32, nwker, ncout, ncin))
out = cu(randn(Float32, nwout, ncout, nb))
conv1d_checks(out, ker, inp);
function conv1d_flopcount(out, ker, inp)
    # Count one operation per multiply-accumulate: each output position is
    # equivalent to an (ncout) x (ncin * nwker) matrix-vector product.
    conv1d_checks(out, ker, inp)
    nwin, ncin, nb = size(inp)
    nwker, ncout_ker, ncin_ker = size(ker)
    nwout, ncout, nb_out = size(out)
    @assert nb_out == nb
    @assert ncout_ker == ncout
    @assert ncin_ker == ncin
    matsize = (ncout, ncin * nwker)
    flops_matmul = matsize[1] * matsize[2]
    nwout * flops_matmul * nb
end
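# Worked example with the benchmark sizes above (my own arithmetic, one
# operation per multiply-accumulate):
# nwout * ncout * (ncin * nwker) * nb = 71 * 32 * (32 * 30) * 10^5 ≈ 2.18e11.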
@show conv1d_flopcount(out, ker, inp) |> Float64
@time begin
    CuArrays.@sync begin
        @cuda threads=32 blocks=256 conv1d_kernel!(out, ker, inp)
    end
end
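# Rough throughput estimate (my addition, not in the original gist): time one
# more synchronized launch and convert the operation count to GOP/s.
t = @elapsed CuArrays.@sync @cuda threads=32 blocks=256 conv1d_kernel!(out, ker, inp)
@show conv1d_flopcount(out, ker, inp) / t / 1e9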