@jw3126
Created April 27, 2020 13:48
Julia CUDA 1D NN-style large-batch convolution
using CUDAnative, CuArrays

# Device-side `@show`: print an expression and its value from inside a kernel.
macro cushow(ex)
    val = gensym("val")
    s = string(ex)
    quote
        $val = $(esc(ex))
        CUDAnative.@cuprintln($(Expr(:string, s, " = ", val)))
        $val
    end
end
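# Usage sketch (my addition, not part of the original gist): `@cushow` behaves
# like `@show`, but on the device. The commented-out calls inside
# `conv1d_kernel!` below use it the same way; a throwaway kernel illustrates it:
function cushow_demo!()
    @cushow threadIdx().x
    return nothing
end
# @cuda threads=2 cushow_demo!()   # prints "threadIdx().x = 1", "threadIdx().x = 2"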
using ArgCheck

function conv1d_checks(out, kernel, inp)
    dw = 1     # width
    dc = 2     # channel
    dbatch = 3 # batch
    # out, inp: w, c, nbatch
    # kernel: w, c_to, c_from
    @argcheck ndims(out) == 3
    @argcheck ndims(inp) == 3
    @argcheck ndims(kernel) == 3
    @argcheck size(out, 3) == size(inp, 3)
    @argcheck size(kernel, 2) == size(out, 2)
    @argcheck size(kernel, 3) == size(inp, 2)
    @argcheck size(out, 1) == size(inp, 1) - size(kernel, 1) + 1
end
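# Shape sketch (illustrative sizes of my own choosing): this is a "valid"
# convolution, so the output width is size(inp, 1) - size(kernel, 1) + 1.
conv1d_checks(zeros(Float32, 3, 7, 4),   # out:    nwout = 3, ncout = 7, nbatch = 4
              zeros(Float32, 3, 7, 2),   # kernel: nwker = 3, ncout = 7, ncin  = 2
              zeros(Float32, 5, 2, 4))   # inp:    nwin  = 5, ncin  = 2, nbatch = 4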
function conv1d_kernel!(out, ker, inp)
    dw = 1 # width
    dc = 2 # channel
    db = 3 # batch
    # Grid-stride loop: each thread handles a subset of the batch dimension.
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    @inbounds for ib in index:stride:size(inp, db)
        # @cushow ib
        # @cushow threadIdx().x
        # @cushow blockIdx().x
        for iwout in 1:size(out, dw)
            for icout in 1:size(out, dc)
                acc = zero(eltype(out))
                for icin in 1:size(inp, dc)
                    for iwker in 1:size(ker, dw)
                        iwin = iwker + iwout - 1
                        # TODO kernel format ker[iwker, icin, icout]
                        acc += ker[iwker, icout, icin] * inp[iwin, icin, ib]
                    end
                end
                out[iwout, icout, ib] = acc
            end
        end
    end
    return nothing
end
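# CPU reference implementation (my own sketch, not part of the original gist),
# using the same (w, c_to, c_from) kernel layout; handy for cross-checking the
# GPU kernel on small problems.
function conv1d_cpu(ker, inp)
    nwout = size(inp, 1) - size(ker, 1) + 1
    out = zeros(promote_type(eltype(ker), eltype(inp)), nwout, size(ker, 2), size(inp, 3))
    for ib in axes(out, 3), icout in axes(out, 2), iwout in axes(out, 1)
        acc = zero(eltype(out))
        for icin in axes(inp, 2), iwker in axes(ker, 1)
            acc += ker[iwker, icout, icin] * inp[iwker + iwout - 1, icin, ib]
        end
        out[iwout, icout, ib] = acc
    end
    return out
end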
using Test

@testset "size(ker) = (w, 1, 1)" begin
    ker = cu(reshape(Float32[1, 2, 3], (3, 1, 1)))
    inp = cu(reshape(Float32[0, 1, 0, 0], (4, 1, 1)))
    expected = cu(reshape(Float32[2, 1], (2, 1, 1)))
    out = similar(expected)
    conv1d_checks(out, ker, inp)
    @cuda conv1d_kernel!(out, ker, inp) # fills `out` in place
    @test out == expected
end
@testset "size(ker) = (1, *, *)" begin
    dfrom = 3
    dto = 2
    v = randn(Float32, dfrom)
    A = randn(Float32, dto, dfrom)
    Av = A * v
    inp = cu(reshape(v, 1, dfrom, 1))
    ker = cu(reshape(A, 1, dto, dfrom))
    out = similar(inp, eltype(inp), (1, dto, 1))
    conv1d_checks(out, ker, inp)
    @cuda blocks=4 conv1d_kernel!(out, ker, inp)
    @test out ≈ cu(reshape(Av, (1, dto, 1)))
end
@testset "identity kernel" begin
    inp = cu(randn(Float32, 3, 2, 10))
    out = similar(inp)
    ker = cu(reshape([1f0 0; 0 1], (1, 2, 2)))
    conv1d_checks(out, ker, inp)
    @cuda threads=2 blocks=2 conv1d_kernel!(out, ker, inp)
    @test out ≈ inp
end
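# Cross-check against the CPU reference defined above (my addition): run the
# GPU kernel on random data and compare with `conv1d_cpu` after copying back.
@testset "matches CPU reference" begin
    inp = randn(Float32, 10, 3, 7)
    ker = randn(Float32, 4, 5, 3)
    out = cu(zeros(Float32, 7, 5, 7))
    conv1d_checks(out, ker, inp)
    @cuda threads=4 blocks=2 conv1d_kernel!(out, cu(ker), cu(inp))
    @test Array(out) ≈ conv1d_cpu(ker, inp)
end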
# Benchmark problem: large batch, moderate width and channel counts.
nb = 10^5
ncin = 32
ncout = 32
nwin = 100
nwker = 30
nwout = nwin - nwker + 1
inp = cu(randn(Float32, nwin, ncin, nb))
ker = cu(randn(Float32, nwker, ncout, ncin))
out = cu(randn(Float32, nwout, ncout, nb))
conv1d_checks(out, ker, inp);
function conv1d_flopcount(out, ker, inp)
    # Count one operation per multiply-accumulate: each output position is
    # equivalent to an (ncout) x (ncin * nwker) matrix-vector product.
    conv1d_checks(out, ker, inp)
    nwin, ncin, nb = size(inp)
    nwker, ncout_ker, ncin_ker = size(ker)
    nwout, ncout, nb_out = size(out)
    @assert nb_out == nb
    @assert ncout_ker == ncout
    @assert ncin_ker == ncin
    matsize = (ncout, ncin * nwker)
    flops_matmul = matsize[1] * matsize[2]
    nwout * flops_matmul * nb
end
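# Worked example with the benchmark sizes above (my own arithmetic, one
# operation per multiply-accumulate):
# nwout * ncout * (ncin * nwker) * nb = 71 * 32 * (32 * 30) * 10^5 ≈ 2.18e11.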
@show conv1d_flopcount(out, ker, inp) |> Float64
@time begin
    CuArrays.@sync begin
        @cuda threads=32 blocks=256 conv1d_kernel!(out, ker, inp)
    end
end
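# Rough throughput estimate (my addition, not in the original gist): time one
# more synchronized launch and convert the operation count to GOP/s.
t = @elapsed CuArrays.@sync @cuda threads=32 blocks=256 conv1d_kernel!(out, ker, inp)
@show conv1d_flopcount(out, ker, inp) / t / 1e9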