Last active
February 6, 2022 14:39
-
-
Save willtebbutt/cc268e614d2b73156f5bc24e7408613d to your computer and use it in GitHub Desktop.
GPs on the GPU
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# --- Environment setup -------------------------------------------------------
# Load GP + GPU tooling and construct one shared test problem used by all of
# the benchmark functions below.
using Revise
using AbstractGPs
using BenchmarkTools
using CUDA
using KernelFunctions
using LinearAlgebra
using Random

using AbstractGPs: AbstractGP, FiniteGP, ZeroMean

# Fail loudly if any GPU operation would silently fall back to scalar indexing.
CUDA.allowscalar(false)

# Shared inputs: 1_000 points in 25 dimensions, stored column-wise.
# `x_gpu` holds the same data transferred to the device.
x_cpu = ColVecs(randn(25, 1_000))
x_gpu = ColVecs(cu(x_cpu.X))
# GPU-friendly SEKernel Gram matrix via the identity
#     ‖x - y‖² = ‖x‖² + ‖y‖² - 2⟨x, y⟩,
# which maps the pairwise-distance computation onto a single GEMM plus
# elementwise broadcasts — all operations CUDA executes natively.
function KernelFunctions.kernelmatrix(::SEKernel, x::ColVecs{<:Real, <:CuMatrix{<:Real}})
    X = x.X
    # Per-column squared norms as a 1×N row vector.
    X_ = sum(abs2, X; dims=1)
    # Squared pairwise distances. The Gram-trick subtraction can produce tiny
    # negative entries through floating-point cancellation (especially on the
    # diagonal), which would make exp.(.-D ./ 2) exceed 1 — an invalid SE
    # kernel value. Clamp at zero to restore D >= 0.
    D = max.(X_ .+ X_' .- 2 .* X' * X, zero(eltype(X)))
    return exp.(.-D ./ 2)
end
# Compare CPU vs GPU Gram-matrix construction for the SEKernel: first verify
# the two implementations agree, then time each one.
function benchmark_stuff()
    K_cpu = kernelmatrix(SEKernel(), x_cpu)
    K_gpu = collect(kernelmatrix(SEKernel(), x_gpu))

    # Sanity check: largest elementwise discrepancy between the two paths.
    @show maximum(abs.(K_cpu .- K_gpu))

    for (label, input) in (("CPU", x_cpu), ("GPU", x_gpu))
        println(label)
        display(@benchmark kernelmatrix(SEKernel(), $input))
        println()
    end
end
# Zero mean for GPU inputs: produce the zeros directly on the device (via
# CUDA.zeros) instead of building a host vector and copying it over.
function AbstractGPs._map_meanfunction(
    ::ZeroMean{T}, x::ColVecs{<:Real, <:CuMatrix{<:Real}}
) where {T}
    return CUDA.zeros(T, length(x))
end
# We could easily add a hook for the thing that generates ε into our current code, to make specialisation easier here. | |
# Draw N joint samples from a FiniteGP whose inputs live on the GPU.
# NOTE(review): `rng` is not actually used — ε is drawn from CUDA's global RNG
# so that it is generated on-device; see the hook comment above. Seeding the
# passed-in rng therefore does not make this reproducible — confirm intended.
function AbstractGPs.rand(
    rng::AbstractRNG, f::FiniteGP{<:AbstractGP, <:ColVecs{<:Real, <:CuMatrix}}, N::Int
)
    # Mean and covariance of the finite-dimensional marginal.
    m, C_mat = mean_and_cov(f)
    C = cholesky(AbstractGPs._symmetric(C_mat))

    # Device-resident standard normals in the common eltype of mean and factor.
    T = promote_type(eltype(m), eltype(C))
    ε = CUDA.randn(T, length(m), N)

    # Affine transform: samples = m + Lε, with L = U' the lower Cholesky factor.
    return m .+ C.U' * ε
end
# One sample from a zero-mean SE-kernel GP at `x` on the CPU, with observation
# variance 0.1 and a fixed seed so repeated calls return the same draw.
function sample_gp_cpu(x)
    prior = GP(SEKernel())
    return rand(Xoshiro(1), prior(x, 0.1))
end
# GPU counterpart of `sample_gp_cpu`: Float32 zero mean, and the observation
# noise passed as a per-point variance vector allocated on the device.
function sample_gp_gpu(x)
    prior = GP(ZeroMean{Float32}(), SEKernel())
    noise = 0.1f0 * CUDA.ones(length(x))
    # rand with N = 1 yields an n×1 matrix; flatten so the return shape
    # matches the CPU path's vector.
    return vec(rand(Xoshiro(1), prior(x, noise), 1))
end
# Time a single GP sample draw on CPU vs GPU.
function benchmark_sampling()
    println("CPU")
    display(@benchmark sample_gp_cpu($x_cpu))
    println()

    println("GPU")
    display(@benchmark sample_gp_gpu($x_gpu))
    println()
    return nothing
end
# Looks like CUDA is missing this definition. Shouldn't be a problem to add. | |
# CUDA lacks a `logdet` method for GPU Cholesky factorisations; add one here.
# For C = U'U, logdet(C) = 2 Σᵢ log Uᵢᵢ — only the factor's diagonal is needed.
function LinearAlgebra.logdet(C::Cholesky{<:Real, <:CuMatrix{<:Real}})
    # Bug fix vs the original `2 * sum(diag(log.(C.factors)))`: broadcasting
    # `log` over the full `factors` matrix evaluates it on the off-triangle
    # entries too, which hold leftover input data that can be negative (a
    # DomainError for real log), and it allocates an N×N temporary. Reduce over
    # the diagonal only, which is strictly positive for a successful Cholesky.
    return 2 * sum(log, diag(C.factors))
end
# Log marginal likelihood of observations `y` under a zero-mean SE-kernel GP
# at inputs `x`, CPU path with observation variance 0.1.
function logpdf_gp_cpu(x, y)
    prior = GP(SEKernel())
    return logpdf(prior(x, 0.1), y)
end
# GPU counterpart of `logpdf_gp_cpu`: Float32 zero mean, with the observation
# noise supplied as a per-point variance vector allocated on the device.
function logpdf_gp_gpu(x, y)
    prior = GP(ZeroMean{Float32}(), SEKernel())
    noise = 0.1f0 * CUDA.ones(length(x))
    return logpdf(prior(x, noise), y)
end
# Time GP log-likelihood evaluation on CPU vs GPU, drawing matching
# observations on each device first.
function benchmark_logpdf()
    y_cpu = sample_gp_cpu(x_cpu)
    y_gpu = sample_gp_gpu(x_gpu)

    println("CPU")
    display(@benchmark logpdf_gp_cpu($x_cpu, $y_cpu))
    println()

    # Print the GPU value once before timing it.
    @show logpdf_gp_gpu(x_gpu, y_gpu)
    println("GPU")
    display(@benchmark logpdf_gp_gpu($x_gpu, $y_gpu))
    println()
    return nothing
end
# Let Revise pick up edits to this file during the interactive session.
Revise.track(@__FILE__)
# julia> benchmark_sampling() | |
# CPU | |
# BenchmarkTools.Trial: 157 samples with 1 evaluation. | |
# Range (min … max): 28.447 ms … 41.251 ms ┊ GC (min … max): 0.00% … 0.00% | |
# Time (median): 29.841 ms ┊ GC (median): 0.00% | |
# Time (mean ± σ): 31.895 ms ± 3.375 ms ┊ GC (mean ± σ): 5.69% ± 7.03% | |
# ▄█ █ | |
# ██▄▁▁▁▃█▇▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▅▅▄▅▃▁▃▄▃▁▁▅▁▄▃▁▁▁▃▃▆▃▃▆▃▃▁▃▃▁▁▃ ▃ | |
# 28.4 ms Histogram: frequency by time 38.6 ms < | |
# Memory estimate: 30.56 MiB, allocs estimate: 24. | |
# GPU | |
# BenchmarkTools.Trial: 1963 samples with 1 evaluation. | |
# Range (min … max): 1.673 ms … 326.435 ms ┊ GC (min … max): 0.00% … 2.11% | |
# Time (median): 1.798 ms ┊ GC (median): 0.00% | |
# Time (mean ± σ): 2.630 ms ± 16.350 ms ┊ GC (mean ± σ): 0.65% ± 0.10% | |
# ▂▁▁▃▂▃▄▄▃▃▂▃█▄▇▃▃▅▂▅▆▃▃█▃▄▄▃▁▁▁ | |
# ▃▄▆▇████████████████████████████████▇▇▄▄▃▃▃▃▃▂▃▃▁▂▃▃▂▃▂▂▃▃▂ ▅ | |
# 1.67 ms Histogram: frequency by time 2.02 ms < | |
# Memory estimate: 19.80 KiB, allocs estimate: 356. | |
# julia> benchmark_logpdf() | |
# abs(logpdf_gp_cpu(x_cpu, y_cpu) - logpdf_gp_gpu(x_gpu, y_gpu)) = 0.00013706540744351514 | |
# CPU | |
# BenchmarkTools.Trial: 156 samples with 1 evaluation. | |
# Range (min … max): 24.175 ms … 46.282 ms ┊ GC (min … max): 0.00% … 10.24% | |
# Time (median): 29.421 ms ┊ GC (median): 0.00% | |
# Time (mean ± σ): 32.225 ms ± 4.875 ms ┊ GC (mean ± σ): 5.79% ± 7.10% | |
# █▄ ▂▅ ▂ ▁ ▂ ▁ ▁ ▁ | |
# ▅▁▁▅▁▁▁▁▁▁▁██▆██▅▁▁▁▁▁▁▁▅▆▅▁█▆▇█▁▆██▆▇▅▁█▆▁█▆▁▁▁█▁▁▁▁▁▁▁▁▁█ ▅ | |
# 24.2 ms Histogram: log(frequency) by time 44.4 ms < | |
# Memory estimate: 30.55 MiB, allocs estimate: 15. | |
# GPU | |
# BenchmarkTools.Trial: 1657 samples with 1 evaluation. | |
# Range (min … max): 1.876 ms … 327.119 ms ┊ GC (min … max): 0.00% … 2.06% | |
# Time (median): 2.026 ms ┊ GC (median): 0.00% | |
# Time (mean ± σ): 3.012 ms ± 17.778 ms ┊ GC (mean ± σ): 0.67% ± 0.11% | |
# ▁▁▂▆▅▅▄▆▆▇▄▇▆▄▆▇█▅▅▂ | |
# ▃▆█████████████████████▅▄▃▃▃▃▃▃▂▃▃▂▁▂▁▂▂▁▁▁▂▂▁▂▁▁▁▁▁▁▂▁▂▂▁▂ ▄ | |
# 1.88 ms Histogram: frequency by time 2.54 ms < | |
# Memory estimate: 22.97 KiB, allocs estimate: 415. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment