Last active
February 6, 2022 14:39
-
-
Save willtebbutt/cc268e614d2b73156f5bc24e7408613d to your computer and use it in GitHub Desktop.
GPs on the GPU
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# --- Environment setup -------------------------------------------------------
# Load GP + GPU tooling and construct one shared test problem used by all of
# the benchmark functions below.
using Revise
using AbstractGPs
using BenchmarkTools
using CUDA
using KernelFunctions
using LinearAlgebra
using Random

using AbstractGPs: AbstractGP, FiniteGP, ZeroMean

# Fail loudly if any GPU operation would silently fall back to scalar indexing.
CUDA.allowscalar(false)

# Shared inputs: 1_000 points in 25 dimensions, stored column-wise.
# `x_gpu` holds the same data transferred to the device.
x_cpu = ColVecs(randn(25, 1_000))
x_gpu = ColVecs(cu(x_cpu.X))
# GPU-friendly SEKernel Gram matrix via the identity
#     ‖x - y‖² = ‖x‖² + ‖y‖² - 2⟨x, y⟩,
# which maps the pairwise-distance computation onto a single GEMM plus
# elementwise broadcasts — all operations CUDA executes natively.
function KernelFunctions.kernelmatrix(::SEKernel, x::ColVecs{<:Real, <:CuMatrix{<:Real}})
    X = x.X
    # Per-column squared norms as a 1×N row vector.
    X_ = sum(abs2, X; dims=1)
    # Squared pairwise distances. The Gram-trick subtraction can produce tiny
    # negative entries through floating-point cancellation (especially on the
    # diagonal), which would make exp.(.-D ./ 2) exceed 1 — an invalid SE
    # kernel value. Clamp at zero to restore D >= 0.
    D = max.(X_ .+ X_' .- 2 .* X' * X, zero(eltype(X)))
    return exp.(.-D ./ 2)
end
# Compare CPU vs GPU Gram-matrix construction for the SEKernel: first verify
# the two implementations agree, then time each one.
function benchmark_stuff()
    K_cpu = kernelmatrix(SEKernel(), x_cpu)
    K_gpu = collect(kernelmatrix(SEKernel(), x_gpu))

    # Sanity check: largest elementwise discrepancy between the two paths.
    @show maximum(abs.(K_cpu .- K_gpu))

    for (label, input) in (("CPU", x_cpu), ("GPU", x_gpu))
        println(label)
        display(@benchmark kernelmatrix(SEKernel(), $input))
        println()
    end
end
# Zero mean for GPU inputs: produce the zeros directly on the device (via
# CUDA.zeros) instead of building a host vector and copying it over.
function AbstractGPs._map_meanfunction(
    ::ZeroMean{T}, x::ColVecs{<:Real, <:CuMatrix{<:Real}}
) where {T}
    return CUDA.zeros(T, length(x))
end
# We could easily add a hook for the thing that generates ε into our current code, to make specialisation easier here. | |
# Draw N joint samples from a FiniteGP whose inputs live on the GPU.
# NOTE(review): `rng` is not actually used — ε is drawn from CUDA's global RNG
# so that it is generated on-device; see the hook comment above. Seeding the
# passed-in rng therefore does not make this reproducible — confirm intended.
function AbstractGPs.rand(
    rng::AbstractRNG, f::FiniteGP{<:AbstractGP, <:ColVecs{<:Real, <:CuMatrix}}, N::Int
)
    # Mean and covariance of the finite-dimensional marginal.
    m, C_mat = mean_and_cov(f)
    C = cholesky(AbstractGPs._symmetric(C_mat))

    # Device-resident standard normals in the common eltype of mean and factor.
    T = promote_type(eltype(m), eltype(C))
    ε = CUDA.randn(T, length(m), N)

    # Affine transform: samples = m + Lε, with L = U' the lower Cholesky factor.
    return m .+ C.U' * ε
end
# One sample from a zero-mean SE-kernel GP at `x` on the CPU, with observation
# variance 0.1 and a fixed seed so repeated calls return the same draw.
function sample_gp_cpu(x)
    prior = GP(SEKernel())
    return rand(Xoshiro(1), prior(x, 0.1))
end
# GPU counterpart of `sample_gp_cpu`: Float32 zero mean, and the observation
# noise passed as a per-point variance vector allocated on the device.
function sample_gp_gpu(x)
    prior = GP(ZeroMean{Float32}(), SEKernel())
    noise = 0.1f0 * CUDA.ones(length(x))
    # rand with N = 1 yields an n×1 matrix; flatten so the return shape
    # matches the CPU path's vector.
    return vec(rand(Xoshiro(1), prior(x, noise), 1))
end
# Time a single GP sample draw on CPU vs GPU.
function benchmark_sampling()
    println("CPU")
    display(@benchmark sample_gp_cpu($x_cpu))
    println()

    println("GPU")
    display(@benchmark sample_gp_gpu($x_gpu))
    println()
    return nothing
end
# Looks like CUDA is missing this definition. Shouldn't be a problem to add. | |
# CUDA lacks a `logdet` method for GPU Cholesky factorisations; add one here.
# For C = U'U, logdet(C) = 2 Σᵢ log Uᵢᵢ — only the factor's diagonal is needed.
function LinearAlgebra.logdet(C::Cholesky{<:Real, <:CuMatrix{<:Real}})
    # Bug fix vs the original `2 * sum(diag(log.(C.factors)))`: broadcasting
    # `log` over the full `factors` matrix evaluates it on the off-triangle
    # entries too, which hold leftover input data that can be negative (a
    # DomainError for real log), and it allocates an N×N temporary. Reduce over
    # the diagonal only, which is strictly positive for a successful Cholesky.
    return 2 * sum(log, diag(C.factors))
end
# Log marginal likelihood of observations `y` under a zero-mean SE-kernel GP
# at inputs `x`, CPU path with observation variance 0.1.
function logpdf_gp_cpu(x, y)
    prior = GP(SEKernel())
    return logpdf(prior(x, 0.1), y)
end
# GPU counterpart of `logpdf_gp_cpu`: Float32 zero mean, with the observation
# noise supplied as a per-point variance vector allocated on the device.
function logpdf_gp_gpu(x, y)
    prior = GP(ZeroMean{Float32}(), SEKernel())
    noise = 0.1f0 * CUDA.ones(length(x))
    return logpdf(prior(x, noise), y)
end
# Time GP log-likelihood evaluation on CPU vs GPU, drawing matching
# observations on each device first.
function benchmark_logpdf()
    y_cpu = sample_gp_cpu(x_cpu)
    y_gpu = sample_gp_gpu(x_gpu)

    println("CPU")
    display(@benchmark logpdf_gp_cpu($x_cpu, $y_cpu))
    println()

    # Print the GPU value once before timing it.
    @show logpdf_gp_gpu(x_gpu, y_gpu)
    println("GPU")
    display(@benchmark logpdf_gp_gpu($x_gpu, $y_gpu))
    println()
    return nothing
end
# Let Revise pick up edits to this file during the interactive session.
Revise.track(@__FILE__)
# julia> benchmark_sampling() | |
# CPU | |
# BenchmarkTools.Trial: 157 samples with 1 evaluation. | |
# Range (min … max): 28.447 ms … 41.251 ms ┊ GC (min … max): 0.00% … 0.00% | |
# Time (median): 29.841 ms ┊ GC (median): 0.00% | |
# Time (mean ± σ): 31.895 ms ± 3.375 ms ┊ GC (mean ± σ): 5.69% ± 7.03% | |
# ▄█ █ | |
# ██▄▁▁▁▃█▇▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▅▅▄▅▃▁▃▄▃▁▁▅▁▄▃▁▁▁▃▃▆▃▃▆▃▃▁▃▃▁▁▃ ▃ | |
# 28.4 ms Histogram: frequency by time 38.6 ms < | |
# Memory estimate: 30.56 MiB, allocs estimate: 24. | |
# GPU | |
# BenchmarkTools.Trial: 1963 samples with 1 evaluation. | |
# Range (min … max): 1.673 ms … 326.435 ms ┊ GC (min … max): 0.00% … 2.11% | |
# Time (median): 1.798 ms ┊ GC (median): 0.00% | |
# Time (mean ± σ): 2.630 ms ± 16.350 ms ┊ GC (mean ± σ): 0.65% ± 0.10% | |
# ▂▁▁▃▂▃▄▄▃▃▂▃█▄▇▃▃▅▂▅▆▃▃█▃▄▄▃▁▁▁ | |
# ▃▄▆▇████████████████████████████████▇▇▄▄▃▃▃▃▃▂▃▃▁▂▃▃▂▃▂▂▃▃▂ ▅ | |
# 1.67 ms Histogram: frequency by time 2.02 ms < | |
# Memory estimate: 19.80 KiB, allocs estimate: 356. | |
# julia> benchmark_logpdf() | |
# abs(logpdf_gp_cpu(x_cpu, y_cpu) - logpdf_gp_gpu(x_gpu, y_gpu)) = 0.00013706540744351514 | |
# CPU | |
# BenchmarkTools.Trial: 156 samples with 1 evaluation. | |
# Range (min … max): 24.175 ms … 46.282 ms ┊ GC (min … max): 0.00% … 10.24% | |
# Time (median): 29.421 ms ┊ GC (median): 0.00% | |
# Time (mean ± σ): 32.225 ms ± 4.875 ms ┊ GC (mean ± σ): 5.79% ± 7.10% | |
# █▄ ▂▅ ▂ ▁ ▂ ▁ ▁ ▁ | |
# ▅▁▁▅▁▁▁▁▁▁▁██▆██▅▁▁▁▁▁▁▁▅▆▅▁█▆▇█▁▆██▆▇▅▁█▆▁█▆▁▁▁█▁▁▁▁▁▁▁▁▁█ ▅ | |
# 24.2 ms Histogram: log(frequency) by time 44.4 ms < | |
# Memory estimate: 30.55 MiB, allocs estimate: 15. | |
# GPU | |
# BenchmarkTools.Trial: 1657 samples with 1 evaluation. | |
# Range (min … max): 1.876 ms … 327.119 ms ┊ GC (min … max): 0.00% … 2.06% | |
# Time (median): 2.026 ms ┊ GC (median): 0.00% | |
# Time (mean ± σ): 3.012 ms ± 17.778 ms ┊ GC (mean ± σ): 0.67% ± 0.11% | |
# ▁▁▂▆▅▅▄▆▆▇▄▇▆▄▆▇█▅▅▂ | |
# ▃▆█████████████████████▅▄▃▃▃▃▃▃▂▃▃▂▁▂▁▂▂▁▁▁▂▂▁▂▁▁▁▁▁▁▂▁▂▂▁▂ ▄ | |
# 1.88 ms Histogram: frequency by time 2.54 ms < | |
# Memory estimate: 22.97 KiB, allocs estimate: 415. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment