Skip to content

Instantly share code, notes, and snippets.

@AlfTetzlaff
Last active December 15, 2020 11:15
Show Gist options
  • Save AlfTetzlaff/880ad7bf4e1486d8a051580a6262491c to your computer and use it in GitHub Desktop.
Save AlfTetzlaff/880ad7bf4e1486d8a051580a6262491c to your computer and use it in GitHub Desktop.
using CUDAnative
using CuArrays
using StaticArrays
using BenchmarkTools
using Makie
using Test
@inline function lennard_jones(dr, ϵ::T, σ::T) where T
invr2 = inv(sum(abs2, dr) + T(1E-7))
six_term = (σ ^ 2 * invr2) ^ 3
f = min(((24 * ϵ) * invr2) * (2 * six_term ^ 2 - six_term), T(100))
return f
end
@inline function pairwise_acceleration!(a, coords, i, j, f::Function)
dr = coords[j] - coords[i]
acc = f(dr) * dr
a[i] -= acc
a[j] += acc
return nothing
end
function accelerations(coords; ϵ=0.2, σ=1.)
N = length(coords)
a = zero(coords)
lj(dr) = lennard_jones(dr, ϵ, σ)
@inbounds for i in 1:N-1
for j in i+1:N
pairwise_acceleration!(a, coords, i, j, lj)
end
end
return a
end
function velocity_verlet(du0, u0, steps, dt, f::Function)
u = copy(u0)
du = copy(du0)
u_tmp = similar(u)
du_tmp = similar(du)
for _ in 1:steps
a = f(u)
u_tmp .= u + du*dt + 0.5a*dt^2
du .= du + 0.5(a + f(u_tmp))*dt
u .= u_tmp
end
return du, u
end
function calculate_forces!(r::CuDeviceVector{T}, a::CuDeviceVector{T}) where {T}
N = length(r)
gtid = (blockIdx().x - 1) * blockDim().x + threadIdx().x # global thread id
shared = @cuStaticSharedMem(T, 128)
tile = 0
pos = r[gtid]
acc = zero(T)
for i in 1:blockDim().x:N
idx = tile * blockDim().x + threadIdx().x
shared[threadIdx().x] = r[idx]
sync_threads()
@inbounds for j in 1:blockDim().x
dr = shared[j]-pos
acc -= lennard_jones(dr, 2f0, 1f0)*dr
end
sync_threads()
tile += 1
end
a[gtid] = acc
return nothing
end
function accelerations(coords::CuVector; ϵ=0.2f0, σ=1f0, nthreads=128)
N = length(coords)
if mod(N, nthreads) != 0
raise ArgumentError("Number of particles has to be a multiple of the number of threads.")
end
a_d = CuArray(zeros(SVector{3,Float32}, N)) # not optimal, since copying!
nblocks = N ÷ nthreads
CuArrays.@sync @cuda blocks=nblocks threads=nthreads calculate_forces!(coords, a_d)
return a_d
end
function main(scene, steps)
N = 4096*2*2
r0 = [SVector(50*rand(Float32, 3)...) for _ in 1:N]
r0_d = CuVector(r0)
v0 = zeros(SVector{3,Float32}, N)
v0_d = CuVector(v0)
res = accelerations(r0; σ=1f0, ϵ=2f0)
res_d = Array(accelerations(r0_d))
@test isapprox(res, res_d, rtol=1E-6)
du, u = v0_d, r0_d # Toggle between GPU and CPU
# du, u = v0, r0
img = meshscatter!(scene, Array(u), markersize=0.2)[end]
cameracontrols(scene).rotationspeed[] = 0.01
for t in 1:steps
if !isopen(scene)
break
end
# t0 = time_ns()
du, u = velocity_verlet(du, u, 1, 0.001f0, accelerations)
# r0, v0 = velocity_verlet(v0, r0, 1, 0.001f0, accelerations)
img[1] = Array(u)
# t1 = time_ns()
# println(string(round(1. /(Float64(t1-t0)/1E9), digits=1), " FPS"))
yield()
end
end
scene = Scene()
main(scene, 1E4)
###################################
# A little more sophisticated kernel which is applicable to an arbitrary number of atoms, accepts any force function
# and has a variable shared memory size / number of threads. To be used with the acceleration function at the bottom.
function nbodykernel!(r::CuDeviceVector{T}, a::CuDeviceVector{T}, f::Function, ::Val{TH}) where {T,TH}
N = length(r)
tid = threadIdx().x
gtid = (blockIdx().x - 1) * blockDim().x + tid # global thread id
shared = @cuStaticSharedMem(T, TH)
full_blocks = N ÷ blockDim().x
rest = N % blockDim().x
@inbounds begin
if gtid <= N
pos = r[gtid]
else
pos = zero(T)
end
acc = zero(T)
tile = 0
for i in 1:full_blocks
idx = tile * blockDim().x + tid
shared[tid] = r[idx]
sync_threads()
for j in 1:blockDim().x
dr = shared[j]-pos
acc -= f(dr)*dr
end
sync_threads()
tile += 1
end
if tid <= rest
idx = tile * blockDim().x + tid
shared[tid] = r[idx]
end
sync_threads()
for j in 1:rest
dr = shared[j]-pos
acc -= f(dr)*dr
end
sync_threads()
if gtid <= N
a[gtid] = acc
end
end #inbounds
return nothing
end
function accelerations(coords::CuVector; ϵ=2f0, σ=1f0, nthreads=128)
N = length(coords)
a_d = similar(coords)
@inline g(x) = lennard_jones(x, ϵ, σ)
CuArrays.@sync @cuda blocks=ceil(Int, N/nthreads) threads=nthreads nbodykernel!(coords, a_d, g, Val(nthreads))
return a_d
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment