Last active
December 15, 2020 11:15
-
-
Save AlfTetzlaff/880ad7bf4e1486d8a051580a6262491c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using CUDAnative | |
using CuArrays | |
using StaticArrays | |
using BenchmarkTools | |
using Makie | |
using Test | |
@inline function lennard_jones(dr, ϵ::T, σ::T) where T | |
invr2 = inv(sum(abs2, dr) + T(1E-7)) | |
six_term = (σ ^ 2 * invr2) ^ 3 | |
f = min(((24 * ϵ) * invr2) * (2 * six_term ^ 2 - six_term), T(100)) | |
return f | |
end | |
@inline function pairwise_acceleration!(a, coords, i, j, f::Function) | |
dr = coords[j] - coords[i] | |
acc = f(dr) * dr | |
a[i] -= acc | |
a[j] += acc | |
return nothing | |
end | |
function accelerations(coords; ϵ=0.2, σ=1.) | |
N = length(coords) | |
a = zero(coords) | |
lj(dr) = lennard_jones(dr, ϵ, σ) | |
@inbounds for i in 1:N-1 | |
for j in i+1:N | |
pairwise_acceleration!(a, coords, i, j, lj) | |
end | |
end | |
return a | |
end | |
function velocity_verlet(du0, u0, steps, dt, f::Function) | |
u = copy(u0) | |
du = copy(du0) | |
u_tmp = similar(u) | |
du_tmp = similar(du) | |
for _ in 1:steps | |
a = f(u) | |
u_tmp .= u + du*dt + 0.5a*dt^2 | |
du .= du + 0.5(a + f(u_tmp))*dt | |
u .= u_tmp | |
end | |
return du, u | |
end | |
function calculate_forces!(r::CuDeviceVector{T}, a::CuDeviceVector{T}) where {T} | |
N = length(r) | |
gtid = (blockIdx().x - 1) * blockDim().x + threadIdx().x # global thread id | |
shared = @cuStaticSharedMem(T, 128) | |
tile = 0 | |
pos = r[gtid] | |
acc = zero(T) | |
for i in 1:blockDim().x:N | |
idx = tile * blockDim().x + threadIdx().x | |
shared[threadIdx().x] = r[idx] | |
sync_threads() | |
@inbounds for j in 1:blockDim().x | |
dr = shared[j]-pos | |
acc -= lennard_jones(dr, 2f0, 1f0)*dr | |
end | |
sync_threads() | |
tile += 1 | |
end | |
a[gtid] = acc | |
return nothing | |
end | |
function accelerations(coords::CuVector; ϵ=0.2f0, σ=1f0, nthreads=128) | |
N = length(coords) | |
if mod(N, nthreads) != 0 | |
raise ArgumentError("Number of particles has to be a multiple of the number of threads.") | |
end | |
a_d = CuArray(zeros(SVector{3,Float32}, N)) # not optimal, since copying! | |
nblocks = N ÷ nthreads | |
CuArrays.@sync @cuda blocks=nblocks threads=nthreads calculate_forces!(coords, a_d) | |
return a_d | |
end | |
function main(scene, steps) | |
N = 4096*2*2 | |
r0 = [SVector(50*rand(Float32, 3)...) for _ in 1:N] | |
r0_d = CuVector(r0) | |
v0 = zeros(SVector{3,Float32}, N) | |
v0_d = CuVector(v0) | |
res = accelerations(r0; σ=1f0, ϵ=2f0) | |
res_d = Array(accelerations(r0_d)) | |
@test isapprox(res, res_d, rtol=1E-6) | |
du, u = v0_d, r0_d # Toggle between GPU and CPU | |
# du, u = v0, r0 | |
img = meshscatter!(scene, Array(u), markersize=0.2)[end] | |
cameracontrols(scene).rotationspeed[] = 0.01 | |
for t in 1:steps | |
if !isopen(scene) | |
break | |
end | |
# t0 = time_ns() | |
du, u = velocity_verlet(du, u, 1, 0.001f0, accelerations) | |
# r0, v0 = velocity_verlet(v0, r0, 1, 0.001f0, accelerations) | |
img[1] = Array(u) | |
# t1 = time_ns() | |
# println(string(round(1. /(Float64(t1-t0)/1E9), digits=1), " FPS")) | |
yield() | |
end | |
end | |
scene = Scene() | |
main(scene, 1E4) | |
################################### | |
# A little more sophisticated kernel which is applicable to an arbitrary number of atoms, accepts any force function | |
# and has a variable shared memory size / number of threads. To be used with the acceleration function at the bottom. | |
function nbodykernel!(r::CuDeviceVector{T}, a::CuDeviceVector{T}, f::Function, ::Val{TH}) where {T,TH} | |
N = length(r) | |
tid = threadIdx().x | |
gtid = (blockIdx().x - 1) * blockDim().x + tid # global thread id | |
shared = @cuStaticSharedMem(T, TH) | |
full_blocks = N ÷ blockDim().x | |
rest = N % blockDim().x | |
@inbounds begin | |
if gtid <= N | |
pos = r[gtid] | |
else | |
pos = zero(T) | |
end | |
acc = zero(T) | |
tile = 0 | |
for i in 1:full_blocks | |
idx = tile * blockDim().x + tid | |
shared[tid] = r[idx] | |
sync_threads() | |
for j in 1:blockDim().x | |
dr = shared[j]-pos | |
acc -= f(dr)*dr | |
end | |
sync_threads() | |
tile += 1 | |
end | |
if tid <= rest | |
idx = tile * blockDim().x + tid | |
shared[tid] = r[idx] | |
end | |
sync_threads() | |
for j in 1:rest | |
dr = shared[j]-pos | |
acc -= f(dr)*dr | |
end | |
sync_threads() | |
if gtid <= N | |
a[gtid] = acc | |
end | |
end #inbounds | |
return nothing | |
end | |
function accelerations(coords::CuVector; ϵ=2f0, σ=1f0, nthreads=128) | |
N = length(coords) | |
a_d = similar(coords) | |
@inline g(x) = lennard_jones(x, ϵ, σ) | |
CuArrays.@sync @cuda blocks=ceil(Int, N/nthreads) threads=nthreads nbodykernel!(coords, a_d, g, Val(nthreads)) | |
return a_d | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment