Skip to content

Instantly share code, notes, and snippets.

@haampie
Created October 6, 2018 17:35
Show Gist options
  • Save haampie/e0d7051bc67aeda03661ef429cf1f4a0 to your computer and use it in GitHub Desktop.
Save haampie/e0d7051bc67aeda03661ef429cf1f4a0 to your computer and use it in GitHub Desktop.
spmvbench.jl
"""
This is the implementation currently in SparseArrays
"""
function simple_mul!(y, A, x)
@inbounds for i = Base.OneTo(A.n)
xi = x[i]
for j = A.colptr[i] : A.colptr[i + 1] - 1
y[A.rowval[j]] += A.nzval[j] * xi
end
end
return y
end
"""
Same implementation as in base, but now with FMA -- barely any difference
"""
function muladd_mul!(y, A, x)
@inbounds for i = Base.OneTo(A.n)
xi = x[i]
for j = A.colptr[i] : A.colptr[i + 1] - 1
y[A.rowval[j]] = muladd(A.nzval[j], xi, y[A.rowval[j]])
end
end
return y
end
"""
This version does not construct a range for the inner loop, which can be a slight
optimization when the number of nonzeros per column is very small
"""
function optimizedinnerloop_mul!(y, A, x)
j = 1
@inbounds for i = Base.OneTo(A.n)
xi = x[i]
last_idx = A.colptr[i+1]
while j != last_idx
row = A.rowval[j]
y[row] = muladd(A.nzval[j], xi, y[row])
j += 1
end
end
return y
end
"""
This version unrolls the inner loop to chunks of size N. This reduces the work of
bookkeeping a bit and should give an edge when the number of nonzeros per column is
larger.
"""
@generated unrolled_mul!(y, A, x, sz::Val{N}) where {N} = unrolled_mul_impl!(N)
function unrolled_mul_impl!(N::Int)
exprs = ntuple(i -> :(y[A.rowval[j+$(i-1)]] = muladd(A.nzval[j+$(i-1)], xi, y[A.rowval[j+$(i-1)]])), Val(N))
return quote
j = 1
@inbounds for i = Base.OneTo(A.n)
xi = x[i]
last_idx = A.colptr[i+1]
# N simultaneously
while j + $N ≤ last_idx
$(exprs...)
j += $N
end
# Fix the remainder
while j != last_idx
y[A.rowval[j]] = muladd(A.nzval[j], xi, y[A.rowval[j]])
j += 1
end
end
return y
end
end
## Testing
using Test, SparseArrays, LinearAlgebra
function test_things()
A = sprand(100, 100, 0.3)
x = rand(100)
y1 = rand(100)
y2 = copy(y1)
y3 = copy(y1)
y4 = copy(y1)
y5 = copy(y1)
simple_mul!(y1, A, x)
muladd_mul!(y2, A, x)
optimizedinnerloop_mul!(y3, A, x)
unrolled_mul!(y4, A, x, Val{2}())
unrolled_mul!(y5, A, x, Val{4}())
@test y1 ≈ y2 ≈ y3 ≈ y4 ≈ y5
end
## benching
using BenchmarkTools
function bench_increasing_bandwidth(n = 1000, ks = 0:10, ::Type{T} = Float64) where {T}
results = ntuple(i -> Float64[], 4)
for k = ks
A = spdiagm((i => rand(T, n - abs(i)) for i = -k:k)...)
n = size(A, 1)
x = rand(T, n)
y = rand(T, n)
flop = 2nnz(A)
@info "Benchmarking" k flop
push!(results[1], flop / @belapsed(simple_mul!($y, $A, $x)) / 1e9)
push!(results[2], flop / @belapsed(optimizedinnerloop_mul!($y, $A, $x)) / 1e9)
push!(results[3], flop / @belapsed(unrolled_mul!($y, $A, $x, $(Val{2}()))) / 1e9)
push!(results[4], flop / @belapsed(unrolled_mul!($y, $A, $x, $(Val{4}()))) / 1e9)
@show last.(results)
end
results
end
function bench_increasing_density(n = 1000, ks = 0:10, ::Type{T} = Float64) where {T}
results = ntuple(i -> Float64[], 4)
for k = ks
A = sprand(n, n, (2k+1) / n)
n = size(A, 1)
x = rand(T, n)
y = rand(T, n)
flop = 2nnz(A)
@info "Benchmarking" k flop
push!(results[1], flop / @belapsed(simple_mul!($y, $A, $x)) / 1e9)
push!(results[2], flop / @belapsed(optimizedinnerloop_mul!($y, $A, $x)) / 1e9)
push!(results[3], flop / @belapsed(unrolled_mul!($y, $A, $x, $(Val{2}()))) / 1e9)
push!(results[4], flop / @belapsed(unrolled_mul!($y, $A, $x, $(Val{4}()))) / 1e9)
@show last.(results)
end
results
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment