haampie · October 6, 2018 17:35
diff --git a/spmvbench.jl b/spmvbench.jl
 """
 This is the implementation currently in SparseArrays
 """
 function simple_mul!(y, A, x)
    @inbounds for i = Base.OneTo(A.n)
        xi = x[i]

        for j = A.colptr[i] : A.colptr[i + 1] - 1
            y[A.rowval[j]] += A.nzval[j] * xi
        end
    end

    return y
 end

 """
 Same implementation as in base, but now with FMA -- barely any difference
 """
 function muladd_mul!(y, A, x)
    @inbounds for i = Base.OneTo(A.n)
        xi = x[i]

        for j = A.colptr[i] : A.colptr[i + 1] - 1
            y[A.rowval[j]] = muladd(A.nzval[j], xi, y[A.rowval[j]])
        end
    end

    return y
 end

 """
 This version does not construct a range for the inner loop, which can be a slight
 optimization when the number of nonzeros per column is very small
 """
 function optimizedinnerloop_mul!(y, A, x)
    j = 1

    @inbounds for i = Base.OneTo(A.n)
        xi = x[i]
        last_idx = A.colptr[i+1]
        while j != last_idx
            row = A.rowval[j]
            y[row] = muladd(A.nzval[j], xi, y[row])
            j += 1
        end
    end

    return y
 end

 """
 This version unrolls the inner loop to chunks of size N. This reduces the work of 
 bookkeeping a bit and should give an edge when the number of nonzeros per column is
 larger.
 """
 @generated unrolled_mul!(y, A, x, sz::Val{N}) where {N} = unrolled_mul_impl!(N)

 function unrolled_mul_impl!(N::Int)
    exprs = ntuple(i -> :(y[A.rowval[j+$(i-1)]] = muladd(A.nzval[j+$(i-1)], xi, y[A.rowval[j+$(i-1)]])), Val(N))
    return quote
        j = 1

        @inbounds for i = Base.OneTo(A.n)
            xi = x[i]
            last_idx = A.colptr[i+1]

            # N simultaneously
            while j + $N ≤ last_idx
                $(exprs...)
                j += $N
            end

            # Fix the remainder
            while j != last_idx
                y[A.rowval[j]] = muladd(A.nzval[j], xi, y[A.rowval[j]])
                j += 1
            end
        end

        return y
    end
 end

 ## Testing

 using Test, SparseArrays, LinearAlgebra

 function test_things()
    A = sprand(100, 100, 0.3)
    x = rand(100)
    y1 = rand(100)
    y2 = copy(y1) 
    y3 = copy(y1)
    y4 = copy(y1)
    y5 = copy(y1)

    simple_mul!(y1, A, x)
    muladd_mul!(y2, A, x)
    optimizedinnerloop_mul!(y3, A, x)
    unrolled_mul!(y4, A, x, Val{2}())
    unrolled_mul!(y5, A, x, Val{4}())

    @test y1 ≈ y2 ≈ y3 ≈ y4 ≈ y5
 end

 ## benching

 using BenchmarkTools

 function bench_increasing_bandwidth(n = 1000, ks = 0:10, ::Type{T} = Float64) where {T}
    results = ntuple(i -> Float64[], 4)

    for k = ks
        A = spdiagm((i => rand(T, n - abs(i)) for i = -k:k)...)
        n = size(A, 1)
        x = rand(T, n)
        y = rand(T, n)

        flop = 2nnz(A)

        @info "Benchmarking" k flop

        push!(results[1], flop / @belapsed(simple_mul!($y, $A, $x)) / 1e9)
        push!(results[2], flop / @belapsed(optimizedinnerloop_mul!($y, $A, $x)) / 1e9)
        push!(results[3], flop / @belapsed(unrolled_mul!($y, $A, $x, $(Val{2}()))) / 1e9)
        push!(results[4], flop / @belapsed(unrolled_mul!($y, $A, $x, $(Val{4}()))) / 1e9)

        @show last.(results)
    end

    results
 end

 function bench_increasing_density(n = 1000, ks = 0:10, ::Type{T} = Float64) where {T}
    results = ntuple(i -> Float64[], 4)

    for k = ks
        A = sprand(n, n, (2k+1) / n)
        n = size(A, 1)
        x = rand(T, n)
        y = rand(T, n)

        flop = 2nnz(A)

        @info "Benchmarking" k flop

        push!(results[1], flop / @belapsed(simple_mul!($y, $A, $x)) / 1e9)
        push!(results[2], flop / @belapsed(optimizedinnerloop_mul!($y, $A, $x)) / 1e9)
        push!(results[3], flop / @belapsed(unrolled_mul!($y, $A, $x, $(Val{2}()))) / 1e9)
        push!(results[4], flop / @belapsed(unrolled_mul!($y, $A, $x, $(Val{4}()))) / 1e9)

        @show last.(results)
    end

    results
 end
	"""
	This is the implementation currently in SparseArrays
	"""
	function simple_mul!(y, A, x)
	@inbounds for i = Base.OneTo(A.n)
	xi = x[i]

	for j = A.colptr[i] : A.colptr[i + 1] - 1
	y[A.rowval[j]] += A.nzval[j] * xi
	end
	end

	return y
	end

	"""
	Same implementation as in base, but now with FMA -- barely any difference
	"""
	function muladd_mul!(y, A, x)
	@inbounds for i = Base.OneTo(A.n)
	xi = x[i]

	for j = A.colptr[i] : A.colptr[i + 1] - 1
	y[A.rowval[j]] = muladd(A.nzval[j], xi, y[A.rowval[j]])
	end
	end

	return y
	end

	"""
	This version does not construct a range for the inner loop, which can be a slight
	optimization when the number of nonzeros per column is very small
	"""
	function optimizedinnerloop_mul!(y, A, x)
	j = 1

	@inbounds for i = Base.OneTo(A.n)
	xi = x[i]
	last_idx = A.colptr[i+1]
	while j != last_idx
	row = A.rowval[j]
	y[row] = muladd(A.nzval[j], xi, y[row])
	j += 1
	end
	end

	return y
	end

	"""
	This version unrolls the inner loop to chunks of size N. This reduces the work of
	bookkeeping a bit and should give an edge when the number of nonzeros per column is
	larger.
	"""
	@generated unrolled_mul!(y, A, x, sz::Val{N}) where {N} = unrolled_mul_impl!(N)

	function unrolled_mul_impl!(N::Int)
	exprs = ntuple(i -> :(y[A.rowval[j+$(i-1)]] = muladd(A.nzval[j+$(i-1)], xi, y[A.rowval[j+$(i-1)]])), Val(N))
	return quote
	j = 1

	@inbounds for i = Base.OneTo(A.n)
	xi = x[i]
	last_idx = A.colptr[i+1]

	# N simultaneously
	while j + $N ≤ last_idx
	$(exprs...)
	j += $N
	end

	# Fix the remainder
	while j != last_idx
	y[A.rowval[j]] = muladd(A.nzval[j], xi, y[A.rowval[j]])
	j += 1
	end
	end

	return y
	end
	end

	## Testing

	using Test, SparseArrays, LinearAlgebra

	function test_things()
	A = sprand(100, 100, 0.3)
	x = rand(100)
	y1 = rand(100)
	y2 = copy(y1)
	y3 = copy(y1)
	y4 = copy(y1)
	y5 = copy(y1)

	simple_mul!(y1, A, x)
	muladd_mul!(y2, A, x)
	optimizedinnerloop_mul!(y3, A, x)
	unrolled_mul!(y4, A, x, Val{2}())
	unrolled_mul!(y5, A, x, Val{4}())

	@test y1 ≈ y2 ≈ y3 ≈ y4 ≈ y5
	end

	## benching

	using BenchmarkTools

	function bench_increasing_bandwidth(n = 1000, ks = 0:10, ::Type{T} = Float64) where {T}
	results = ntuple(i -> Float64[], 4)

	for k = ks
	A = spdiagm((i => rand(T, n - abs(i)) for i = -k:k)...)
	n = size(A, 1)
	x = rand(T, n)
	y = rand(T, n)

	flop = 2nnz(A)

	@info "Benchmarking" k flop

	push!(results[1], flop / @belapsed(simple_mul!($y, $A, $x)) / 1e9)
	push!(results[2], flop / @belapsed(optimizedinnerloop_mul!($y, $A, $x)) / 1e9)
	push!(results[3], flop / @belapsed(unrolled_mul!($y, $A, $x, $(Val{2}()))) / 1e9)
	push!(results[4], flop / @belapsed(unrolled_mul!($y, $A, $x, $(Val{4}()))) / 1e9)

	@show last.(results)
	end

	results
	end

	function bench_increasing_density(n = 1000, ks = 0:10, ::Type{T} = Float64) where {T}
	results = ntuple(i -> Float64[], 4)

	for k = ks
	A = sprand(n, n, (2k+1) / n)
	n = size(A, 1)
	x = rand(T, n)
	y = rand(T, n)

	flop = 2nnz(A)

	@info "Benchmarking" k flop

	push!(results[1], flop / @belapsed(simple_mul!($y, $A, $x)) / 1e9)
	push!(results[2], flop / @belapsed(optimizedinnerloop_mul!($y, $A, $x)) / 1e9)
	push!(results[3], flop / @belapsed(unrolled_mul!($y, $A, $x, $(Val{2}()))) / 1e9)
	push!(results[4], flop / @belapsed(unrolled_mul!($y, $A, $x, $(Val{4}()))) / 1e9)

	@show last.(results)
	end

	results
	end