Skip to content

Instantly share code, notes, and snippets.

@haampie
Created September 26, 2018 15:31
Show Gist options
  • Save haampie/d6b7f3cdd9b10e5e07d802b2f2b68b56 to your computer and use it in GitHub Desktop.
Save haampie/d6b7f3cdd9b10e5e07d802b2f2b68b56 to your computer and use it in GitHub Desktop.
fusing.jl
using LinearAlgebra
using LinearAlgebra: givensAlgorithm
using Test
using BenchmarkTools
import LinearAlgebra: rmul!
abstract type SmallRotation end
struct Rotation2{Tc,Ts} <: SmallRotation
c::Tc
s::Ts
i::Int
end
"""
A set of fused Givens rotations
G3 G4
G1 G2
"""
struct Fused2x2{Tc,Ts} <: SmallRotation
c1::Tc
s1::Ts
c2::Tc
s2::Ts
c3::Tc
s3::Ts
c4::Tc
s4::Ts
i::Int
end
@inline rmul!(A::AbstractMatrix, G::SmallRotation) = rmul!(A, G, axes(A, 1))
@inline function rmul!(A::AbstractMatrix, G::Rotation2, range)
@inbounds for j = range
a₁ = A[j, G.i + 0]
a₂ = A[j, G.i + 1]
a₁′ = muladd( a₁, G.c, a₂ * G.s')
a₂′ = muladd(-a₁, G.s, a₂ * G.c )
A[j, G.i+0] = a₁′
A[j, G.i+1] = a₂′
end
end
@inline function rmul!(A::AbstractMatrix, G::Fused2x2, range)
@inbounds for j = range
a0 = A[j, G.i + 0]
a1 = A[j, G.i + 1]
a2 = A[j, G.i + 2]
a3 = A[j, G.i + 3]
# Apply rotation 1
a1′ = muladd( a1, G.c1, a2 * G.s1')
a2′ = muladd(-a1, G.s1, a2 * G.c1 )
# Apply rotation 2
a2′′ = muladd( a2′, G.c2, a3 * G.s2')
a3′′ = muladd(-a2′, G.s2, a3 * G.c2 )
# Apply rotation 3
a0′′′ = muladd( a0, G.c3, a1′ * G.s3')
a1′′′ = muladd(-a0, G.s3, a1′ * G.c3 )
# Apply rotation 4
a1′′′′ = muladd( a1′′′, G.c4, a2′′ * G.s4')
a2′′′′ = muladd(-a1′′′, G.s4, a2′′ * G.c4 )
A[j, G.i + 0] = a0′′′
A[j, G.i + 1] = a1′′′′
A[j, G.i + 2] = a2′′′′
A[j, G.i + 3] = a3′′
end
A
end
function generate_rotations(cols, ks)
# Generate some random rotations inbetween the columns
rotations = Matrix{Tuple{Float64,Float64}}(undef, ks, cols - 1)
for n = 1 : cols - 1, k = 1 : ks
c, s, = givensAlgorithm(rand(), rand())
rotations[k, n] = (c, s)
end
rotations
end
function example(cols, ks, rows = 1000)
rotations = generate_rotations(cols, ks)
# Some matrix to apply them to
Q = rand(rows, cols)
Q1 = copy(Q)
Q2 = copy(Q)
Q3 = copy(Q)
# Apply the rotations to Q1 and Q2
trivial_order!(Q1, rotations)
fused_trivial_order!(Q2, rotations)
fused_wave!(Q3, rotations)
@test Q1 ≈ Q2 ≈ Q3 # exact equality if we don't use FMA!
@test Q1 != Q # lets see if anything happened
end
function bench(ns = 100:100:2000, k = 16)
# Find the number of GFLOPS / second for the trivial vs fused method
GFLOPS_a, GFLOPS_b, GFLOPS_c = Float64[], Float64[], Float64[]
for n = ns
rotations = generate_rotations(n, k)
# 1 rotation = 6 flops; n - 1 rotations per layer; k layers; applied to n rows
flops = 6k * n * (n - 1)
a = @belapsed trivial_order!(Q, $rotations) setup = (Q = rand($n, $n))
b = @belapsed fused_trivial_order!(Q, $rotations) setup = (Q = rand($n, $n))
c = @belapsed fused_wave!(Q, $rotations) setup = (Q = rand($n, $n))
gflops_per_sec_a = flops / a / 1e9
gflops_per_sec_b = flops / b / 1e9
gflops_per_sec_c = flops / c / 1e9
push!(GFLOPS_a, gflops_per_sec_a)
push!(GFLOPS_b, gflops_per_sec_b)
push!(GFLOPS_c, gflops_per_sec_c)
@info "Size ($n × $n) with $k layers" gflops_per_sec_a gflops_per_sec_b gflops_per_sec_c
end
return GFLOPS_a, GFLOPS_b, GFLOPS_c
end
function bench_k()
# simultaneous bulges
k = 128
# matrix order
m = 3000
# move them so many steps forward
n = 3k
rotations = generate_rotations(n, k)
# 1 rotation = 6 flops; n - 1 rotations per layer; k layers; applied to m rows
flops = 6k * m * (n - 1)
a = @belapsed trivial_order!(Q, $rotations) setup = (Q = rand($m, $n))
b = @belapsed fused_trivial_order!(Q, $rotations) setup = (Q = rand($m, $n))
c = @belapsed fused_wave!(Q, $rotations) setup = (Q = rand($m, $n))
gflops_per_sec_a = flops / a / 1e9
gflops_per_sec_b = flops / b / 1e9
gflops_per_sec_c = flops / c / 1e9
return gflops_per_sec_a, gflops_per_sec_b, gflops_per_sec_c
end
"""
Apply all the rotations to Q in the trivial order
"""
function trivial_order!(Q, rotations)
@inbounds for k = axes(rotations, 1)
for n = axes(rotations, 2)
c, s = rotations[k, n]
G = Rotation2(c, s, n)
rmul!(Q, G)
end
end
Q
end
"""
Just fuse the rotations but don't do anything fancy
"""
function fused_trivial_order!(Q, rotations)
numrot = size(rotations, 2)
layers = size(rotations, 1)
@inbounds for k = 1 : 2 : layers
# First the left-over rotation
c, s = rotations[k, 1]
rmul!(Q, Rotation2(c, s, 1))
# Then the fused rotations
for n = 1 : 2 : numrot - 2
c1, s1 = rotations[k, n + 1]
c2, s2 = rotations[k, n + 2]
c3, s3 = rotations[k + 1, n + 0]
c4, s4 = rotations[k + 1, n + 1]
G = Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, n)
rmul!(Q, G)
end
# Then the last left-over rotation
c, s = rotations[k + 1, numrot]
rmul!(Q, Rotation2(c, s, numrot))
end
Q
end
"""
Fuse the rotations AND apply them in a wave like fashion. The waves look like
this
|-----pipeline----|---shutdown--|
5 5 6 6 7 7 8 8 9 9 0 0 1 1 2 2 3
4 5 5 6 6 7 7 8 8 9 9 0 0 1 1 2 2
4 4 5 5 6 6 7 7 8 8 9 9 0 0 1 1 2
3 4 4 5 5 6 6 7 7 8 8 9 9 0 0 1 1
3 3 4 4 5 5 6 6 7 7 8 8 9 9 0 0 1
2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 0 0
2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 0
1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9
|---startup---|-----pipeline----|
"""
function fused_wave!(Q, rotations)
# number of columns
n = size(Q, 2)
# number of rows
m = size(Q, 1)
# number of layers of rotations
k = size(rotations, 1)
# startup
@inbounds for j = 1:2:k-1
# fuse 4 givens rotations
for i = 1:2:j-2
l = j - i + 1
# Load and apply rotation in (i, l)
c1, s1 = rotations[i , l - 1]
c2, s2 = rotations[i , l - 0]
c3, s3 = rotations[i + 1, l - 2]
c4, s4 = rotations[i + 1, l - 1]
rmul!(Q, Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, l - 2))
end
# take the last hanging givens rotation
c, s = rotations[j, 1]
rmul!(Q, Rotation2(c, s, 1))
end
# pipeline
@inbounds for j = k + 1 : 2 : n - 1
# fuse 4 givens rotations
for i = 1:2:k-1
l = j - i + 1
c1, s1 = rotations[i , l - 1]
c2, s2 = rotations[i , l - 0]
c3, s3 = rotations[i + 1, l - 2]
c4, s4 = rotations[i + 1, l - 1]
rmul!(Q, Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, l - 2))
end
end
# shutdown
@inbounds for j = 1:2:k-1
# hanging rotation
c, s = rotations[j + 1, n - 1]
rmul!(Q, Rotation2(c, s, n - 1))
# fuse 4 givens rotations
for i = j + 2 : 2 : k - 1
l = n + j - i + 1
c1, s1 = rotations[i , l - 1]
c2, s2 = rotations[i , l - 0]
c3, s3 = rotations[i + 1, l - 2]
c4, s4 = rotations[i + 1, l - 1]
rmul!(Q, Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, l - 2))
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment