Created
September 26, 2018 15:31
-
-
Save haampie/d6b7f3cdd9b10e5e07d802b2f2b68b56 to your computer and use it in GitHub Desktop.
fusing.jl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using LinearAlgebra | |
using LinearAlgebra: givensAlgorithm | |
using Test | |
using BenchmarkTools | |
import LinearAlgebra: rmul! | |
abstract type SmallRotation end | |
struct Rotation2{Tc,Ts} <: SmallRotation | |
c::Tc | |
s::Ts | |
i::Int | |
end | |
""" | |
A set of fused Givens rotations | |
G3 G4 | |
G1 G2 | |
""" | |
struct Fused2x2{Tc,Ts} <: SmallRotation | |
c1::Tc | |
s1::Ts | |
c2::Tc | |
s2::Ts | |
c3::Tc | |
s3::Ts | |
c4::Tc | |
s4::Ts | |
i::Int | |
end | |
@inline rmul!(A::AbstractMatrix, G::SmallRotation) = rmul!(A, G, axes(A, 1)) | |
@inline function rmul!(A::AbstractMatrix, G::Rotation2, range) | |
@inbounds for j = range | |
a₁ = A[j, G.i + 0] | |
a₂ = A[j, G.i + 1] | |
a₁′ = muladd( a₁, G.c, a₂ * G.s') | |
a₂′ = muladd(-a₁, G.s, a₂ * G.c ) | |
A[j, G.i+0] = a₁′ | |
A[j, G.i+1] = a₂′ | |
end | |
end | |
@inline function rmul!(A::AbstractMatrix, G::Fused2x2, range) | |
@inbounds for j = range | |
a0 = A[j, G.i + 0] | |
a1 = A[j, G.i + 1] | |
a2 = A[j, G.i + 2] | |
a3 = A[j, G.i + 3] | |
# Apply rotation 1 | |
a1′ = muladd( a1, G.c1, a2 * G.s1') | |
a2′ = muladd(-a1, G.s1, a2 * G.c1 ) | |
# Apply rotation 2 | |
a2′′ = muladd( a2′, G.c2, a3 * G.s2') | |
a3′′ = muladd(-a2′, G.s2, a3 * G.c2 ) | |
# Apply rotation 3 | |
a0′′′ = muladd( a0, G.c3, a1′ * G.s3') | |
a1′′′ = muladd(-a0, G.s3, a1′ * G.c3 ) | |
# Apply rotation 4 | |
a1′′′′ = muladd( a1′′′, G.c4, a2′′ * G.s4') | |
a2′′′′ = muladd(-a1′′′, G.s4, a2′′ * G.c4 ) | |
A[j, G.i + 0] = a0′′′ | |
A[j, G.i + 1] = a1′′′′ | |
A[j, G.i + 2] = a2′′′′ | |
A[j, G.i + 3] = a3′′ | |
end | |
A | |
end | |
function generate_rotations(cols, ks) | |
# Generate some random rotations inbetween the columns | |
rotations = Matrix{Tuple{Float64,Float64}}(undef, ks, cols - 1) | |
for n = 1 : cols - 1, k = 1 : ks | |
c, s, = givensAlgorithm(rand(), rand()) | |
rotations[k, n] = (c, s) | |
end | |
rotations | |
end | |
function example(cols, ks, rows = 1000) | |
rotations = generate_rotations(cols, ks) | |
# Some matrix to apply them to | |
Q = rand(rows, cols) | |
Q1 = copy(Q) | |
Q2 = copy(Q) | |
Q3 = copy(Q) | |
# Apply the rotations to Q1 and Q2 | |
trivial_order!(Q1, rotations) | |
fused_trivial_order!(Q2, rotations) | |
fused_wave!(Q3, rotations) | |
@test Q1 ≈ Q2 ≈ Q3 # exact equality if we don't use FMA! | |
@test Q1 != Q # lets see if anything happened | |
end | |
function bench(ns = 100:100:2000, k = 16) | |
# Find the number of GFLOPS / second for the trivial vs fused method | |
GFLOPS_a, GFLOPS_b, GFLOPS_c = Float64[], Float64[], Float64[] | |
for n = ns | |
rotations = generate_rotations(n, k) | |
# 1 rotation = 6 flops; n - 1 rotations per layer; k layers; applied to n rows | |
flops = 6k * n * (n - 1) | |
a = @belapsed trivial_order!(Q, $rotations) setup = (Q = rand($n, $n)) | |
b = @belapsed fused_trivial_order!(Q, $rotations) setup = (Q = rand($n, $n)) | |
c = @belapsed fused_wave!(Q, $rotations) setup = (Q = rand($n, $n)) | |
gflops_per_sec_a = flops / a / 1e9 | |
gflops_per_sec_b = flops / b / 1e9 | |
gflops_per_sec_c = flops / c / 1e9 | |
push!(GFLOPS_a, gflops_per_sec_a) | |
push!(GFLOPS_b, gflops_per_sec_b) | |
push!(GFLOPS_c, gflops_per_sec_c) | |
@info "Size ($n × $n) with $k layers" gflops_per_sec_a gflops_per_sec_b gflops_per_sec_c | |
end | |
return GFLOPS_a, GFLOPS_b, GFLOPS_c | |
end | |
function bench_k() | |
# simultaneous bulges | |
k = 128 | |
# matrix order | |
m = 3000 | |
# move them so many steps forward | |
n = 3k | |
rotations = generate_rotations(n, k) | |
# 1 rotation = 6 flops; n - 1 rotations per layer; k layers; applied to m rows | |
flops = 6k * m * (n - 1) | |
a = @belapsed trivial_order!(Q, $rotations) setup = (Q = rand($m, $n)) | |
b = @belapsed fused_trivial_order!(Q, $rotations) setup = (Q = rand($m, $n)) | |
c = @belapsed fused_wave!(Q, $rotations) setup = (Q = rand($m, $n)) | |
gflops_per_sec_a = flops / a / 1e9 | |
gflops_per_sec_b = flops / b / 1e9 | |
gflops_per_sec_c = flops / c / 1e9 | |
return gflops_per_sec_a, gflops_per_sec_b, gflops_per_sec_c | |
end | |
""" | |
Apply all the rotations to Q in the trivial order | |
""" | |
function trivial_order!(Q, rotations) | |
@inbounds for k = axes(rotations, 1) | |
for n = axes(rotations, 2) | |
c, s = rotations[k, n] | |
G = Rotation2(c, s, n) | |
rmul!(Q, G) | |
end | |
end | |
Q | |
end | |
""" | |
Just fuse the rotations but don't do anything fancy | |
""" | |
function fused_trivial_order!(Q, rotations) | |
numrot = size(rotations, 2) | |
layers = size(rotations, 1) | |
@inbounds for k = 1 : 2 : layers | |
# First the left-over rotation | |
c, s = rotations[k, 1] | |
rmul!(Q, Rotation2(c, s, 1)) | |
# Then the fused rotations | |
for n = 1 : 2 : numrot - 2 | |
c1, s1 = rotations[k, n + 1] | |
c2, s2 = rotations[k, n + 2] | |
c3, s3 = rotations[k + 1, n + 0] | |
c4, s4 = rotations[k + 1, n + 1] | |
G = Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, n) | |
rmul!(Q, G) | |
end | |
# Then the last left-over rotation | |
c, s = rotations[k + 1, numrot] | |
rmul!(Q, Rotation2(c, s, numrot)) | |
end | |
Q | |
end | |
""" | |
Fuse the rotations AND apply them in a wave like fashion. The waves look like | |
this | |
|-----pipeline----|---shutdown--| | |
5 5 6 6 7 7 8 8 9 9 0 0 1 1 2 2 3 | |
4 5 5 6 6 7 7 8 8 9 9 0 0 1 1 2 2 | |
4 4 5 5 6 6 7 7 8 8 9 9 0 0 1 1 2 | |
3 4 4 5 5 6 6 7 7 8 8 9 9 0 0 1 1 | |
3 3 4 4 5 5 6 6 7 7 8 8 9 9 0 0 1 | |
2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 0 0 | |
2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 0 | |
1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 | |
|---startup---|-----pipeline----| | |
""" | |
function fused_wave!(Q, rotations) | |
# number of columns | |
n = size(Q, 2) | |
# number of rows | |
m = size(Q, 1) | |
# number of layers of rotations | |
k = size(rotations, 1) | |
# startup | |
@inbounds for j = 1:2:k-1 | |
# fuse 4 givens rotations | |
for i = 1:2:j-2 | |
l = j - i + 1 | |
# Load and apply rotation in (i, l) | |
c1, s1 = rotations[i , l - 1] | |
c2, s2 = rotations[i , l - 0] | |
c3, s3 = rotations[i + 1, l - 2] | |
c4, s4 = rotations[i + 1, l - 1] | |
rmul!(Q, Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, l - 2)) | |
end | |
# take the last hanging givens rotation | |
c, s = rotations[j, 1] | |
rmul!(Q, Rotation2(c, s, 1)) | |
end | |
# pipeline | |
@inbounds for j = k + 1 : 2 : n - 1 | |
# fuse 4 givens rotations | |
for i = 1:2:k-1 | |
l = j - i + 1 | |
c1, s1 = rotations[i , l - 1] | |
c2, s2 = rotations[i , l - 0] | |
c3, s3 = rotations[i + 1, l - 2] | |
c4, s4 = rotations[i + 1, l - 1] | |
rmul!(Q, Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, l - 2)) | |
end | |
end | |
# shutdown | |
@inbounds for j = 1:2:k-1 | |
# hanging rotation | |
c, s = rotations[j + 1, n - 1] | |
rmul!(Q, Rotation2(c, s, n - 1)) | |
# fuse 4 givens rotations | |
for i = j + 2 : 2 : k - 1 | |
l = n + j - i + 1 | |
c1, s1 = rotations[i , l - 1] | |
c2, s2 = rotations[i , l - 0] | |
c3, s3 = rotations[i + 1, l - 2] | |
c4, s4 = rotations[i + 1, l - 1] | |
rmul!(Q, Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, l - 2)) | |
end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment