Last active
September 27, 2018 12:32
-
-
Save haampie/def0a6dbaf3d9ab3980ee8876c3d7be0 to your computer and use it in GitHub Desktop.
fusing_perf.jl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using BenchmarkTools
using LinearAlgebra
using LinearAlgebra: givensAlgorithm
"""
    bench_panel(n = 256) -> (horizontal, vertical, peak)

Benchmark applying 4 'fused' Givens rotations to 4 columns of a matrix `Q`, where `Q`
is an `n × 4` matrix, and return the measured throughput in GFLOP/s:

1. `horizontal`: `Q` is first transposed (`4 × n`), so the access pattern is perfectly
   contiguous;
2. `vertical`: the rotations are applied to `Q` directly (`n × 4`), so the access
   pattern is not contiguous;
3. `peak`: the value reported by `peakflops()`, used here as the GEMM reference.

BLAS is switched to a single thread with `BLAS.set_num_threads(1)` before measuring;
that setting is not restored afterwards.

# Notes
The generated code for the contiguous (horizontal) case does not use AVX operations.
The author's benchmark results were

    (12.02936857562408, 39.85624546661375, 50.41209223009482)

so the non-contiguous variant is 3.3x faster and comes to 79% of GEMM performance.
"""
function bench_panel(n = 256)
    G = random_fused_rotations()

    # Presumably single-threaded so the peak is comparable with the serial kernels.
    BLAS.set_num_threads(1)
    maxflops = peakflops() / 1e9

    flop = 24 * n # 24 flop per row of Q

    # `Q` is created in `setup`, so its allocation is not part of the timed expression.
    t_horiz = @belapsed apply_fused_packed_horizontal!(Q, $G) setup = (Q = rand(4, $n))
    t_vert = @belapsed apply_fused_packed_vertical!(Q, $G) setup = (Q = rand($n, 4))

    return flop / t_horiz / 1e9, flop / t_vert / 1e9, maxflops
end
"""
    Fused2x2{Tc,Ts}

Coefficients of four 2×2 rotations that `kernel` applies together. Rotation `k` is
described by the pair `(ck, sk)`; all `c` fields share the type `Tc` and all `s` fields
share the type `Ts`.
"""
struct Fused2x2{Tc,Ts}
    # Rotation 1
    c1::Tc
    s1::Ts
    # Rotation 2
    c2::Tc
    s2::Ts
    # Rotation 3
    c3::Tc
    s3::Ts
    # Rotation 4
    c4::Tc
    s4::Ts
end
"""
    generate_rotation(T = Float64) -> (c, s)

Return the first two values `(c, s)` produced by `givensAlgorithm` for two random
numbers of type `T`. The third value returned by `givensAlgorithm` is discarded.
"""
function generate_rotation(::Type{T} = Float64) where {T}
    c, s, _ = givensAlgorithm(rand(T), rand(T))
    return c, s
end
"""
    random_fused_rotations() -> Fused2x2

Build a `Fused2x2` from four rotations, each obtained from a separate call to
`generate_rotation()`.
"""
function random_fused_rotations()
    # Drawn in order 1, 2, 3, 4, matching the field order of `Fused2x2`.
    c1, s1 = generate_rotation()
    c2, s2 = generate_rotation()
    c3, s3 = generate_rotation()
    c4, s4 = generate_rotation()
    return Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4)
end
"""
    kernel(a0, a1, a2, a3, G::Fused2x2)

Apply the four rotations stored in `G` to the values `a0, a1, a2, a3` and return the
four updated values as a tuple, in the same order.

The rotations are applied in this order:
1. `(c1, s1)` to the pair `(a1, a2)`;
2. `(c2, s2)` to the pair `(a2, a3)`;
3. `(c3, s3)` to the pair `(a0, a1)`;
4. `(c4, s4)` to the pair `(a1, a2)`.

Each rotation maps a pair `(x, y)` to `(x * c + y * s', -x * s + y * c)`, evaluated
with `muladd`.
"""
@inline function kernel(a0, a1, a2, a3, G::Fused2x2)
    # Rotation 1: mixes a1 and a2.
    p1 = muladd( a1, G.c1, a2 * G.s1')
    p2 = muladd(-a1, G.s1, a2 * G.c1)

    # Rotation 2: mixes the updated a2 with a3; q3 is the final value of a3.
    q2 = muladd( p2, G.c2, a3 * G.s2')
    q3 = muladd(-p2, G.s2, a3 * G.c2)

    # Rotation 3: mixes a0 with the updated a1; u0 is the final value of a0.
    u0 = muladd( a0, G.c3, p1 * G.s3')
    u1 = muladd(-a0, G.s3, p1 * G.c3)

    # Rotation 4: mixes the twice-updated a1 and a2, giving their final values.
    v1 = muladd( u1, G.c4, q2 * G.s4')
    v2 = muladd(-u1, G.s4, q2 * G.c4)

    return u0, v1, v2, q3
end
"""
    apply_fused_packed_horizontal!(Q, G)

Apply the fused rotations `G` in place to every column of `Q`: rows 1 to 4 of each
column are passed through `kernel` and overwritten with the result. Returns `nothing`.

# Throws
- `DimensionMismatch`: if the first axis of `Q` is not `1:4`.
"""
function apply_fused_packed_horizontal!(Q, G)
    # Rows 1:4 are indexed under `@inbounds` below, so validate them up front.
    axes(Q, 1) == 1:4 || throw(
        DimensionMismatch("Q must have exactly 4 rows indexed 1:4, got $(axes(Q, 1))"),
    )
    @inbounds for j in axes(Q, 2)
        r0, r1, r2, r3 = kernel(Q[1, j], Q[2, j], Q[3, j], Q[4, j], G)
        Q[1, j] = r0
        Q[2, j] = r1
        Q[3, j] = r2
        Q[4, j] = r3
    end
    return nothing
end
"""
    apply_fused_packed_vertical!(Q, G)

Apply the fused rotations `G` in place to every row of `Q`: columns 1 to 4 of each row
are passed through `kernel` and overwritten with the result. Returns `nothing`.

# Throws
- `DimensionMismatch`: if the second axis of `Q` is not `1:4`.
"""
function apply_fused_packed_vertical!(Q, G)
    # Columns 1:4 are indexed under `@inbounds` below, so validate them up front.
    axes(Q, 2) == 1:4 || throw(
        DimensionMismatch("Q must have exactly 4 columns indexed 1:4, got $(axes(Q, 2))"),
    )
    @inbounds for i in axes(Q, 1)
        r0, r1, r2, r3 = kernel(Q[i, 1], Q[i, 2], Q[i, 3], Q[i, 4], G)
        Q[i, 1] = r0
        Q[i, 2] = r1
        Q[i, 3] = r2
        Q[i, 4] = r3
    end
    return nothing
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment