haampie · September 26, 2018 15:31
diff --git a/fusing.jl b/fusing.jl
 using LinearAlgebra
 using LinearAlgebra: givensAlgorithm
 using Test
 using BenchmarkTools

 import LinearAlgebra: rmul!

 abstract type SmallRotation end

 struct Rotation2{Tc,Ts} <: SmallRotation
    c::Tc
    s::Ts
    i::Int
 end

 """
 A set of fused Givens rotations
 G3 G4
  G1 G2
 """
 struct Fused2x2{Tc,Ts} <: SmallRotation
    c1::Tc
    s1::Ts
    c2::Tc
    s2::Ts
    c3::Tc
    s3::Ts
    c4::Tc
    s4::Ts
    i::Int
 end

 @inline rmul!(A::AbstractMatrix, G::SmallRotation) = rmul!(A, G, axes(A, 1))

 @inline function rmul!(A::AbstractMatrix, G::Rotation2, range)
    @inbounds for j = range
        a₁ = A[j, G.i + 0]
        a₂ = A[j, G.i + 1]

        a₁′ = muladd( a₁, G.c, a₂ * G.s')
        a₂′ = muladd(-a₁, G.s, a₂ * G.c )

        A[j, G.i+0] = a₁′
        A[j, G.i+1] = a₂′
    end
 end

 @inline function rmul!(A::AbstractMatrix, G::Fused2x2, range)
    @inbounds for j = range
        a0 = A[j, G.i + 0]
        a1 = A[j, G.i + 1]
        a2 = A[j, G.i + 2]
        a3 = A[j, G.i + 3]

        # Apply rotation 1
        a1′ = muladd( a1, G.c1, a2 * G.s1')
        a2′ = muladd(-a1, G.s1, a2 * G.c1 )

        # Apply rotation 2
        a2′′ = muladd( a2′, G.c2, a3 * G.s2')
        a3′′ = muladd(-a2′, G.s2, a3 * G.c2 )

        # Apply rotation 3
        a0′′′ = muladd( a0, G.c3, a1′ * G.s3')
        a1′′′ = muladd(-a0, G.s3, a1′ * G.c3 )

        # Apply rotation 4
        a1′′′′ = muladd( a1′′′, G.c4, a2′′ * G.s4')
        a2′′′′ = muladd(-a1′′′, G.s4, a2′′ * G.c4 )

        A[j, G.i + 0] = a0′′′
        A[j, G.i + 1] = a1′′′′
        A[j, G.i + 2] = a2′′′′
        A[j, G.i + 3] = a3′′
    end

    A
 end

 function generate_rotations(cols, ks)
    # Generate some random rotations inbetween the columns
    rotations = Matrix{Tuple{Float64,Float64}}(undef, ks, cols - 1)
    for n = 1 : cols - 1, k = 1 : ks
        c, s, = givensAlgorithm(rand(), rand())
        rotations[k, n] = (c, s)
    end
    rotations
 end

 function example(cols, ks, rows = 1000)
    rotations = generate_rotations(cols, ks)    

    # Some matrix to apply them to
    Q = rand(rows, cols)
    Q1 = copy(Q)
    Q2 = copy(Q)
    Q3 = copy(Q)

    # Apply the rotations to Q1 and Q2
    trivial_order!(Q1, rotations)
    fused_trivial_order!(Q2, rotations)
    fused_wave!(Q3, rotations)

    @test Q1 ≈ Q2 ≈ Q3 # exact equality if we don't use FMA!
    @test Q1 != Q # lets see if anything happened
 end

 function bench(ns = 100:100:2000, k = 16)
    # Find the number of GFLOPS / second for the trivial vs fused method
    GFLOPS_a, GFLOPS_b, GFLOPS_c = Float64[], Float64[], Float64[]

    for n = ns
        rotations = generate_rotations(n, k)

        # 1 rotation = 6 flops; n - 1 rotations per layer; k layers; applied to n rows
        flops = 6k * n * (n - 1)

        a = @belapsed trivial_order!(Q, $rotations) setup = (Q = rand($n, $n))
        b = @belapsed fused_trivial_order!(Q, $rotations) setup = (Q = rand($n, $n))
        c = @belapsed fused_wave!(Q, $rotations) setup = (Q = rand($n, $n))

        gflops_per_sec_a = flops / a / 1e9
        gflops_per_sec_b = flops / b / 1e9
        gflops_per_sec_c = flops / c / 1e9

        push!(GFLOPS_a, gflops_per_sec_a)
        push!(GFLOPS_b, gflops_per_sec_b)
        push!(GFLOPS_c, gflops_per_sec_c)

        @info "Size ($n × $n) with $k layers" gflops_per_sec_a gflops_per_sec_b gflops_per_sec_c
    end

    return GFLOPS_a, GFLOPS_b, GFLOPS_c
 end

 function bench_k()
    # simultaneous bulges
    k = 128

    # matrix order
    m = 3000

    # move them so many steps forward
    n = 3k

    rotations = generate_rotations(n, k)

    # 1 rotation = 6 flops; n - 1 rotations per layer; k layers; applied to m rows
    flops = 6k * m * (n - 1)

    a = @belapsed trivial_order!(Q, $rotations) setup = (Q = rand($m, $n))
    b = @belapsed fused_trivial_order!(Q, $rotations) setup = (Q = rand($m, $n))
    c = @belapsed fused_wave!(Q, $rotations) setup = (Q = rand($m, $n))

    gflops_per_sec_a = flops / a / 1e9
    gflops_per_sec_b = flops / b / 1e9
    gflops_per_sec_c = flops / c / 1e9

    return gflops_per_sec_a, gflops_per_sec_b, gflops_per_sec_c
 end

 """
 Apply all the rotations to Q in the trivial order
 """
 function trivial_order!(Q, rotations)
    @inbounds for k = axes(rotations, 1)
        for n = axes(rotations, 2)
            c, s = rotations[k, n]
            G = Rotation2(c, s, n)
            rmul!(Q, G)
        end
    end

    Q
 end

 """
 Just fuse the rotations but don't do anything fancy
 """
 function fused_trivial_order!(Q, rotations)
    numrot = size(rotations, 2)
    layers = size(rotations, 1)

    @inbounds for k = 1 : 2 : layers
        # First the left-over rotation
        c, s = rotations[k, 1]
        rmul!(Q, Rotation2(c, s, 1))

        # Then the fused rotations
        for n = 1 : 2 : numrot - 2
            c1, s1 = rotations[k, n + 1]
            c2, s2 = rotations[k, n + 2]
            c3, s3 = rotations[k + 1, n + 0]
            c4, s4 = rotations[k + 1, n + 1]
            G = Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, n)
            rmul!(Q, G)
        end

        # Then the last left-over rotation
        c, s = rotations[k + 1, numrot]
        rmul!(Q, Rotation2(c, s, numrot))
    end

    Q
 end

 """
 Fuse the rotations AND apply them in a wave like fashion. The waves look like
 this
 |-----pipeline----|---shutdown--|
 5 5 6 6 7 7 8 8 9 9 0 0 1 1 2 2 3
 4 5 5 6 6 7 7 8 8 9 9 0 0 1 1 2 2
 4 4 5 5 6 6 7 7 8 8 9 9 0 0 1 1 2
 3 4 4 5 5 6 6 7 7 8 8 9 9 0 0 1 1
 3 3 4 4 5 5 6 6 7 7 8 8 9 9 0 0 1
 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 0 0
 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 0
 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9
 |---startup---|-----pipeline----|
 """
 function fused_wave!(Q, rotations)
    # number of columns
    n = size(Q, 2)
    
    # number of rows
    m = size(Q, 1)

    # number of layers of rotations
    k = size(rotations, 1)

    # startup
    @inbounds for j = 1:2:k-1
        # fuse 4 givens rotations
        for i = 1:2:j-2
            l = j - i  + 1

            # Load and apply rotation in (i, l)
            c1, s1 = rotations[i    , l - 1]
            c2, s2 = rotations[i    , l - 0]
            c3, s3 = rotations[i + 1, l - 2]
            c4, s4 = rotations[i + 1, l - 1]
            rmul!(Q, Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, l - 2))
        end

        # take the last hanging givens rotation
        c, s = rotations[j, 1]
        rmul!(Q, Rotation2(c, s, 1))
    end

    # pipeline
    @inbounds for j = k + 1 : 2 : n - 1
        # fuse 4 givens rotations
        for i = 1:2:k-1
            l = j - i + 1
            c1, s1 = rotations[i    , l - 1]
            c2, s2 = rotations[i    , l - 0]
            c3, s3 = rotations[i + 1, l - 2]
            c4, s4 = rotations[i + 1, l - 1]
            rmul!(Q, Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, l - 2))
        end
    end

    # shutdown
    @inbounds for j = 1:2:k-1
        # hanging rotation
        c, s = rotations[j + 1, n - 1]
        rmul!(Q, Rotation2(c, s, n - 1))

        # fuse 4 givens rotations
        for i = j + 2 : 2 : k - 1
            l = n + j - i + 1
            c1, s1 = rotations[i    , l - 1]
            c2, s2 = rotations[i    , l - 0]
            c3, s3 = rotations[i + 1, l - 2]
            c4, s4 = rotations[i + 1, l - 1]
            rmul!(Q, Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, l - 2))
        end
    end
 end
	using LinearAlgebra
	using LinearAlgebra: givensAlgorithm
	using Test
	using BenchmarkTools

	import LinearAlgebra: rmul!

	abstract type SmallRotation end

	struct Rotation2{Tc,Ts} <: SmallRotation
	c::Tc
	s::Ts
	i::Int
	end

	"""
	A set of fused Givens rotations
	G3 G4
	G1 G2
	"""
	struct Fused2x2{Tc,Ts} <: SmallRotation
	c1::Tc
	s1::Ts
	c2::Tc
	s2::Ts
	c3::Tc
	s3::Ts
	c4::Tc
	s4::Ts
	i::Int
	end

	@inline rmul!(A::AbstractMatrix, G::SmallRotation) = rmul!(A, G, axes(A, 1))

	@inline function rmul!(A::AbstractMatrix, G::Rotation2, range)
	@inbounds for j = range
	a₁ = A[j, G.i + 0]
	a₂ = A[j, G.i + 1]

	a₁′ = muladd( a₁, G.c, a₂ * G.s')
	a₂′ = muladd(-a₁, G.s, a₂ * G.c )

	A[j, G.i+0] = a₁′
	A[j, G.i+1] = a₂′
	end
	end

	@inline function rmul!(A::AbstractMatrix, G::Fused2x2, range)
	@inbounds for j = range
	a0 = A[j, G.i + 0]
	a1 = A[j, G.i + 1]
	a2 = A[j, G.i + 2]
	a3 = A[j, G.i + 3]

	# Apply rotation 1
	a1′ = muladd( a1, G.c1, a2 * G.s1')
	a2′ = muladd(-a1, G.s1, a2 * G.c1 )

	# Apply rotation 2
	a2′′ = muladd( a2′, G.c2, a3 * G.s2')
	a3′′ = muladd(-a2′, G.s2, a3 * G.c2 )

	# Apply rotation 3
	a0′′′ = muladd( a0, G.c3, a1′ * G.s3')
	a1′′′ = muladd(-a0, G.s3, a1′ * G.c3 )

	# Apply rotation 4
	a1′′′′ = muladd( a1′′′, G.c4, a2′′ * G.s4')
	a2′′′′ = muladd(-a1′′′, G.s4, a2′′ * G.c4 )

	A[j, G.i + 0] = a0′′′
	A[j, G.i + 1] = a1′′′′
	A[j, G.i + 2] = a2′′′′
	A[j, G.i + 3] = a3′′
	end

	A
	end

	function generate_rotations(cols, ks)
	# Generate some random rotations inbetween the columns
	rotations = Matrix{Tuple{Float64,Float64}}(undef, ks, cols - 1)
	for n = 1 : cols - 1, k = 1 : ks
	c, s, = givensAlgorithm(rand(), rand())
	rotations[k, n] = (c, s)
	end
	rotations
	end

	function example(cols, ks, rows = 1000)
	rotations = generate_rotations(cols, ks)

	# Some matrix to apply them to
	Q = rand(rows, cols)
	Q1 = copy(Q)
	Q2 = copy(Q)
	Q3 = copy(Q)

	# Apply the rotations to Q1 and Q2
	trivial_order!(Q1, rotations)
	fused_trivial_order!(Q2, rotations)
	fused_wave!(Q3, rotations)

	@test Q1 ≈ Q2 ≈ Q3 # exact equality if we don't use FMA!
	@test Q1 != Q # lets see if anything happened
	end

	function bench(ns = 100:100:2000, k = 16)
	# Find the number of GFLOPS / second for the trivial vs fused method
	GFLOPS_a, GFLOPS_b, GFLOPS_c = Float64[], Float64[], Float64[]

	for n = ns
	rotations = generate_rotations(n, k)

	# 1 rotation = 6 flops; n - 1 rotations per layer; k layers; applied to n rows
	flops = 6k * n * (n - 1)

	a = @belapsed trivial_order!(Q, $rotations) setup = (Q = rand($n, $n))
	b = @belapsed fused_trivial_order!(Q, $rotations) setup = (Q = rand($n, $n))
	c = @belapsed fused_wave!(Q, $rotations) setup = (Q = rand($n, $n))

	gflops_per_sec_a = flops / a / 1e9
	gflops_per_sec_b = flops / b / 1e9
	gflops_per_sec_c = flops / c / 1e9

	push!(GFLOPS_a, gflops_per_sec_a)
	push!(GFLOPS_b, gflops_per_sec_b)
	push!(GFLOPS_c, gflops_per_sec_c)

	@info "Size ($n × $n) with $k layers" gflops_per_sec_a gflops_per_sec_b gflops_per_sec_c
	end

	return GFLOPS_a, GFLOPS_b, GFLOPS_c
	end

	function bench_k()
	# simultaneous bulges
	k = 128

	# matrix order
	m = 3000

	# move them so many steps forward
	n = 3k

	rotations = generate_rotations(n, k)

	# 1 rotation = 6 flops; n - 1 rotations per layer; k layers; applied to m rows
	flops = 6k * m * (n - 1)

	a = @belapsed trivial_order!(Q, $rotations) setup = (Q = rand($m, $n))
	b = @belapsed fused_trivial_order!(Q, $rotations) setup = (Q = rand($m, $n))
	c = @belapsed fused_wave!(Q, $rotations) setup = (Q = rand($m, $n))

	gflops_per_sec_a = flops / a / 1e9
	gflops_per_sec_b = flops / b / 1e9
	gflops_per_sec_c = flops / c / 1e9

	return gflops_per_sec_a, gflops_per_sec_b, gflops_per_sec_c
	end

	"""
	Apply all the rotations to Q in the trivial order
	"""
	function trivial_order!(Q, rotations)
	@inbounds for k = axes(rotations, 1)
	for n = axes(rotations, 2)
	c, s = rotations[k, n]
	G = Rotation2(c, s, n)
	rmul!(Q, G)
	end
	end

	Q
	end

	"""
	Just fuse the rotations but don't do anything fancy
	"""
	function fused_trivial_order!(Q, rotations)
	numrot = size(rotations, 2)
	layers = size(rotations, 1)

	@inbounds for k = 1 : 2 : layers
	# First the left-over rotation
	c, s = rotations[k, 1]
	rmul!(Q, Rotation2(c, s, 1))

	# Then the fused rotations
	for n = 1 : 2 : numrot - 2
	c1, s1 = rotations[k, n + 1]
	c2, s2 = rotations[k, n + 2]
	c3, s3 = rotations[k + 1, n + 0]
	c4, s4 = rotations[k + 1, n + 1]
	G = Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, n)
	rmul!(Q, G)
	end

	# Then the last left-over rotation
	c, s = rotations[k + 1, numrot]
	rmul!(Q, Rotation2(c, s, numrot))
	end

	Q
	end

	"""
	Fuse the rotations AND apply them in a wave like fashion. The waves look like
	this
	\|-----pipeline----\|---shutdown--\|
	5 5 6 6 7 7 8 8 9 9 0 0 1 1 2 2 3
	4 5 5 6 6 7 7 8 8 9 9 0 0 1 1 2 2
	4 4 5 5 6 6 7 7 8 8 9 9 0 0 1 1 2
	3 4 4 5 5 6 6 7 7 8 8 9 9 0 0 1 1
	3 3 4 4 5 5 6 6 7 7 8 8 9 9 0 0 1
	2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 0 0
	2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 0
	1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9
	\|---startup---\|-----pipeline----\|
	"""
	function fused_wave!(Q, rotations)
	# number of columns
	n = size(Q, 2)

	# number of rows
	m = size(Q, 1)

	# number of layers of rotations
	k = size(rotations, 1)

	# startup
	@inbounds for j = 1:2:k-1
	# fuse 4 givens rotations
	for i = 1:2:j-2
	l = j - i + 1

	# Load and apply rotation in (i, l)
	c1, s1 = rotations[i , l - 1]
	c2, s2 = rotations[i , l - 0]
	c3, s3 = rotations[i + 1, l - 2]
	c4, s4 = rotations[i + 1, l - 1]
	rmul!(Q, Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, l - 2))
	end

	# take the last hanging givens rotation
	c, s = rotations[j, 1]
	rmul!(Q, Rotation2(c, s, 1))
	end

	# pipeline
	@inbounds for j = k + 1 : 2 : n - 1
	# fuse 4 givens rotations
	for i = 1:2:k-1
	l = j - i + 1
	c1, s1 = rotations[i , l - 1]
	c2, s2 = rotations[i , l - 0]
	c3, s3 = rotations[i + 1, l - 2]
	c4, s4 = rotations[i + 1, l - 1]
	rmul!(Q, Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, l - 2))
	end
	end

	# shutdown
	@inbounds for j = 1:2:k-1
	# hanging rotation
	c, s = rotations[j + 1, n - 1]
	rmul!(Q, Rotation2(c, s, n - 1))

	# fuse 4 givens rotations
	for i = j + 2 : 2 : k - 1
	l = n + j - i + 1
	c1, s1 = rotations[i , l - 1]
	c2, s2 = rotations[i , l - 0]
	c3, s3 = rotations[i + 1, l - 2]
	c4, s4 = rotations[i + 1, l - 1]
	rmul!(Q, Fused2x2(c1, s1, c2, s2, c3, s3, c4, s4, l - 2))
	end
	end
	end