Created
May 15, 2015 21:25
-
-
Save andreasnoack/914e9d0102347cdd0766 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| module LC | |
| const MC = 384 | |
| const KC = 384 | |
| const NC = 4096 | |
| const MR = 4 | |
| const NR = 4 | |
| const _A = Array(Float64, MC*KC) | |
| const _B = Array(Float64, KC*NC) | |
| const _C = Array(Float64, MR*NR) | |
| const _AB = Array(Float64, MR*NR) | |
| const pAB = pointer(_AB) | |
| p_A = pointer(_A) | |
| p_B = pointer(_B) | |
| p_C = pointer(_C) | |
| function pack_MRxk(k::Integer, A::Array{Float64}, Aoffset::Integer, incRowA::Integer, | |
| incColA::Integer, buffer::Array{Float64}, boffset::Integer) | |
| @inbounds begin | |
| for j = 1:k | |
| @simd for i = 1:MR | |
| buffer[boffset + i] = A[Aoffset + (i - 1)*incRowA + 1] | |
| end | |
| boffset += MR | |
| Aoffset += incColA | |
| end | |
| end | |
| end | |
| function pack_A(mc::Integer, kc::Integer, A::Array{Float64}, Aoffset::Integer, | |
| incRowA::Integer, incColA::Integer, buffer::Array{Float64}) | |
| @inbounds begin | |
| mp = div(mc, MR) | |
| _mr = mc % MR | |
| boffset = 0 | |
| for i = 1:mp | |
| pack_MRxk(kc, A, Aoffset, incRowA, incColA, buffer, boffset) | |
| boffset += kc*MR | |
| Aoffset += MR*incRowA | |
| end | |
| if _mr > 0 | |
| for j = 1:kc | |
| for i = 1:_mr | |
| buffer[boffset + i] = A[Aoffset + (i - 1)*incRowA + 1] | |
| end | |
| for i = _mr:MR | |
| buffer[boffset + i] = 0.0 | |
| end | |
| boffset += MR | |
| Aoffset += incColA | |
| end | |
| end | |
| end | |
| end | |
| function pack_kxNR(k::Integer, B::Array{Float64}, Boffset::Integer, incRowB::Integer, | |
| incColB::Integer, buffer::Array{Float64}, boffset::Integer) | |
| @inbounds begin | |
| for i = 1:k | |
| for j = 1:NR | |
| buffer[boffset + j] = B[Boffset + (j - 1)*incColB + 1] | |
| end | |
| boffset += NR | |
| Boffset += incRowB | |
| end | |
| end | |
| end | |
| function pack_B(kc::Integer, nc::Integer, B::Array{Float64}, Boffset::Integer, incRowB::Integer, | |
| incColB::Integer, buffer::Array{Float64}) | |
| @inbounds begin | |
| np = div(nc, NR) | |
| _nr = nc % NR | |
| boffset = 0 | |
| for j = 1:np | |
| pack_kxNR(kc, B, Boffset, incRowB, incColB, buffer, boffset) | |
| boffset += kc*NR | |
| Boffset += NR*incColB | |
| end | |
| if _nr > 0 | |
| for i = 1:kc | |
| for j = 1:_nr | |
| buffer[boffset + j] = B[Boffset + (j - 1)*incColB + 1] | |
| end | |
| for j = _nr + 1:NR | |
| buffer[boffset + j] = 0.0 | |
| end | |
| boffset += NR | |
| Boffset += incRowB | |
| end | |
| end | |
| end | |
| end | |
| const cl = "~{esi},~{rax},~{rbx},~{rcx}"*string([",~{xmm$i}" for i = 0:15]...) | |
| const asms = | |
| "\"movl \$0, %esi \n\t"* | |
| "movq \$1, %rax \n\t"* | |
| "movq \$2, %rbx \n\t"* | |
| "movq \$3, %rcx \n\t"* | |
| "movapd (%rax), %xmm0 \n\t"* | |
| "movapd 16(%rax), %xmm1 \n\t"* | |
| "movapd (%rbx), %xmm2 \n\t"* | |
| "xorpd %xmm8, %xmm8 \n\t"* | |
| "xorpd %xmm9, %xmm9 \n\t"* | |
| "xorpd %xmm10, %xmm10 \n\t"* | |
| "xorpd %xmm11, %xmm11 \n\t"* | |
| "xorpd %xmm12, %xmm12 \n\t"* | |
| "xorpd %xmm13, %xmm13 \n\t"* | |
| "xorpd %xmm14, %xmm14 \n\t"* | |
| "xorpd %xmm15, %xmm15 \n\t"* | |
| "xorpd %xmm3, %xmm3 \n\t"* | |
| "xorpd %xmm4, %xmm4 \n\t"* | |
| "xorpd %xmm5, %xmm5 \n\t"* | |
| "xorpd %xmm6, %xmm6 \n\t"* | |
| "xorpd %xmm7, %xmm7 \n\t"* | |
| "testl %esi, %esi \n\t"* | |
| "je .DWRITEBACK \n\t"* | |
| ".DLOOP: \n\t"* | |
| "addpd %xmm3, %xmm12 \n\t"* | |
| "movapd 16(%rbx), %xmm3 \n\t"* | |
| "addpd %xmm6, %xmm13 \n\t"* | |
| "movapd %xmm2, %xmm6 \n\t"* | |
| "pshufd \$\$78,%xmm2,%xmm4 \n\t"* | |
| "mulpd %xmm0, %xmm2 \n\t"* | |
| "mulpd %xmm1, %xmm6 \n\t"* | |
| "addpd %xmm5, %xmm14 \n\t"* | |
| "addpd %xmm7, %xmm15 \n\t"* | |
| "movapd %xmm4, %xmm7 \n\t"* | |
| "mulpd %xmm0, %xmm4 \n\t"* | |
| "mulpd %xmm1, %xmm7 \n\t"* | |
| "addpd %xmm2, %xmm8 \n\t"* | |
| "movapd 32(%rbx), %xmm2 \n\t"* | |
| "addpd %xmm6, %xmm9 \n\t"* | |
| "movapd %xmm3, %xmm6 \n\t"* | |
| "pshufd \$\$78,%xmm3, %xmm5 \n\t"* | |
| "mulpd %xmm0, %xmm3 \n\t"* | |
| "mulpd %xmm1, %xmm6 \n\t"* | |
| "addpd %xmm4, %xmm10 \n\t"* | |
| "addpd %xmm7, %xmm11 \n\t"* | |
| "movapd %xmm5, %xmm7 \n\t"* | |
| "mulpd %xmm0, %xmm5 \n\t"* | |
| "movapd 32(%rax), %xmm0 \n\t"* | |
| "mulpd %xmm1, %xmm7 \n\t"* | |
| "movapd 48(%rax), %xmm1 \n\t"* | |
| "addq \$\$32, %rax \n\t"* | |
| "addq \$\$32, %rbx \n\t"* | |
| "decl %esi \n\t"* | |
| "jne .DLOOP \n\t"* | |
| "addpd %xmm3, %xmm12 \n\t"* | |
| "addpd %xmm6, %xmm13 \n\t"* | |
| "addpd %xmm5, %xmm14 \n\t"* | |
| "addpd %xmm7, %xmm15 \n\t"* | |
| ".DWRITEBACK: \n\t"* | |
| "movlpd %xmm8, (%rcx) \n\t"* | |
| "movhpd %xmm10, 8(%rcx) \n\t"* | |
| "movlpd %xmm9, 16(%rcx) \n\t"* | |
| "movhpd %xmm11, 24(%rcx) \n\t"* | |
| "addq \$\$32, %rcx \n\t"* | |
| "movlpd %xmm10, (%rcx) \n\t"* | |
| "movhpd %xmm8, 8(%rcx) \n\t"* | |
| "movlpd %xmm11, 16(%rcx) \n\t"* | |
| "movhpd %xmm9, 24(%rcx) \n\t"* | |
| "addq \$\$32, %rcx \n\t"* | |
| "movlpd %xmm12, (%rcx) \n\t"* | |
| "movhpd %xmm14, 8(%rcx) \n\t"* | |
| "movlpd %xmm13, 16(%rcx) \n\t"* | |
| "movhpd %xmm15, 24(%rcx) \n\t"* | |
| "addq \$\$32, %rcx \n\t"* | |
| "movlpd %xmm14, (%rcx) \n\t"* | |
| "movhpd %xmm12, 8(%rcx) \n\t"* | |
| "movlpd %xmm15, 16(%rcx) \n\t"* | |
| "movhpd %xmm13, 24(%rcx) \n\t\", \"r,r,r,r,$cl\"(i32 %0, double* %1, double* %2, double* %3)" | |
| @inline function gemm_micro_kernel(kc::Integer, α::Float64, A::Array{Float64}, Aoffset::Integer, | |
| B::Array{Float64}, Boffset::Integer, β::Float64, C::Array{Float64}, Coffset::Integer, | |
| incRowC::Integer, incColC::Integer) | |
| ccall(:jl_breakpoint, Any, (), ) | |
| Base.llvmcall(""" | |
| call void asm $asms | |
| ret void""", | |
| Void, | |
| Tuple{Cint, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}}, | |
| Cint(kc), pointer(_A, Aoffset + 1), pointer(_B, Boffset + 1), pointer(_AB)) | |
| # Update C <- beta*C | |
| if β == 0.0 | |
| for j = 1:NR | |
| for i =1:MR | |
| C[Coffset + (i - 1)*incRowC + (j - 1)*incColC + 1] = 0.0 | |
| end | |
| end | |
| elseif β != 1.0 | |
| for j = 1:NR | |
| for i = 1:MR | |
| C[Coffset + (i - 1)*incRowC + (j - 1)*incColC + 1] *= β | |
| end | |
| end | |
| end | |
| # Update C <- C + alpha*AB (note: the case alpha==0.0 was already treated in | |
| # the above layer dgemm_nn) | |
| if α == 1.0 | |
| for j = 1:NR | |
| for i = 1:MR | |
| C[Coffset + (i - 1)*incRowC + (j - 1)*incColC + 1] += _AB[i + (j - 1)*MR] | |
| end | |
| end | |
| else | |
| for j = 1:NR | |
| for i = 1:MR | |
| C[Coffset + (i - 1)*incRowC + (j - 1)*incColC + 1] += α*_AB[i + (j - 1)*MR] | |
| end | |
| end | |
| end | |
| end | |
| function geaxpy(m::Integer, n::Integer, α::Float64, X::Array{Float64}, incRowX::Integer, | |
| incColX::Integer, Y::Array{Float64}, Yoffset::Integer, incRowY::Integer, incColY::Integer) | |
| if α != 1.0 | |
| for j = 1:n | |
| for i = 1:m | |
| Y[Yoffset + (i - 1)*incRowY + (j - 1)*incColY + 1] = Y[(i - 1)*incRowY + (j - 1)*incColY + 1] + α*X[(i - 1)*incRowX + (j - 1)*incColX + 1] | |
| end | |
| end | |
| else | |
| for j = 1:n | |
| for i = 1:m | |
| Y[Yoffset + (i - 1)*incRowY + (j - 1)*incColY + 1] = Y[(i - 1)*incRowY + (j - 1)*incColY + 1] + X[(i - 1)*incRowX + (j - 1)*incColX + 1] | |
| end | |
| end | |
| end | |
| end | |
| function gescal(m::Integer, | |
| n::Integer, | |
| α::Float64, | |
| X::Array{Float64}, | |
| Xoffset::Integer, | |
| incRowX::Integer, | |
| incColX::Integer) | |
| if α != 0.0 | |
| for j = 1:n | |
| for i = 1:m | |
| X[Xoffset + (i - 1)*incRowX + (j - 1)*incColX + 1] *= α | |
| end | |
| end | |
| else | |
| for j = 1:n | |
| for i = 1:m | |
| X[Xoffset + (i - 1)*incRowX + (j - 1)*incColX + 1] = 0.0 | |
| end | |
| end | |
| end | |
| end | |
| function gemm_macro_kernel(mc::Integer, nc::Integer, kc::Integer, α::Float64, | |
| β::Float64, C::Array{Float64}, Coffset::Integer, incRowC::Integer, incColC::Integer) | |
| mp = div(mc + MR - 1, MR) | |
| np = div(nc + NR - 1, NR) | |
| _mr = mc % MR | |
| _nr = nc % NR | |
| for j = 1:np | |
| nr = (j != np || _nr == 0) ? NR : _nr | |
| for i = 1:mp | |
| mr = (i != mp || _mr == 0) ? MR : _mr | |
| if mr == MR && nr == NR | |
| gemm_micro_kernel(kc, | |
| α, | |
| _A, | |
| (i - 1)*kc*MR, | |
| _B, | |
| (j - 1)*kc*NR, | |
| β, | |
| C, | |
| Coffset + (i - 1)*MR*incRowC + (j - 1)*NR*incColC, | |
| incRowC, | |
| incColC) | |
| else | |
| gemm_micro_kernel(kc, | |
| α, | |
| _A, | |
| (i - 1)*kc*MR, | |
| _B, | |
| (j - 1)*kc*NR, | |
| 0.0, | |
| C, | |
| Coffset, | |
| 1, | |
| MR) | |
| gescal(mr, nr, β, C, Coffset + (i - 1)*MR*incRowC + (j - 1)*NR*incColC, incRowC, incColC) | |
| geaxpy(mr, nr, 1.0, _C, 1, MR, C, Coffset + (i - 1)*MR*incRowC + (j - 1)*NR*incColC, incRowC, incColC) | |
| end | |
| end | |
| end | |
| return C | |
| end | |
| function gemm_nn(m::Integer, | |
| n::Integer, | |
| k::Integer, | |
| α::Float64, | |
| A::Matrix{Float64}, | |
| incRowA::Integer, | |
| incColA::Integer, | |
| B::Matrix{Float64}, | |
| incRowB::Integer, | |
| incColB::Integer, | |
| β::Float64, | |
| C::Matrix{Float64}, | |
| incRowC::Integer, | |
| incColC::Integer) | |
| mb = div(m + MC - 1, MC) | |
| nb = div(n + NC - 1, NC) | |
| kb = div(k + KC - 1, KC) | |
| _mc = m % MC | |
| _nc = n % NC | |
| _kc = k % KC | |
| if α == 0.0 || k == 0 | |
| gescal(m, n, β, C, incRowC, incColC) | |
| return C | |
| end | |
| for j = 1:nb | |
| nc = (j != nb || _nc == 0) ? NC : _nc | |
| for l = 1:kb | |
| kc = (l != kb || _kc == 0) ? KC : _kc | |
| _β = l == 1 ? β : 1.0 | |
| pack_B(kc, nc, B, (l - 1)*KC*incRowB + (j - 1)*NC*incColB, incRowB, incColB, _B) | |
| for i = 1:mb | |
| mc = (i != mb || _mc == 0) ? MC : _mc | |
| pack_A(mc, kc, A, (i - 1)*MC*incRowA + (l - 1)*KC*incColA, incRowA, incColA, _A) | |
| gemm_macro_kernel(mc, nc, kc, α, _β, C, (i - 1)*MC*incRowC + (j - 1)*NC*incColC, incRowC, incColC) | |
| end | |
| end | |
| end | |
| return C | |
| end | |
| end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment