Created
November 13, 2018 16:56
-
-
Save mratsim/dae70aa151a414c60ccaf45f3ffb496b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Laser | |
# Copyright (c) 2018 Mamy André-Ratsimbazafy | |
# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). | |
# This file may not be copied, modified, or distributed except according to those terms. | |
# Generic microkernel for matrix multiplication | |
import | |
./laser_gemm_tiling, ./laser_gemm_matrix, ./laser_gemm_utils, | |
../../../laser/[cpuinfo, compiler_optim_hints], | |
macros | |
# TODO: vzeroupper for AVX version. | |
withCompilerOptimHints() | |
# ######################## | |
# Vector helpers | |
template load[T](vecsize: static int, packedB: ptr UncheckedArray[T]): typed =
  ## Loads one SIMD vector starting at `packedB[0]`.
  ##
  ## `vecsize` selects the intrinsic wrapper: 32 dispatches to the
  ## `mm256_*` variant and 16 to the `mm_*` variant.
  ## Only `float32` is wired up; any other element type or vector size
  ## expands to nothing.
  when T is float32:
    when vecsize == 32:
      mm256_load_ps(packedB[0].addr)
    elif vecsize == 16:
      mm_load_ps(packedB[0].addr)
  # float64 (mm256_load_pd / mm_load_pd) is not enabled yet.
template setzero(vecsize: static int, T: typedesc): typed =
  ## Produces an all-zero SIMD vector for element type `T`.
  ##
  ## `vecsize` selects the intrinsic wrapper: 32 dispatches to the
  ## `mm256_*` variant and 16 to the `mm_*` variant.
  ## Only `float32` is wired up; any other element type or vector size
  ## expands to nothing.
  when T is float32:
    when vecsize == 32:
      mm256_setzero_ps()
    elif vecsize == 16:
      mm_setzero_ps()
  # float64 (mm256_setzero_pd / mm_setzero_pd) is not enabled yet.
template set1[T](vecsize: static int, value: T): typed =
  ## Broadcasts the scalar `value` into a SIMD vector.
  ##
  ## `vecsize` selects the intrinsic wrapper: 32 dispatches to the
  ## `mm256_*` variant and 16 to the `mm_*` variant.
  ## Only `float32` is wired up; any other element type or vector size
  ## expands to nothing.
  when T is float32:
    when vecsize == 32:
      mm256_set1_ps(value)
    elif vecsize == 16:
      mm_set1_ps(value)
  # float64 (mm256_set1_pd / mm_set1_pd) is not enabled yet.
func fma[T](simd: static CPUFeatureX86, a, b, c: T): T {.inline.} =
  ## Multiply-add on a SIMD vector: returns `a * b + c`.
  ##
  ## - `simd`: target instruction-set level. It has to be known at compile
  ##   time because it drives `when` dispatch; callers pass a `const`.
  ## - `a`, `b`, `c`: vectors of the same SIMD type `T`
  ##   (`m256`, `m256d`, `m128` or `m128d`).
  ##
  ## For `x86_AVX2` and `x86_AVX512` the fused `*_fmadd_*` wrapper is used.
  ## For other levels the single-precision types fall back to a separate
  ## multiply followed by an add. The double-precision fallback is not
  ## wired up yet, and requesting it is a compile-time error instead of
  ## silently returning an uninitialised/zero result.
  const hasFma = simd in {x86_AVX2, x86_AVX512}

  when T is m256:
    when hasFma: mm256_fmadd_ps(a, b, c)
    else: mm256_add_ps(mm256_mul_ps(a, b), c)
  elif T is m256d:
    when hasFma: mm256_fmadd_pd(a, b, c)
    else:
      # Intended fallback: mm256_add_pd(mm256_mul_pd(a, b), c)
      {.error: "fma: m256d fallback without FMA is not implemented".}
  elif T is m128:
    when hasFma: mm_fmadd_ps(a, b, c)
    else: mm_add_ps(mm_mul_ps(a, b), c)
  elif T is m128d:
    when hasFma: mm_fmadd_pd(a, b, c)
    else:
      # Intended fallback: mm_add_pd(mm_mul_pd(a, b), c)
      {.error: "fma: m128d fallback without FMA is not implemented".}
  else:
    {.error: "fma: unsupported SIMD vector type".}
# ######################## | |
# ######################## | |
# Epilogue | |
# | |
# Cases
# 1. C *= β, starting default
# 2. C = AB, if β = 0 and α = 1
# 3. C = αAB, if β = 0 and α ≠ 1
# 4. C += AB, if α = 1
# 5. C += αAB, if α ≠ 1
# | |
# TODO: Fused operations like relu/sigmoid/tanh | |
# should be done here as well | |
proc gebb_ukernel_epilogue*[MR, NR: static int, T](
      alpha: T, AB: array[MR, array[NR, T]],
      beta: T, vC: MatrixView[T]
    ) =
  ## Writes a full `MR` x `NR` micro-tile back into `vC`:
  ## `vC = beta * vC + alpha * AB`.
  ##
  ## - `alpha`: scaling factor applied to the accumulated product `AB`.
  ## - `AB`: accumulated micro-tile; read through an aligned,
  ##   restrict-qualified pointer.
  ## - `beta`: scaling factor applied to the existing content of `vC`.
  ## - `vC`: destination view, indexed as `vC[row, col]`.
  let tile {.restrict.} = assume_aligned cast[ptr array[MR, array[NR, T]]](AB.unsafeAddr)

  # Step 1: apply beta to C.
  # Per the original author's note, beta is always 1 after the first pass
  # on the current C micro-tile, so the product is always accumulated with
  # `+=` in step 2, even when beta = 1.
  if beta == 1.T:
    discard                                  # C is left untouched
  elif beta == 0.T:                          # C = 0
    for row in 0 ..< MR:
      for col in `||`(0, NR-1, "simd"):
        vC[row, col] = 0.T
  else:                                      # C *= β
    for row in 0 ..< MR:
      for col in `||`(0, NR-1, "simd"):
        vC[row, col] *= beta

  # Step 2: accumulate the product.
  if alpha != 1.T:                           # C += αAB
    for row in 0 ..< MR:
      for col in `||`(0, NR-1, "simd"):
        vC[row, col] += alpha * tile[row][col]
  else:                                      # C += AB
    for row in 0 ..< MR:
      for col in `||`(0, NR-1, "simd"):
        vC[row, col] += tile[row][col]

  # TODO: Fused operations like relu/sigmoid/tanh
  #       should be done here as well
func gebb_ukernel_edge_epilogue*[MR, NR: static int, T](
      alpha: T, AB: array[MR, array[NR, T]],
      beta: T, vC: MatrixView[T],
      mr, nr: int # Tail to process
    ) =
  ## Writes the top-left `mr` x `nr` corner of the micro-tile `AB` back
  ## into `vC`: `vC = beta * vC + alpha * AB`.
  ##
  ## - `alpha`: scaling factor applied to the accumulated product `AB`.
  ## - `AB`: accumulated `MR` x `NR` micro-tile; only the first `mr` rows
  ##   and `nr` columns are read.
  ## - `beta`: scaling factor applied to the existing content of `vC`.
  ##   When it is 0, `vC` is overwritten without being read.
  ## - `vC`: destination view, indexed as `vC[row, col]`.
  ## - `mr`, `nr`: number of rows / columns of the tail to process.
  let tile {.restrict.} = assume_aligned cast[ptr array[MR, array[NR, T]]](AB.unsafeAddr)

  if beta != 0.T:
    # C *= β, then accumulate the product on top of it.
    for row in 0 ..< mr:
      for col in 0 ..< nr:
        vC[row, col] *= beta

    if alpha != 1.T:                         # C += αAB
      for row in 0 ..< mr:
        for col in 0 ..< nr:
          vC[row, col] += alpha * tile[row][col]
    else:                                    # C += AB
      for row in 0 ..< mr:
        for col in 0 ..< nr:
          vC[row, col] += tile[row][col]
  elif alpha != 1.T:                         # C = αAB
    for row in 0 ..< mr:
      for col in 0 ..< nr:
        vC[row, col] = alpha * tile[row][col]
  else:                                      # C = AB
    for row in 0 ..< mr:
      for col in 0 ..< nr:
        vC[row, col] = tile[row][col]

  # TODO: Fused operations like relu/sigmoid/tanh
  #       should be done here as well
# macro unroll_ukernel[MR, NR: static int, T]( | |
# AB: array[MR, array[NR, T]], | |
# A, B: ptr | |
# ): untyped = | |
# result = newStmtList() | |
# for i in 0 .. MR - 1: | |
# for j in 0 .. NR - 1: | |
# result.add quote do: | |
# `AB`[`i`][`j`] += `A`[`i`] * `B`[`j`] | |
import typetraits | |
template ukernel_impl(){.dirty.} =
  # Shared body of the micro-kernels: accumulates the MR x NR product of
  # the packed A and B panels over `kc` steps into the local tile `AB`.
  #
  # Dirty template: it reads `ukernel`, `T`, `kc`, `packedA` and `packedB`
  # from the expansion site and injects `MR`, `NR`, `vecsize`, `simd`,
  # `VecT`, `L`, `NbElems`, `AB`, `A`, `B` and `rB` into it.
  #
  # Layout assumed (taken from the comments on `A` and `B` below):
  # for each k, `MR` consecutive scalars of A and `NR` consecutive
  # scalars of B.
  const
    MR = ukernel.extract_mr()
    NR = ukernel.extract_nr()
    vecsize = ukernel.extract_vecsize()
    simd = ukernel.extract_cpu_simd()

  type VecT = vector(ukernel, T)
  const L = NR*sizeof(T) div vecsize   # SIMD vectors per row of the tile
  const NbElems = vecsize div T.sizeof # scalars per SIMD vector

  # Accumulator; starts from Nim's default zero-initialisation.
  var AB{.align_variable.}: array[MR, array[L, VecT]]
  var A {.restrict.} = assume_aligned packedA # [kc, mc] by chunks of mr
  var B {.restrict.} = assume_aligned packedB # [kc, nc] by chunks of nr

  # Row of B for the current k; fully overwritten before each use.
  var rB {.noInit.}: array[L, VecT]

  for k in 0 ..< kc:
    # TODO prefetch

    # The B row changes with k, so it is reloaded on every iteration.
    for z in 0 ..< L:
      rB[z] = vecsize.load(B + z*NbElems)

    # Rank-1 update: AB[i, :] += A[i] * B[:]
    # Loops cover every row and every vector, so odd MR and L != 2 work.
    for i in 0 ..< MR:
      let rA = vecsize.set1(A[i])        # broadcast the scalar, not the pointer
      for z in 0 ..< L:
        AB[i][z] = simd.fma(rA, rB[z], AB[i][z])

    # Advance both panels to the next k.
    A += MR
    B += NR
proc gebb_ukernel*[T; ukernel: static MicroKernel](
      kc: int,
      alpha: T, packedA, packedB: ptr UncheckedArray[T],
      beta: T, vC: MatrixView[T]
    ) =
  ## Micro-kernel for a full tile: accumulates the product of the packed
  ## panels over `kc` steps, then merges it into `vC` with `alpha`/`beta`.
  ##
  ## - `kc`: number of accumulation steps.
  ## - `alpha`, `beta`: scaling factors forwarded to the epilogue.
  ## - `packedA`, `packedB`: packed input panels.
  ## - `vC`: destination view.

  # Injects MR, NR and the accumulated vector tile `AB` into this scope.
  ukernel_impl()

  # The vector tile is reinterpreted in place as a scalar MR x NR array.
  gebb_ukernel_epilogue(
    alpha = alpha,
    AB = cast[array[MR, array[NR, T]]](AB),
    beta = beta,
    vC = vC)
proc gebb_ukernel_edge*[T; ukernel: static MicroKernel](
      mr, nr, kc: int,
      alpha: T, packedA, packedB: ptr UncheckedArray[T],
      beta: T, vC: MatrixView[T]
    ) =
  ## Micro-kernel for a partial (edge) tile: accumulates the product of
  ## the packed panels over `kc` steps, then merges only the leading
  ## `mr` x `nr` part into `vC` with `alpha`/`beta`.
  ##
  ## - `mr`, `nr`: rows / columns of the tail to write back.
  ## - `kc`: number of accumulation steps.
  ## - `alpha`, `beta`: scaling factors forwarded to the epilogue.
  ## - `packedA`, `packedB`: packed input panels.
  ## - `vC`: destination view.

  # Injects MR, NR and the accumulated vector tile `AB` into this scope.
  ukernel_impl()

  # The vector tile is reinterpreted in place as a scalar MR x NR array.
  gebb_ukernel_edge_epilogue(
    alpha = alpha,
    AB = cast[array[MR, array[NR, T]]](AB),
    beta = beta,
    vC = vC,
    mr = mr,
    nr = nr)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment