Skip to content

Instantly share code, notes, and snippets.

@mratsim
Created November 13, 2018 16:56
Show Gist options
  • Save mratsim/dae70aa151a414c60ccaf45f3ffb496b to your computer and use it in GitHub Desktop.
Save mratsim/dae70aa151a414c60ccaf45f3ffb496b to your computer and use it in GitHub Desktop.
# Laser
# Copyright (c) 2018 Mamy André-Ratsimbazafy
# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
# This file may not be copied, modified, or distributed except according to those terms.
# Generic microkernel for matrix multiplication
import
./laser_gemm_tiling, ./laser_gemm_matrix, ./laser_gemm_utils,
../../../laser/[cpuinfo, compiler_optim_hints],
macros
# TODO: vzeroupper for AVX version.
withCompilerOptimHints()
# ########################
# Vector helpers
template load[T](vecsize: static int, packedB: ptr UncheckedArray[T]): typed =
  ## Load one full SIMD vector from the packed B buffer.
  ## Only the float32 paths are wired up for now.
  when T is float32:
    when vecsize == 32: mm256_load_ps(packedB[0].addr)   # AVX, 256-bit
    elif vecsize == 16: mm_load_ps(packedB[0].addr)      # SSE, 128-bit
  # float64: mm256_load_pd / mm_load_pd — TODO
template setzero(vecsize: static int, T: typedesc): typed =
  ## Return a zeroed SIMD vector of the requested width.
  ## Only the float32 paths are wired up for now.
  when T is float32:
    when vecsize == 32: mm256_setzero_ps()   # AVX, 256-bit
    elif vecsize == 16: mm_setzero_ps()      # SSE, 128-bit
  # float64: mm256_setzero_pd / mm_setzero_pd — TODO
template set1[T](vecsize: static int, value: T): typed =
  ## Broadcast a single scalar to every lane of a SIMD vector.
  ## Only the float32 paths are wired up for now.
  when T is float32:
    when vecsize == 32: mm256_set1_ps(value)   # AVX, 256-bit
    elif vecsize == 16: mm_set1_ps(value)      # SSE, 128-bit
  # float64: mm256_set1_pd / mm_set1_pd — TODO
func fma[T](simd: static CPUFeatureX86, a, b, c: T): T {.inline.} =
  ## Fused multiply-add: returns `a * b + c`.
  ## Uses the hardware FMA instruction when the target supports it
  ## (AVX2 implies FMA3 on x86), otherwise falls back to mul + add.
  ##
  ## Fixes vs the original:
  ## - `simd` must be `static` so it can be tested in `when` branches
  ##   (callers pass a compile-time constant, so this is source-compatible).
  ## - set literals use `,` — `{x86_AVX2 or x86_AVX512}` was not a valid
  ##   two-element set.
  ## - the branches form a single `when`/`elif` chain; the original started
  ##   a second `when T is m128` statement, discarding the m256/m256d results.
  when T is m256:
    when simd in {x86_AVX2, x86_AVX512}: mm256_fmadd_ps(a, b, c)
    else: mm256_add_ps(mm256_mul_ps(a, b), c)
  elif T is m256d:
    when simd in {x86_AVX2, x86_AVX512}: mm256_fmadd_pd(a, b, c)
    # else: mm256_add_pd(mm256_mul_pd(a, b), c)  # float64 fallback — TODO
  elif T is m128:
    when simd in {x86_AVX2, x86_AVX512}: mm_fmadd_ps(a, b, c)
    else: mm_add_ps(mm_mul_ps(a, b), c)
  elif T is m128d:
    when simd in {x86_AVX2, x86_AVX512}: mm_fmadd_pd(a, b, c)
    # else: mm_add_pd(mm_mul_pd(a, b), c)        # float64 fallback — TODO
# ########################
# ########################
# Epilogue
#
# Cases
# 1. C *= β, starting default
# 2. C = AB, if β = 0 and α = 1
# 3. C = αAB, if β = 0 and α ≠ 1
# 4. C += AB, if β ≠ 0 and α = 1
# 5. C += αAB, if β ≠ 0 and α ≠ 1
#
# TODO: Fused operations like relu/sigmoid/tanh
# should be done here as well
proc gebb_ukernel_epilogue*[MR, NR: static int, T](
      alpha: T, AB: array[MR, array[NR, T]],
      beta: T, vC: MatrixView[T]
    ) =
  ## Merge the accumulator micro-tile AB into the C micro-tile:
  ##   C = α·AB + β·C
  ## Beta is always 1 after the first pass on the current C micro-tile,
  ## so even when beta == 1 we must accumulate with `+=`.
  let pAB{.restrict.} = assume_aligned cast[ptr array[MR, array[NR, T]]](AB.unsafeAddr)

  # Step 1 — apply β to C: clear it, scale it, or (β == 1) leave it untouched.
  if beta == 0.T:
    for row in 0 ..< MR:
      for col in `||`(0, NR-1, "simd"):
        vC[row, col] = 0.T
  elif beta != 1.T:                         # C *= β
    for row in 0 ..< MR:
      for col in `||`(0, NR-1, "simd"):
        vC[row, col] *= beta

  # Step 2 — accumulate α·AB on top.
  if alpha == 1.T:                          # C += AB
    for row in 0 ..< MR:
      for col in `||`(0, NR-1, "simd"):
        vC[row, col] += pAB[row][col]
  else:                                     # C += αAB
    for row in 0 ..< MR:
      for col in `||`(0, NR-1, "simd"):
        vC[row, col] += alpha * pAB[row][col]
# TODO: Fused operations like relu/sigmoid/tanh
# should be done here as well
func gebb_ukernel_edge_epilogue*[MR, NR: static int, T](
      alpha: T, AB: array[MR, array[NR, T]],
      beta: T, vC: MatrixView[T],
      mr, nr: int # Tail to process
    ) =
  ## Edge-tile variant of the epilogue: only the top-left [mr, nr] corner of
  ## the accumulators is written back (mr ≤ MR, nr ≤ NR), so no SIMD pragma
  ## is used on the inner loops.
  let pAB{.restrict.} = assume_aligned cast[ptr array[MR, array[NR, T]]](AB.unsafeAddr)

  if beta == 0.T:
    # β == 0: C is overwritten, never read.
    if alpha == 1.T:                        # C = AB
      for row in 0 ..< mr:
        for col in 0 ..< nr:
          vC[row, col] = pAB[row][col]
    else:                                   # C = αAB
      for row in 0 ..< mr:
        for col in 0 ..< nr:
          vC[row, col] = alpha * pAB[row][col]
  else:
    # β ≠ 0: scale C first, then accumulate.
    for row in 0 ..< mr:                    # C *= β
      for col in 0 ..< nr:
        vC[row, col] *= beta
    if alpha == 1.T:                        # C += AB
      for row in 0 ..< mr:
        for col in 0 ..< nr:
          vC[row, col] += pAB[row][col]
    else:                                   # C += αAB
      for row in 0 ..< mr:
        for col in 0 ..< nr:
          vC[row, col] += alpha * pAB[row][col]
# TODO: Fused operations like relu/sigmoid/tanh
# should be done here as well
# macro unroll_ukernel[MR, NR: static int, T](
# AB: array[MR, array[NR, T]],
# A, B: ptr
# ): untyped =
# result = newStmtList()
# for i in 0 .. MR - 1:
# for j in 0 .. NR - 1:
# result.add quote do:
# `AB`[`i`][`j`] += `A`[`i`] * `B`[`j`]
import typetraits
template ukernel_impl(){.dirty.} =
  ## Shared inner-kernel body, spliced ({.dirty.}) into gebb_ukernel and
  ## gebb_ukernel_edge. Expects `kc`, `packedA`, `packedB`, `ukernel` and `T`
  ## in the caller's scope; leaves the accumulators in `AB` and the constants
  ## MR/NR in scope for the epilogue call.
  const
    MR = ukernel.extract_mr()
    NR = ukernel.extract_nr()
    vecsize = ukernel.extract_vecsize()
    simd = ukernel.extract_cpu_simd()
  type VecT = vector(ukernel, T)
  const L = NR*sizeof(T) div vecsize     # SIMD vectors per micro-row of AB
  const NbElems = vecsize div T.sizeof   # scalars per SIMD vector
  # NOTE(review): the unrolled update below hardcodes 2 vectors per row,
  # i.e. it assumes L == 2 — confirm against the MicroKernel configs.

  var AB{.align_variable.}: array[MR, array[L, VecT]]  # zero-initialized by Nim
  var A {.restrict.} = assume_aligned packedA # [kc, mc] by chunks of mr
  var B {.restrict.} = assume_aligned packedB # [kc, nc] by chunks of nr

  # static: assert MR mod 2 == 0, "mr should be a multiple of 2"
  for k in 0 ..< kc:
    # TODO prefetch
    # Fix: B must be reloaded for every k — the original hoisted these loads
    # out of the loop and never advanced B, so every iteration reused row 0.
    let B0 = vecsize.load(B)
    let B1 = vecsize.load(B + NbElems)
    for i in countup(0, MR-1, 2):
      # Fix: broadcast the scalars A[i] and A[i+1] — the original passed the
      # pointer itself to set1 (a type error), and the value never varied
      # with i.
      let A0 = vecsize.set1(A[i])
      let A1 = vecsize.set1(A[i+1])
      AB[i  ][0] = simd.fma(A0, B0, AB[i  ][0])
      AB[i+1][0] = simd.fma(A1, B0, AB[i+1][0])
      AB[i  ][1] = simd.fma(A0, B1, AB[i  ][1])
      AB[i+1][1] = simd.fma(A1, B1, AB[i+1][1])
    A += MR
    B += NR
proc gebb_ukernel*[T; ukernel: static MicroKernel](
      kc: int,
      alpha: T, packedA, packedB: ptr UncheckedArray[T],
      beta: T, vC: MatrixView[T]
    ) =
  ## Full [MR, NR] micro-tile: accumulate packedA · packedB over the kc
  ## dimension, then apply the C = αAB + βC epilogue.
  ukernel_impl()   # {.dirty.}: injects MR, NR and the AB accumulators
  gebb_ukernel_epilogue(
    alpha, cast[array[MR, array[NR, T]]](AB),
    beta, vC)
proc gebb_ukernel_edge*[T; ukernel: static MicroKernel](
      mr, nr, kc: int,
      alpha: T, packedA, packedB: ptr UncheckedArray[T],
      beta: T, vC: MatrixView[T]
    ) =
  ## Edge micro-tile: same accumulation as gebb_ukernel, but only the
  ## top-left [mr, nr] corner is written back to C.
  ukernel_impl()   # {.dirty.}: injects MR, NR and the AB accumulators
  gebb_ukernel_edge_epilogue(
    alpha, cast[array[MR, array[NR, T]]](AB),
    beta, vC, mr, nr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment