Skip to content

Instantly share code, notes, and snippets.

@mchristianl
Created January 11, 2022 11:47
Show Gist options
  • Save mchristianl/6a7664dc792a7db3183e3266f64d93d9 to your computer and use it in GitHub Desktop.
Save mchristianl/6a7664dc792a7db3183e3266f64d93d9 to your computer and use it in GitHub Desktop.
Packing and unpacking bits into Vec{8,Bool} for Julia
using SIMD, BenchmarkTools
# for bit interleaving
# interleavemask(::Type{T},N,M) where T = reduce(|,(T(iszero(n & (1 << N))) << n for n in 0:sizeof(T)*8-1))
# interleavemasks(::Type{T}) where T = ((interleavemask(T,N) for N in 0:trailing_zeros(sizeof(T))+2)...,)
# a slow function to compute periodic bit patterns for the use as constants
interleavemask(::Type{T},period,amount,shift,start=1,stop=sizeof(T)*8) where T =
reduce(|,T(((n + (shift > 0 ? period-shift : -shift)) % period) < amount) << n for n in start-1:stop-1)
# period,amount,shift = (9,1,4)
# [ (n,"($n-$shift)%$period<$amount",(((max(n + shift,0)) % period)) < amount) for n in 0:sizeof(UInt64)*8-1 ]
# bitstring(interleavemask(UInt64,9,1,-0))
# x0 = reinterpret(UInt64,Vec(true,true,true,true,true,true,true,true))
# bitstring(x0)
# x1 = (x0 | (x0 >> (8-1))) & interleavemask(UInt64,16,2,0)
# bitstring(x1)
# x2 = (x1 | (x1 >> (16-2))) & interleavemask(UInt64,32,4,0)
# bitstring(x2)
# x3 = (x2 | (x2 >> (32-4))) & interleavemask(UInt64,64,8,0)
# bitstring(x3)
# unsafe_trunc(UInt8,x3)
# https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
function bitpack1(x::Vec{8,Bool})
x0 = reinterpret(UInt64,x)
x1 = (x0 | (x0 >> ( 8-1))) & interleavemask(UInt64,16,2,0) # 0x0003000300030003
x2 = (x1 | (x1 >> (16-2))) & interleavemask(UInt64,32,4,0) # 0x0000000f0000000f
x3 = (x2 | (x2 >> (32-4))) & interleavemask(UInt64,64,8,0) # 0x00000000000000ff
unsafe_trunc(UInt8,x3)
end
# bitpack1
# movq (%rdi), %rax
# movq %rax, %rcx
# shrq $7, %rcx
# orq %rax, %rcx
# movabsq $844437815230467, %rax # imm = 0x3000300030003
# andq %rcx, %rax
# movq %rax, %rcx
# shrq $14, %rcx
# orq %rax, %rcx
# movq %rcx, %rax
# shrq $28, %rax
# orl %ecx, %eax
# ret
function bitpack1(x::Vec{4,Bool})
x0 = reinterpret(UInt32,x)
x1 = (x0 | (x0 >> ( 8-1))) & interleavemask(UInt32,16,2,0) # 0x00030003
x2 = (x1 | (x1 >> (16-2))) & interleavemask(UInt32,32,4,0) # 0x0000000f
unsafe_trunc(UInt8,x2)
end
# https://stackoverflow.com/questions/8461126/how-to-create-a-byte-out-of-8-bool-values-and-vice-versa
# On newer x86 CPUs with BMI2 there are PEXT and PDEP instructions for this purpose.
# https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set#BMI2
# https://stackoverflow.com/questions/14547087/extracting-bits-with-a-single-multiplication
function bitpack2_rev(xs::Vec{8,Bool})
x = reinterpret(UInt64,xs)
m = interleavemask(UInt64,9,1,0) # 0x8040201008040201
return unsafe_trunc(UInt8,(x*m) >> (64-8))
end
# bitpack2_rev
# movabsq $-9205322385119247871, %rax # imm = 0x8040201008040201
# imulq (%rdi), %rax
# shrq $56, %rax
# retq
function bitpack2(xs::Vec{8,Bool})
x = reinterpret(UInt64,xs)
m = interleavemask(UInt64,7,1,0,2,63) # 0x0102040810204080
return unsafe_trunc(UInt8,(x*m) >> (64-8))
end
# bitpack2
# movabsq $72624976668147840, %rax # imm = 0x102040810204080
# imulq (%rdi), %rax
# shrq $56, %rax
# retq
function bitpack3(x::Vec{8,Bool}) # ✗
s = Vec((UInt8(n) for n in 0:7)...)
x0 = reinterpret(Vec{8,UInt8},x)
x1 = x0 << s
reduce(|,x1)
end
# bitpack3
# vmovq (%rdi), %xmm0 # xmm0 = mem[0],zero
# vpmovzxbw %xmm0, %ymm0 # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
# movabsq $.rodata.cst32, %rax
# vpmullw (%rax), %ymm0, %ymm0
# movabsq $139838074081344, %rax # imm = 0x7F2E96BB5040
# vpand (%rax), %ymm0, %ymm0
# vextracti128 $1, %ymm0, %xmm1
# vpackuswb %xmm1, %xmm0, %xmm0
# vpshufd $85, %xmm0, %xmm1 # xmm1 = xmm0[1,1,1,1]
# vpor %xmm1, %xmm0, %xmm0
# vpsrld $16, %xmm0, %xmm1
# vpor %xmm1, %xmm0, %xmm0
# vpsrlw $8, %xmm0, %xmm1
# vpor %xmm1, %xmm0, %xmm0
# vmovd %xmm0, %eax
# vzeroupper
# retq
function bitunpack1_rev(x::UInt8)
m1 = interleavemask(UInt64,9,1,0) # 0x8040201008040201
m2 = interleavemask(UInt64,8,1,7) # 0x8080808080808080
return reinterpret(Vec{8,Bool},((x*m1) & m2) >> (8-1))
end
# bitunpack1_rev
# movl %edi, %eax
# movabsq $-9205322385119247871, %rcx # imm = 0x8040201008040201
# imulq %rax, %rcx
# shrq $7, %rcx
# movabsq $72340172838076673, %rax # imm = 0x101010101010101
# andq %rcx, %rax
# vmovq %rax, %xmm0
# retq
# https://stackoverflow.com/questions/14537831/isolate-specific-row-column-diagonal-from-a-64-bit-number/14539116#14539116
function bitunpack1(x::UInt8)
m1 = interleavemask(UInt64,7,1,1) # 0x0204081020408102
m2 = interleavemask(UInt64,8,1,0) # 0x0101010101010101
x1 = x >> 1 # remove first bit
return reinterpret(Vec{8,Bool},((x1*m1 | x) & m2))
end
# bitunpack1
# movl %edi, %eax
# movl %edi, %ecx
# shrb %cl
# movzbl %cl, %ecx
# movabsq $145249953336295682, %rdx # imm = 0x204081020408102
# imulq %rcx, %rdx
# orq %rdx, %rax
# movabsq $72340172838076673, %rcx # imm = 0x101010101010101
# andq %rax, %rcx
# vmovq %rcx, %xmm0
# retq
function bitunpack2(x::UInt8)
m1 = interleavemask(UInt64,7,1,0) # 0x8102040810204081
m2 = interleavemask(UInt64,8,1,0) # 0x0101010101010101
x1 = x & (~0b1) # remove first bit
return reinterpret(Vec{8,Bool},((x1*m1 | x) & m2))
end
# bitunpack2
# movl %edi, %eax
# andl $-2, %edi
# movabsq $-9150747060186627967, %rcx # imm = 0x8102040810204081
# imulq %rdi, %rcx
# orq %rcx, %rax
# movabsq $72340172838076673, %rcx # imm = 0x101010101010101
# andq %rax, %rcx
# vmovq %rcx, %xmm0
# retq
function bitunpack3(x::UInt8)
# bit2vec_rev
m1 = interleavemask(UInt64,9,1,0) # 0x8040201008040201
m2 = interleavemask(UInt64,8,1,7) # 0x8080808080808080
v = reinterpret(Vec{8,Bool},((x*m1) & m2) >> (8-1))
return shufflevector(v,Val((7,6,5,4,3,2,1,0)))
end
# bitunpack3
# movl %edi, %eax
# movabsq $-9205322385119247871, %rcx # imm = 0x8040201008040201
# imulq %rax, %rcx
# shrq $7, %rcx
# movabsq $72340172838076673, %rax # imm = 0x101010101010101
# andq %rcx, %rax
# vmovq %rax, %xmm0
# movabsq $.rodata.cst16, %rax
# vpshufb (%rax), %xmm0, %xmm0
# retq
const bitpack = bitpack2
const bitunpack = bitunpack2
const bitpack_rev = bitpack2_rev
const bitunpack_rev = bitunpack1_rev
for x in UnitRange(typemin(UInt8),typemax(UInt8))
@assert bitpack( bitunpack( x)) == x
@assert bitpack_rev(bitunpack_rev(x)) == x
v = bitunpack(x)
@assert bitpack1(v) == bitpack2(v) == bitpack3(v)
@assert (( reinterpret(UInt64,bitunpack1(x))
== reinterpret(UInt64,bitunpack2(x))
== reinterpret(UInt64,bitunpack3(x))
))
end
println("bitpack1" ); code_native(bitpack1 ,(Vec{8,Bool},); debuginfo=:none)
println("bitpack2_rev" ); code_native(bitpack2_rev ,(Vec{8,Bool},); debuginfo=:none)
println("bitpack2" ); code_native(bitpack2 ,(Vec{8,Bool},); debuginfo=:none)
println("bitpack3" ); code_native(bitpack3 ,(Vec{8,Bool},); debuginfo=:none)
println("bitunpack1_rev"); code_native(bitunpack1_rev,(UInt8 ,); debuginfo=:none)
println("bitunpack1" ); code_native(bitunpack1 ,(UInt8 ,); debuginfo=:none)
println("bitunpack2" ); code_native(bitunpack2 ,(UInt8 ,); debuginfo=:none)
println("bitunpack3" ); code_native(bitunpack3 ,(UInt8 ,); debuginfo=:none)
x = rand(UInt8)
v = bitunpack(x)
println("bitpack1" ); display(@benchmark bitpack1( $v))
println("bitpack2_rev" ); display(@benchmark bitpack2_rev( $v))
println("bitpack2" ); display(@benchmark bitpack2( $v))
println("bitpack3" ); display(@benchmark bitpack3( $v))
println("bitunpack1_rev"); display(@benchmark bitunpack1_rev($x))
println("bitunpack1" ); display(@benchmark bitunpack1( $x))
println("bitunpack2" ); display(@benchmark bitunpack2( $x))
println("bitunpack3" ); display(@benchmark bitunpack3( $x))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment