Created
January 11, 2022 11:47
-
-
Save mchristianl/6a7664dc792a7db3183e3266f64d93d9 to your computer and use it in GitHub Desktop.
Packing and unpacking bits into Vec{8,Bool} for Julia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using SIMD, BenchmarkTools | |
# for bit interleaving | |
# interleavemask(::Type{T},N,M) where T = reduce(|,(T(iszero(n & (1 << N))) << n for n in 0:sizeof(T)*8-1)) | |
# interleavemasks(::Type{T}) where T = ((interleavemask(T,N) for N in 0:trailing_zeros(sizeof(T))+2)...,) | |
# a slow function to compute periodic bit patterns for the use as constants | |
interleavemask(::Type{T},period,amount,shift,start=1,stop=sizeof(T)*8) where T = | |
reduce(|,T(((n + (shift > 0 ? period-shift : -shift)) % period) < amount) << n for n in start-1:stop-1) | |
# period,amount,shift = (9,1,4) | |
# [ (n,"($n-$shift)%$period<$amount",(((max(n + shift,0)) % period)) < amount) for n in 0:sizeof(UInt64)*8-1 ] | |
# bitstring(interleavemask(UInt64,9,1,-0)) | |
# x0 = reinterpret(UInt64,Vec(true,true,true,true,true,true,true,true)) | |
# bitstring(x0) | |
# x1 = (x0 | (x0 >> (8-1))) & interleavemask(UInt64,16,2,0) | |
# bitstring(x1) | |
# x2 = (x1 | (x1 >> (16-2))) & interleavemask(UInt64,32,4,0) | |
# bitstring(x2) | |
# x3 = (x2 | (x2 >> (32-4))) & interleavemask(UInt64,64,8,0) | |
# bitstring(x3) | |
# unsafe_trunc(UInt8,x3) | |
# https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/ | |
function bitpack1(x::Vec{8,Bool}) | |
x0 = reinterpret(UInt64,x) | |
x1 = (x0 | (x0 >> ( 8-1))) & interleavemask(UInt64,16,2,0) # 0x0003000300030003 | |
x2 = (x1 | (x1 >> (16-2))) & interleavemask(UInt64,32,4,0) # 0x0000000f0000000f | |
x3 = (x2 | (x2 >> (32-4))) & interleavemask(UInt64,64,8,0) # 0x00000000000000ff | |
unsafe_trunc(UInt8,x3) | |
end | |
# bitpack1 | |
# movq (%rdi), %rax | |
# movq %rax, %rcx | |
# shrq $7, %rcx | |
# orq %rax, %rcx | |
# movabsq $844437815230467, %rax # imm = 0x3000300030003 | |
# andq %rcx, %rax | |
# movq %rax, %rcx | |
# shrq $14, %rcx | |
# orq %rax, %rcx | |
# movq %rcx, %rax | |
# shrq $28, %rax | |
# orl %ecx, %eax | |
# ret | |
function bitpack1(x::Vec{4,Bool}) | |
x0 = reinterpret(UInt32,x) | |
x1 = (x0 | (x0 >> ( 8-1))) & interleavemask(UInt32,16,2,0) # 0x00030003 | |
x2 = (x1 | (x1 >> (16-2))) & interleavemask(UInt32,32,4,0) # 0x0000000f | |
unsafe_trunc(UInt8,x2) | |
end | |
# https://stackoverflow.com/questions/8461126/how-to-create-a-byte-out-of-8-bool-values-and-vice-versa | |
# On newer x86 CPUs with BMI2 there are PEXT and PDEP instructions for this purpose. | |
# https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set#BMI2 | |
# https://stackoverflow.com/questions/14547087/extracting-bits-with-a-single-multiplication | |
function bitpack2_rev(xs::Vec{8,Bool}) | |
x = reinterpret(UInt64,xs) | |
m = interleavemask(UInt64,9,1,0) # 0x8040201008040201 | |
return unsafe_trunc(UInt8,(x*m) >> (64-8)) | |
end | |
# bitpack2_rev | |
# movabsq $-9205322385119247871, %rax # imm = 0x8040201008040201 | |
# imulq (%rdi), %rax | |
# shrq $56, %rax | |
# retq | |
function bitpack2(xs::Vec{8,Bool}) | |
x = reinterpret(UInt64,xs) | |
m = interleavemask(UInt64,7,1,0,2,63) # 0x0102040810204080 | |
return unsafe_trunc(UInt8,(x*m) >> (64-8)) | |
end | |
# bitpack2 | |
# movabsq $72624976668147840, %rax # imm = 0x102040810204080 | |
# imulq (%rdi), %rax | |
# shrq $56, %rax | |
# retq | |
function bitpack3(x::Vec{8,Bool}) # ✗ | |
s = Vec((UInt8(n) for n in 0:7)...) | |
x0 = reinterpret(Vec{8,UInt8},x) | |
x1 = x0 << s | |
reduce(|,x1) | |
end | |
# bitpack3 | |
# vmovq (%rdi), %xmm0 # xmm0 = mem[0],zero | |
# vpmovzxbw %xmm0, %ymm0 # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero | |
# movabsq $.rodata.cst32, %rax | |
# vpmullw (%rax), %ymm0, %ymm0 | |
# movabsq $139838074081344, %rax # imm = 0x7F2E96BB5040 | |
# vpand (%rax), %ymm0, %ymm0 | |
# vextracti128 $1, %ymm0, %xmm1 | |
# vpackuswb %xmm1, %xmm0, %xmm0 | |
# vpshufd $85, %xmm0, %xmm1 # xmm1 = xmm0[1,1,1,1] | |
# vpor %xmm1, %xmm0, %xmm0 | |
# vpsrld $16, %xmm0, %xmm1 | |
# vpor %xmm1, %xmm0, %xmm0 | |
# vpsrlw $8, %xmm0, %xmm1 | |
# vpor %xmm1, %xmm0, %xmm0 | |
# vmovd %xmm0, %eax | |
# vzeroupper | |
# retq | |
function bitunpack1_rev(x::UInt8) | |
m1 = interleavemask(UInt64,9,1,0) # 0x8040201008040201 | |
m2 = interleavemask(UInt64,8,1,7) # 0x8080808080808080 | |
return reinterpret(Vec{8,Bool},((x*m1) & m2) >> (8-1)) | |
end | |
# bitunpack1_rev | |
# movl %edi, %eax | |
# movabsq $-9205322385119247871, %rcx # imm = 0x8040201008040201 | |
# imulq %rax, %rcx | |
# shrq $7, %rcx | |
# movabsq $72340172838076673, %rax # imm = 0x101010101010101 | |
# andq %rcx, %rax | |
# vmovq %rax, %xmm0 | |
# retq | |
# https://stackoverflow.com/questions/14537831/isolate-specific-row-column-diagonal-from-a-64-bit-number/14539116#14539116 | |
function bitunpack1(x::UInt8) | |
m1 = interleavemask(UInt64,7,1,1) # 0x0204081020408102 | |
m2 = interleavemask(UInt64,8,1,0) # 0x0101010101010101 | |
x1 = x >> 1 # remove first bit | |
return reinterpret(Vec{8,Bool},((x1*m1 | x) & m2)) | |
end | |
# bitunpack1 | |
# movl %edi, %eax | |
# movl %edi, %ecx | |
# shrb %cl | |
# movzbl %cl, %ecx | |
# movabsq $145249953336295682, %rdx # imm = 0x204081020408102 | |
# imulq %rcx, %rdx | |
# orq %rdx, %rax | |
# movabsq $72340172838076673, %rcx # imm = 0x101010101010101 | |
# andq %rax, %rcx | |
# vmovq %rcx, %xmm0 | |
# retq | |
function bitunpack2(x::UInt8) | |
m1 = interleavemask(UInt64,7,1,0) # 0x8102040810204081 | |
m2 = interleavemask(UInt64,8,1,0) # 0x0101010101010101 | |
x1 = x & (~0b1) # remove first bit | |
return reinterpret(Vec{8,Bool},((x1*m1 | x) & m2)) | |
end | |
# bitunpack2 | |
# movl %edi, %eax | |
# andl $-2, %edi | |
# movabsq $-9150747060186627967, %rcx # imm = 0x8102040810204081 | |
# imulq %rdi, %rcx | |
# orq %rcx, %rax | |
# movabsq $72340172838076673, %rcx # imm = 0x101010101010101 | |
# andq %rax, %rcx | |
# vmovq %rcx, %xmm0 | |
# retq | |
function bitunpack3(x::UInt8) | |
# bit2vec_rev | |
m1 = interleavemask(UInt64,9,1,0) # 0x8040201008040201 | |
m2 = interleavemask(UInt64,8,1,7) # 0x8080808080808080 | |
v = reinterpret(Vec{8,Bool},((x*m1) & m2) >> (8-1)) | |
return shufflevector(v,Val((7,6,5,4,3,2,1,0))) | |
end | |
# bitunpack3 | |
# movl %edi, %eax | |
# movabsq $-9205322385119247871, %rcx # imm = 0x8040201008040201 | |
# imulq %rax, %rcx | |
# shrq $7, %rcx | |
# movabsq $72340172838076673, %rax # imm = 0x101010101010101 | |
# andq %rcx, %rax | |
# vmovq %rax, %xmm0 | |
# movabsq $.rodata.cst16, %rax | |
# vpshufb (%rax), %xmm0, %xmm0 | |
# retq | |
const bitpack = bitpack2 | |
const bitunpack = bitunpack2 | |
const bitpack_rev = bitpack2_rev | |
const bitunpack_rev = bitunpack1_rev | |
for x in UnitRange(typemin(UInt8),typemax(UInt8)) | |
@assert bitpack( bitunpack( x)) == x | |
@assert bitpack_rev(bitunpack_rev(x)) == x | |
v = bitunpack(x) | |
@assert bitpack1(v) == bitpack2(v) == bitpack3(v) | |
@assert (( reinterpret(UInt64,bitunpack1(x)) | |
== reinterpret(UInt64,bitunpack2(x)) | |
== reinterpret(UInt64,bitunpack3(x)) | |
)) | |
end | |
println("bitpack1" ); code_native(bitpack1 ,(Vec{8,Bool},); debuginfo=:none) | |
println("bitpack2_rev" ); code_native(bitpack2_rev ,(Vec{8,Bool},); debuginfo=:none) | |
println("bitpack2" ); code_native(bitpack2 ,(Vec{8,Bool},); debuginfo=:none) | |
println("bitpack3" ); code_native(bitpack3 ,(Vec{8,Bool},); debuginfo=:none) | |
println("bitunpack1_rev"); code_native(bitunpack1_rev,(UInt8 ,); debuginfo=:none) | |
println("bitunpack1" ); code_native(bitunpack1 ,(UInt8 ,); debuginfo=:none) | |
println("bitunpack2" ); code_native(bitunpack2 ,(UInt8 ,); debuginfo=:none) | |
println("bitunpack3" ); code_native(bitunpack3 ,(UInt8 ,); debuginfo=:none) | |
x = rand(UInt8) | |
v = bitunpack(x) | |
println("bitpack1" ); display(@benchmark bitpack1( $v)) | |
println("bitpack2_rev" ); display(@benchmark bitpack2_rev( $v)) | |
println("bitpack2" ); display(@benchmark bitpack2( $v)) | |
println("bitpack3" ); display(@benchmark bitpack3( $v)) | |
println("bitunpack1_rev"); display(@benchmark bitunpack1_rev($x)) | |
println("bitunpack1" ); display(@benchmark bitunpack1( $x)) | |
println("bitunpack2" ); display(@benchmark bitunpack2( $x)) | |
println("bitunpack3" ); display(@benchmark bitunpack3( $x)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment