mchristianl · January 11, 2022 11:47
diff --git a/bitpacking.jl b/bitpacking.jl
 using SIMD, BenchmarkTools

 # for bit interleaving
 # interleavemask(::Type{T},N,M) where T = reduce(|,(T(iszero(n & (1 << N))) << n for n in 0:sizeof(T)*8-1))
 # interleavemasks(::Type{T}) where T = ((interleavemask(T,N) for N in 0:trailing_zeros(sizeof(T))+2)...,)

 # a slow function to compute periodic bit patterns for the use as constants
 interleavemask(::Type{T},period,amount,shift,start=1,stop=sizeof(T)*8) where T =
  reduce(|,T(((n + (shift > 0 ? period-shift : -shift)) % period) < amount) << n for n in start-1:stop-1)

 # period,amount,shift = (9,1,4)
 # [ (n,"($n-$shift)%$period<$amount",(((max(n + shift,0)) % period)) < amount) for n in 0:sizeof(UInt64)*8-1 ]
 # bitstring(interleavemask(UInt64,9,1,-0))

 # x0 = reinterpret(UInt64,Vec(true,true,true,true,true,true,true,true))
 # bitstring(x0)
 # x1 = (x0 | (x0 >> (8-1))) & interleavemask(UInt64,16,2,0)
 # bitstring(x1)
 # x2 = (x1 | (x1 >> (16-2))) & interleavemask(UInt64,32,4,0)
 # bitstring(x2)
 # x3 = (x2 | (x2 >> (32-4))) & interleavemask(UInt64,64,8,0)
 # bitstring(x3)
 # unsafe_trunc(UInt8,x3)

 # https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
 function bitpack1(x::Vec{8,Bool})
  x0 = reinterpret(UInt64,x)
  x1 = (x0 | (x0 >> ( 8-1))) & interleavemask(UInt64,16,2,0) # 0x0003000300030003
  x2 = (x1 | (x1 >> (16-2))) & interleavemask(UInt64,32,4,0) # 0x0000000f0000000f
  x3 = (x2 | (x2 >> (32-4))) & interleavemask(UInt64,64,8,0) # 0x00000000000000ff
  unsafe_trunc(UInt8,x3)
 end
 # bitpack1
 #   movq    (%rdi), %rax
 #   movq    %rax, %rcx
 #   shrq    $7, %rcx
 #   orq     %rax, %rcx
 #   movabsq $844437815230467, %rax      # imm = 0x3000300030003
 #   andq    %rcx, %rax
 #   movq    %rax, %rcx
 #   shrq    $14, %rcx
 #   orq     %rax, %rcx
 #   movq    %rcx, %rax
 #   shrq    $28, %rax
 #   orl     %ecx, %eax
 #   ret

 function bitpack1(x::Vec{4,Bool})
  x0 = reinterpret(UInt32,x)
  x1 = (x0 | (x0 >> ( 8-1))) & interleavemask(UInt32,16,2,0) # 0x00030003
  x2 = (x1 | (x1 >> (16-2))) & interleavemask(UInt32,32,4,0) # 0x0000000f
  unsafe_trunc(UInt8,x2)
 end

 # https://stackoverflow.com/questions/8461126/how-to-create-a-byte-out-of-8-bool-values-and-vice-versa
 #   On newer x86 CPUs with BMI2 there are PEXT and PDEP instructions for this purpose. 
 #   https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set#BMI2
 # https://stackoverflow.com/questions/14547087/extracting-bits-with-a-single-multiplication
 function bitpack2_rev(xs::Vec{8,Bool})
  x = reinterpret(UInt64,xs)
  m = interleavemask(UInt64,9,1,0) # 0x8040201008040201
  return unsafe_trunc(UInt8,(x*m) >> (64-8))
 end
 # bitpack2_rev
 #   movabsq $-9205322385119247871, %rax # imm = 0x8040201008040201
 #   imulq   (%rdi), %rax
 #   shrq    $56, %rax
 #   retq

 function bitpack2(xs::Vec{8,Bool})
  x = reinterpret(UInt64,xs)
  m = interleavemask(UInt64,7,1,0,2,63) # 0x0102040810204080
  return unsafe_trunc(UInt8,(x*m) >> (64-8))
 end
 # bitpack2
 #   movabsq $72624976668147840, %rax    # imm = 0x102040810204080
 #   imulq   (%rdi), %rax
 #   shrq    $56, %rax
 #   retq

 function bitpack3(x::Vec{8,Bool}) # ✗
  s = Vec((UInt8(n) for n in 0:7)...)
  x0 = reinterpret(Vec{8,UInt8},x)
  x1 = x0 << s
  reduce(|,x1)
 end
 # bitpack3
 #   vmovq   (%rdi), %xmm0               # xmm0 = mem[0],zero
 #   vpmovzxbw       %xmm0, %ymm0        # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 #   movabsq $.rodata.cst32, %rax
 #   vpmullw (%rax), %ymm0, %ymm0
 #   movabsq $139838074081344, %rax      # imm = 0x7F2E96BB5040
 #   vpand   (%rax), %ymm0, %ymm0
 #   vextracti128    $1, %ymm0, %xmm1
 #   vpackuswb       %xmm1, %xmm0, %xmm0
 #   vpshufd $85, %xmm0, %xmm1           # xmm1 = xmm0[1,1,1,1]
 #   vpor    %xmm1, %xmm0, %xmm0
 #   vpsrld  $16, %xmm0, %xmm1
 #   vpor    %xmm1, %xmm0, %xmm0
 #   vpsrlw  $8, %xmm0, %xmm1
 #   vpor    %xmm1, %xmm0, %xmm0
 #   vmovd   %xmm0, %eax
 #   vzeroupper
 #   retq

 function bitunpack1_rev(x::UInt8)
  m1 = interleavemask(UInt64,9,1,0) # 0x8040201008040201
  m2 = interleavemask(UInt64,8,1,7) # 0x8080808080808080
  return reinterpret(Vec{8,Bool},((x*m1) & m2) >> (8-1))
 end
 # bitunpack1_rev
 #   movl    %edi, %eax
 #   movabsq $-9205322385119247871, %rcx # imm = 0x8040201008040201
 #   imulq   %rax, %rcx
 #   shrq    $7, %rcx
 #   movabsq $72340172838076673, %rax    # imm = 0x101010101010101
 #   andq    %rcx, %rax
 #   vmovq   %rax, %xmm0
 #   retq

 # https://stackoverflow.com/questions/14537831/isolate-specific-row-column-diagonal-from-a-64-bit-number/14539116#14539116
 function bitunpack1(x::UInt8)
  m1 = interleavemask(UInt64,7,1,1) # 0x0204081020408102
  m2 = interleavemask(UInt64,8,1,0) # 0x0101010101010101
  x1 = x >> 1 # remove first bit
  return reinterpret(Vec{8,Bool},((x1*m1 | x) & m2))
 end
 # bitunpack1
 #   movl    %edi, %eax
 #   movl    %edi, %ecx
 #   shrb    %cl
 #   movzbl  %cl, %ecx
 #   movabsq $145249953336295682, %rdx   # imm = 0x204081020408102
 #   imulq   %rcx, %rdx
 #   orq     %rdx, %rax
 #   movabsq $72340172838076673, %rcx    # imm = 0x101010101010101
 #   andq    %rax, %rcx
 #   vmovq   %rcx, %xmm0
 #   retq

 function bitunpack2(x::UInt8)
  m1 = interleavemask(UInt64,7,1,0) # 0x8102040810204081
  m2 = interleavemask(UInt64,8,1,0) # 0x0101010101010101
  x1 = x & (~0b1) # remove first bit
  return reinterpret(Vec{8,Bool},((x1*m1 | x) & m2))
 end
 # bitunpack2
 #   movl    %edi, %eax
 #   andl    $-2, %edi
 #   movabsq $-9150747060186627967, %rcx # imm = 0x8102040810204081
 #   imulq   %rdi, %rcx
 #   orq     %rcx, %rax
 #   movabsq $72340172838076673, %rcx    # imm = 0x101010101010101
 #   andq    %rax, %rcx
 #   vmovq   %rcx, %xmm0
 #   retq

 function bitunpack3(x::UInt8)
  # bit2vec_rev
  m1 = interleavemask(UInt64,9,1,0) # 0x8040201008040201
  m2 = interleavemask(UInt64,8,1,7) # 0x8080808080808080
  v =  reinterpret(Vec{8,Bool},((x*m1) & m2) >> (8-1))
  return shufflevector(v,Val((7,6,5,4,3,2,1,0)))
 end
 # bitunpack3
 #   movl    %edi, %eax
 #   movabsq $-9205322385119247871, %rcx # imm = 0x8040201008040201
 #   imulq   %rax, %rcx
 #   shrq    $7, %rcx
 #   movabsq $72340172838076673, %rax    # imm = 0x101010101010101
 #   andq    %rcx, %rax
 #   vmovq   %rax, %xmm0
 #   movabsq $.rodata.cst16, %rax
 #   vpshufb (%rax), %xmm0, %xmm0
 #   retq

 const bitpack       = bitpack2
 const bitunpack     = bitunpack2
 const bitpack_rev   = bitpack2_rev
 const bitunpack_rev = bitunpack1_rev

 for x in UnitRange(typemin(UInt8),typemax(UInt8))
  @assert bitpack(    bitunpack(    x)) == x
  @assert bitpack_rev(bitunpack_rev(x)) == x
  v = bitunpack(x)
  @assert bitpack1(v) == bitpack2(v) == bitpack3(v)
  @assert (( reinterpret(UInt64,bitunpack1(x))
          == reinterpret(UInt64,bitunpack2(x))
          == reinterpret(UInt64,bitunpack3(x))
          ))
 end

 println("bitpack1"      ); code_native(bitpack1      ,(Vec{8,Bool},); debuginfo=:none)
 println("bitpack2_rev"  ); code_native(bitpack2_rev  ,(Vec{8,Bool},); debuginfo=:none)
 println("bitpack2"      ); code_native(bitpack2      ,(Vec{8,Bool},); debuginfo=:none)
 println("bitpack3"      ); code_native(bitpack3      ,(Vec{8,Bool},); debuginfo=:none)
 println("bitunpack1_rev"); code_native(bitunpack1_rev,(UInt8      ,); debuginfo=:none)
 println("bitunpack1"    ); code_native(bitunpack1    ,(UInt8      ,); debuginfo=:none)
 println("bitunpack2"    ); code_native(bitunpack2    ,(UInt8      ,); debuginfo=:none)
 println("bitunpack3"    ); code_native(bitunpack3    ,(UInt8      ,); debuginfo=:none)

 x = rand(UInt8)
 v = bitunpack(x)
 println("bitpack1"      ); display(@benchmark bitpack1(      $v))
 println("bitpack2_rev"  ); display(@benchmark bitpack2_rev(  $v))
 println("bitpack2"      ); display(@benchmark bitpack2(      $v))
 println("bitpack3"      ); display(@benchmark bitpack3(      $v))
 println("bitunpack1_rev"); display(@benchmark bitunpack1_rev($x))
 println("bitunpack1"    ); display(@benchmark bitunpack1(    $x))
 println("bitunpack2"    ); display(@benchmark bitunpack2(    $x))
 println("bitunpack3"    ); display(@benchmark bitunpack3(    $x))
	using SIMD, BenchmarkTools

	# for bit interleaving
	# interleavemask(::Type{T},N,M) where T = reduce(\|,(T(iszero(n & (1 << N))) << n for n in 0:sizeof(T)*8-1))
	# interleavemasks(::Type{T}) where T = ((interleavemask(T,N) for N in 0:trailing_zeros(sizeof(T))+2)...,)

	# a slow function to compute periodic bit patterns for the use as constants
	interleavemask(::Type{T},period,amount,shift,start=1,stop=sizeof(T)*8) where T =
	reduce(\|,T(((n + (shift > 0 ? period-shift : -shift)) % period) < amount) << n for n in start-1:stop-1)

	# period,amount,shift = (9,1,4)
	# [ (n,"($n-$shift)%$period<$amount",(((max(n + shift,0)) % period)) < amount) for n in 0:sizeof(UInt64)*8-1 ]
	# bitstring(interleavemask(UInt64,9,1,-0))

	# x0 = reinterpret(UInt64,Vec(true,true,true,true,true,true,true,true))
	# bitstring(x0)
	# x1 = (x0 \| (x0 >> (8-1))) & interleavemask(UInt64,16,2,0)
	# bitstring(x1)
	# x2 = (x1 \| (x1 >> (16-2))) & interleavemask(UInt64,32,4,0)
	# bitstring(x2)
	# x3 = (x2 \| (x2 >> (32-4))) & interleavemask(UInt64,64,8,0)
	# bitstring(x3)
	# unsafe_trunc(UInt8,x3)

	# https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
	function bitpack1(x::Vec{8,Bool})
	x0 = reinterpret(UInt64,x)
	x1 = (x0 \| (x0 >> ( 8-1))) & interleavemask(UInt64,16,2,0) # 0x0003000300030003
	x2 = (x1 \| (x1 >> (16-2))) & interleavemask(UInt64,32,4,0) # 0x0000000f0000000f
	x3 = (x2 \| (x2 >> (32-4))) & interleavemask(UInt64,64,8,0) # 0x00000000000000ff
	unsafe_trunc(UInt8,x3)
	end
	# bitpack1
	# movq (%rdi), %rax
	# movq %rax, %rcx
	# shrq $7, %rcx
	# orq %rax, %rcx
	# movabsq $844437815230467, %rax # imm = 0x3000300030003
	# andq %rcx, %rax
	# movq %rax, %rcx
	# shrq $14, %rcx
	# orq %rax, %rcx
	# movq %rcx, %rax
	# shrq $28, %rax
	# orl %ecx, %eax
	# ret

	function bitpack1(x::Vec{4,Bool})
	x0 = reinterpret(UInt32,x)
	x1 = (x0 \| (x0 >> ( 8-1))) & interleavemask(UInt32,16,2,0) # 0x00030003
	x2 = (x1 \| (x1 >> (16-2))) & interleavemask(UInt32,32,4,0) # 0x0000000f
	unsafe_trunc(UInt8,x2)
	end

	# https://stackoverflow.com/questions/8461126/how-to-create-a-byte-out-of-8-bool-values-and-vice-versa
	# On newer x86 CPUs with BMI2 there are PEXT and PDEP instructions for this purpose.
	# https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set#BMI2
	# https://stackoverflow.com/questions/14547087/extracting-bits-with-a-single-multiplication
	function bitpack2_rev(xs::Vec{8,Bool})
	x = reinterpret(UInt64,xs)
	m = interleavemask(UInt64,9,1,0) # 0x8040201008040201
	return unsafe_trunc(UInt8,(x*m) >> (64-8))
	end
	# bitpack2_rev
	# movabsq $-9205322385119247871, %rax # imm = 0x8040201008040201
	# imulq (%rdi), %rax
	# shrq $56, %rax
	# retq

	function bitpack2(xs::Vec{8,Bool})
	x = reinterpret(UInt64,xs)
	m = interleavemask(UInt64,7,1,0,2,63) # 0x0102040810204080
	return unsafe_trunc(UInt8,(x*m) >> (64-8))
	end
	# bitpack2
	# movabsq $72624976668147840, %rax # imm = 0x102040810204080
	# imulq (%rdi), %rax
	# shrq $56, %rax
	# retq

	function bitpack3(x::Vec{8,Bool}) # ✗
	s = Vec((UInt8(n) for n in 0:7)...)
	x0 = reinterpret(Vec{8,UInt8},x)
	x1 = x0 << s
	reduce(\|,x1)
	end
	# bitpack3
	# vmovq (%rdi), %xmm0 # xmm0 = mem[0],zero
	# vpmovzxbw %xmm0, %ymm0 # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
	# movabsq $.rodata.cst32, %rax
	# vpmullw (%rax), %ymm0, %ymm0
	# movabsq $139838074081344, %rax # imm = 0x7F2E96BB5040
	# vpand (%rax), %ymm0, %ymm0
	# vextracti128 $1, %ymm0, %xmm1
	# vpackuswb %xmm1, %xmm0, %xmm0
	# vpshufd $85, %xmm0, %xmm1 # xmm1 = xmm0[1,1,1,1]
	# vpor %xmm1, %xmm0, %xmm0
	# vpsrld $16, %xmm0, %xmm1
	# vpor %xmm1, %xmm0, %xmm0
	# vpsrlw $8, %xmm0, %xmm1
	# vpor %xmm1, %xmm0, %xmm0
	# vmovd %xmm0, %eax
	# vzeroupper
	# retq

	function bitunpack1_rev(x::UInt8)
	m1 = interleavemask(UInt64,9,1,0) # 0x8040201008040201
	m2 = interleavemask(UInt64,8,1,7) # 0x8080808080808080
	return reinterpret(Vec{8,Bool},((x*m1) & m2) >> (8-1))
	end
	# bitunpack1_rev
	# movl %edi, %eax
	# movabsq $-9205322385119247871, %rcx # imm = 0x8040201008040201
	# imulq %rax, %rcx
	# shrq $7, %rcx
	# movabsq $72340172838076673, %rax # imm = 0x101010101010101
	# andq %rcx, %rax
	# vmovq %rax, %xmm0
	# retq

	# https://stackoverflow.com/questions/14537831/isolate-specific-row-column-diagonal-from-a-64-bit-number/14539116#14539116
	function bitunpack1(x::UInt8)
	m1 = interleavemask(UInt64,7,1,1) # 0x0204081020408102
	m2 = interleavemask(UInt64,8,1,0) # 0x0101010101010101
	x1 = x >> 1 # remove first bit
	return reinterpret(Vec{8,Bool},((x1*m1 \| x) & m2))
	end
	# bitunpack1
	# movl %edi, %eax
	# movl %edi, %ecx
	# shrb %cl
	# movzbl %cl, %ecx
	# movabsq $145249953336295682, %rdx # imm = 0x204081020408102
	# imulq %rcx, %rdx
	# orq %rdx, %rax
	# movabsq $72340172838076673, %rcx # imm = 0x101010101010101
	# andq %rax, %rcx
	# vmovq %rcx, %xmm0
	# retq

	function bitunpack2(x::UInt8)
	m1 = interleavemask(UInt64,7,1,0) # 0x8102040810204081
	m2 = interleavemask(UInt64,8,1,0) # 0x0101010101010101
	x1 = x & (~0b1) # remove first bit
	return reinterpret(Vec{8,Bool},((x1*m1 \| x) & m2))
	end
	# bitunpack2
	# movl %edi, %eax
	# andl $-2, %edi
	# movabsq $-9150747060186627967, %rcx # imm = 0x8102040810204081
	# imulq %rdi, %rcx
	# orq %rcx, %rax
	# movabsq $72340172838076673, %rcx # imm = 0x101010101010101
	# andq %rax, %rcx
	# vmovq %rcx, %xmm0
	# retq

	function bitunpack3(x::UInt8)
	# bit2vec_rev
	m1 = interleavemask(UInt64,9,1,0) # 0x8040201008040201
	m2 = interleavemask(UInt64,8,1,7) # 0x8080808080808080
	v = reinterpret(Vec{8,Bool},((x*m1) & m2) >> (8-1))
	return shufflevector(v,Val((7,6,5,4,3,2,1,0)))
	end
	# bitunpack3
	# movl %edi, %eax
	# movabsq $-9205322385119247871, %rcx # imm = 0x8040201008040201
	# imulq %rax, %rcx
	# shrq $7, %rcx
	# movabsq $72340172838076673, %rax # imm = 0x101010101010101
	# andq %rcx, %rax
	# vmovq %rax, %xmm0
	# movabsq $.rodata.cst16, %rax
	# vpshufb (%rax), %xmm0, %xmm0
	# retq

	const bitpack = bitpack2
	const bitunpack = bitunpack2
	const bitpack_rev = bitpack2_rev
	const bitunpack_rev = bitunpack1_rev

	for x in UnitRange(typemin(UInt8),typemax(UInt8))
	@assert bitpack( bitunpack( x)) == x
	@assert bitpack_rev(bitunpack_rev(x)) == x
	v = bitunpack(x)
	@assert bitpack1(v) == bitpack2(v) == bitpack3(v)
	@assert (( reinterpret(UInt64,bitunpack1(x))
	== reinterpret(UInt64,bitunpack2(x))
	== reinterpret(UInt64,bitunpack3(x))
	))
	end

	println("bitpack1" ); code_native(bitpack1 ,(Vec{8,Bool},); debuginfo=:none)
	println("bitpack2_rev" ); code_native(bitpack2_rev ,(Vec{8,Bool},); debuginfo=:none)
	println("bitpack2" ); code_native(bitpack2 ,(Vec{8,Bool},); debuginfo=:none)
	println("bitpack3" ); code_native(bitpack3 ,(Vec{8,Bool},); debuginfo=:none)
	println("bitunpack1_rev"); code_native(bitunpack1_rev,(UInt8 ,); debuginfo=:none)
	println("bitunpack1" ); code_native(bitunpack1 ,(UInt8 ,); debuginfo=:none)
	println("bitunpack2" ); code_native(bitunpack2 ,(UInt8 ,); debuginfo=:none)
	println("bitunpack3" ); code_native(bitunpack3 ,(UInt8 ,); debuginfo=:none)

	x = rand(UInt8)
	v = bitunpack(x)
	println("bitpack1" ); display(@benchmark bitpack1( $v))
	println("bitpack2_rev" ); display(@benchmark bitpack2_rev( $v))
	println("bitpack2" ); display(@benchmark bitpack2( $v))
	println("bitpack3" ); display(@benchmark bitpack3( $v))
	println("bitunpack1_rev"); display(@benchmark bitunpack1_rev($x))
	println("bitunpack1" ); display(@benchmark bitunpack1( $x))
	println("bitunpack2" ); display(@benchmark bitunpack2( $x))
	println("bitunpack3" ); display(@benchmark bitunpack3( $x))