Last active
August 14, 2025 14:07
-
-
Save ArhanChaudhary/51bef8e39312f45f8d9c9cdc44f1f9ba to your computer and use it in GitHub Desktop.
Fastest Rubik's cube composition and inversion operations (see https://godbolt.org/z/eYevrErG7)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
NOTE: all instruction counts ignore the initial and final SIMD register load instructions | |
AVX2 composition: 5 instructions, averaging 700M/sec on an Intel Xeon E5-2667 v3 | |
AVX2 inversion: 26 instructions, averaging 175M/sec on an Intel Xeon E5-2667 v3 | |
ARM64 composition: 19 instructions, averaging 870M/sec on an Apple M4 | |
ARM64 inversion: 48 instructions, averaging 400M/sec on an Apple M4 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![feature(abi_vectorcall, portable_simd)] | |
use std::simd::cmp::SimdOrd; | |
use std::simd::u8x32; | |
use std::arch::x86_64::_mm256_shuffle_epi8; | |
fn avx2_swizzle_lo(a: u8x32, b: u8x32) -> u8x32 { | |
unsafe { | |
_mm256_shuffle_epi8(a.into(), b.into()).into() | |
} | |
} | |
/// A representation of a 3x3 cube in a __m256i vector. The following design | |
/// has been taken from Andrew Skalski's [vcube]. | |
/// | |
/// Low 128 bits: | |
/// | |
/// ```text | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ----11-- | |
/// ----11-1 | |
/// ----111- | |
/// ----1111 | |
/// ``` | |
/// | |
/// dash = unused (zero) \ | |
/// E = edge index (0-11) \ | |
/// O = edge orientation (0-1) | |
/// | |
/// High 128 bits: | |
/// | |
/// ```text | |
/// --OO-CCC | |
/// --OO-CCC | |
/// --OO-CCC | |
/// --OO-CCC | |
/// --OO-CCC | |
/// --OO-CCC | |
/// --OO-CCC | |
/// --OO-CCC | |
/// ----1--- | |
/// ----1--1 | |
/// ----1-1- | |
/// ----1-11 | |
/// ----11-- | |
/// ----11-1 | |
/// ----111- | |
/// ----1111 | |
/// ``` | |
/// | |
/// dash = unused (zero) \ | |
/// C = corner index (0-7) \ | |
/// O = corner orientation (0-2) | |
/// | |
/// It is important for the unused bytes to correspond to their index for the | |
/// `_mm256_shuffle_epi8` instruction to work correctly. | |
/// | |
/// [vcube]: https://github.com/Voltara/vcube | |
pub struct Cube3(u8x32); | |
/// Extract the orientation bits from the cube state. | |
const ORI_MASK: u8x32 = u8x32::splat(0b0011_0000); | |
/// The carry constant used to fix orientation bits after permutation. | |
const ORI_CARRY: u8x32 = u8x32::from_array([ | |
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, | |
0x20, 0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, | |
0x30, 0x30, 0x30, 0x30, | |
]); | |
/// Extract the permutation bits from the cube state. | |
const PERM_MASK: u8x32 = u8x32::splat(0b0000_1111); | |
/// The carry constant used to fix orientation bits after inversion. | |
const ORI_CARRY_INVERSE: u8x32 = u8x32::from_array([ | |
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, | |
0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, | |
]); | |
impl Cube3 { | |
/// Compose two Rubik's cube states (apply `b` to `a`) | |
/// | |
/// In practice the three vmovqda instructions and the vzeroupper | |
/// instruction would be optimized away by the "vectorcall" ABI. | |
/// See https://learn.microsoft.com/en-us/cpp/cpp/vectorcall?view=msvc-170 | |
pub extern "vectorcall" fn replace_compose(&mut self, a: &Self, b: &Self) { | |
// First use _mm256_shuffle_epi8 to compose the permutation. Note | |
// that the SIMD instruction shuffles its argument bytes by the | |
// lower four bits of the second argument, meaning orientation will | |
// not interfere. | |
let mut composed = avx2_swizzle_lo(a.0, b.0); | |
// Composing permutation composes the orientation bits too. "The | |
// Cubie Level" of Kociemba's [website] explains that orientation | |
// during composition changes like so: (A*B)(x).o=A(B(x).c).o+B(x).o | |
// We've just done the first part, so we now need to add the add | |
// the orientation bits of the second argument to the first. | |
// | |
// [website]: https://kociemba.org/cube.htm | |
composed += b.0 & ORI_MASK; | |
// Once added, the orientation bits may be in an invalid state. Each | |
// corner orientation index is defined as 0, 1, or 2, but it may be | |
// 4 or 5 after the addition. We subtract the value 0b0011_0000 or 3 | |
// from each orientation value and minimize it with the original | |
// value. This will do nothing to values that are already 0, 1, or 2 | |
// because of overflow, but it will set the values 3 and 4 to 0 and | |
// 1 respectively. | |
composed = composed.simd_min(composed - ORI_CARRY); | |
self.0 = composed; | |
} | |
/// Inverse a Rubik's cube state | |
/// | |
/// In practice the two vmovqda instructions and the vzeroupper | |
/// instruction would be optimized away by the "vectorcall" ABI. | |
/// See https://learn.microsoft.com/en-us/cpp/cpp/vectorcall?view=msvc-170 | |
pub extern "vectorcall" fn replace_inverse(&mut self, a: &Self) { | |
// Permutation inversion taken from Andrew Skalski's [vcube]. | |
// | |
// 27720 (11*9*8*7*5) is the LCM of all possible cycle | |
// decompositions, so we can invert the permutation by raising it to | |
// the 27719th power. The addition chain for 27719 was generated | |
// using the calculator provided by Achim Flammenkamp on his | |
// [website]. | |
// | |
// [vcube]: https://github.com/Voltara/vcube | |
// [website]: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html | |
// Extract the permutation bits from the cube state | |
let perm = a.0 & PERM_MASK; | |
let pow_3 = avx2_swizzle_lo(avx2_swizzle_lo(perm, perm), perm); | |
let mut inverse = avx2_swizzle_lo(pow_3, pow_3); | |
inverse = avx2_swizzle_lo(inverse, inverse); | |
inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), pow_3); | |
inverse = avx2_swizzle_lo(inverse, inverse); | |
inverse = avx2_swizzle_lo(inverse, inverse); | |
inverse = avx2_swizzle_lo(inverse, inverse); | |
inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), perm); | |
inverse = avx2_swizzle_lo(inverse, inverse); | |
inverse = avx2_swizzle_lo(inverse, inverse); | |
inverse = avx2_swizzle_lo(inverse, inverse); | |
inverse = avx2_swizzle_lo(inverse, inverse); | |
inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), pow_3); | |
inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), perm); | |
// Compose orientation as explained in `replace_compose` | |
// xoring this with `perm` does not make this faster | |
let mut added_ori = a.0 & ORI_MASK; | |
added_ori += added_ori; | |
// The orientation for edges remain the same during inversion, so | |
// we slightly modify the carry constant | |
added_ori = added_ori.simd_min(added_ori - ORI_CARRY_INVERSE); | |
// Use the inverse permutation to permute the already inversed | |
// orientation bits | |
added_ori = avx2_swizzle_lo(added_ori, inverse); | |
*self = Cube3(inverse | added_ori); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![feature(portable_simd)] | |
use std::simd::cmp::SimdOrd; | |
use std::simd::u8x8; | |
use std::simd::u8x16; | |
/// A compressed 3x3 cube representation. The byte layout is as follows. | |
/// | |
/// `edges` | |
/// | |
/// ```text | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ---OEEEE | |
/// ----11-- | |
/// ----11-1 | |
/// ----111- | |
/// ----1111 | |
/// ``` | |
/// | |
/// dash = unused (zero) \ | |
/// E = edge index (0-11) \ | |
/// O = edge orientation (0-1) | |
/// | |
/// `corners` | |
/// | |
/// ```text | |
/// ---OOCCC | |
/// ---OOCCC | |
/// ---OOCCC | |
/// ---OOCCC | |
/// ---OOCCC | |
/// ---OOCCC | |
/// ---OOCCC | |
/// ---OOCCC | |
/// ``` | |
/// | |
/// dash = unused (zero) \ | |
/// C = corner index (0-7) \ | |
/// O = corner orientation (0-2) | |
/// | |
/// It is important for the unused bytes to correspond to their index for the | |
/// swizzling to permute the values in place. | |
pub struct Cube3 { | |
edges: u8x16, | |
corners: u8x8, | |
} | |
/// Masks for edge and corner orientations and permutations. | |
const EDGE_ORI_MASK: u8x16 = u8x16::splat(0b0001_0000); | |
const EDGE_PERM_MASK: u8x16 = u8x16::splat(0b0000_1111); | |
const CORNER_ORI_MASK: u8x8 = u8x8::splat(0b0011_0000); | |
const CORNER_PERM_MASK: u8x8 = u8x8::splat(0b0000_0111); | |
/// A lookup table used to inverse a corner orientation. | |
const CO_INV_SWIZZLE: u8x8 = u8x8::from_array([0, 2, 1, 0, 0, 0, 0, 0]); | |
impl Cube3 { | |
/// Compose two Rubik's cube states (apply `b` to `a`) | |
pub fn replace_compose(&mut self, a: &Self, b: &Self) { | |
// We do the same thing as before | |
let mut edges_composed = a.edges.swizzle_dyn(b.edges & EDGE_PERM_MASK); | |
edges_composed ^= b.edges & EDGE_ORI_MASK; // Xor is modulo two | |
let mut corners_composed = a.corners.swizzle_dyn(b.corners & CORNER_PERM_MASK); | |
corners_composed += b.corners & CORNER_ORI_MASK; | |
corners_composed = corners_composed.simd_min(corners_composed - CORNER_ORI_MASK); | |
self.edges = edges_composed; | |
self.corners = corners_composed; | |
} | |
/// Inverse a Rubik's cube state | |
pub fn replace_inverse(&mut self, a: &Self) { | |
// We do the same thing as before for edges and corners separately | |
let ep = a.edges & EDGE_PERM_MASK; | |
let pow_3_ep = ep.swizzle_dyn(ep).swizzle_dyn(ep); | |
let mut inverse_ep = pow_3_ep.swizzle_dyn(pow_3_ep); | |
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep); | |
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(pow_3_ep); | |
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep); | |
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep); | |
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep); | |
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(ep); | |
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep); | |
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep); | |
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep); | |
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep); | |
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(pow_3_ep); | |
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(ep); | |
let mut inverse_eo = a.edges & EDGE_ORI_MASK; | |
// eo doesn't change during inversion; all we need to do is permute it | |
inverse_eo = inverse_eo.swizzle_dyn(inverse_ep); | |
self.edges = inverse_eo | inverse_ep; | |
let cp = a.corners & CORNER_PERM_MASK; | |
// Since there are only 8 corners we take the 839th power, or | |
// LCM(1..8) - 1 | |
let pow_3_cp = cp.swizzle_dyn(cp).swizzle_dyn(cp); | |
let pow_6_cp = pow_3_cp.swizzle_dyn(pow_3_cp); | |
let pow_7_cp = pow_6_cp.swizzle_dyn(cp); | |
let mut inverse_cp = pow_7_cp.swizzle_dyn(pow_6_cp); | |
inverse_cp = inverse_cp.swizzle_dyn(inverse_cp); | |
inverse_cp = inverse_cp.swizzle_dyn(inverse_cp); | |
inverse_cp = inverse_cp.swizzle_dyn(inverse_cp); | |
inverse_cp = inverse_cp.swizzle_dyn(inverse_cp); | |
inverse_cp = inverse_cp.swizzle_dyn(inverse_cp); | |
inverse_cp = inverse_cp.swizzle_dyn(inverse_cp).swizzle_dyn(pow_7_cp); | |
// Move the target bits to LSB to make them valid indexes | |
let mut inverse_co = a.corners >> 3; | |
// Like the edge orientation, corner orientation is defined as | |
// either 0, 1, or 2. Adding two corner orientations together may result | |
// in 3 or 4. It was found fastest to use a lookup table to perform | |
// this correction | |
inverse_co = CO_INV_SWIZZLE | |
.swizzle_dyn(inverse_co) | |
.swizzle_dyn(inverse_cp); | |
// Restore the original position | |
inverse_co <<= 3; | |
self.corners = inverse_co | inverse_cp; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment