Skip to content

Instantly share code, notes, and snippets.

@ArhanChaudhary
Last active August 14, 2025 14:07
Show Gist options
  • Save ArhanChaudhary/51bef8e39312f45f8d9c9cdc44f1f9ba to your computer and use it in GitHub Desktop.
Save ArhanChaudhary/51bef8e39312f45f8d9c9cdc44f1f9ba to your computer and use it in GitHub Desktop.
Fastest Rubik's cube composition and inversion operations (see https://godbolt.org/z/eYevrErG7)
NOTE: all instruction counts ignore the initial and final SIMD register load instructions
AVX2 composition: 5 instructions, averaging 700M/sec on an Intel Xeon E5-2667 v3
AVX2 inversion: 26 instructions, averaging 175M/sec on an Intel Xeon E5-2667 v3
ARM64 composition: 19 instructions, averaging 870M/sec on an Apple M4
ARM64 inversion: 48 instructions, averaging 400M/sec on an Apple M4
#![feature(abi_vectorcall, portable_simd)]
use std::simd::cmp::SimdOrd;
use std::simd::u8x32;
use std::arch::x86_64::_mm256_shuffle_epi8;
fn avx2_swizzle_lo(a: u8x32, b: u8x32) -> u8x32 {
unsafe {
_mm256_shuffle_epi8(a.into(), b.into()).into()
}
}
/// A representation of a 3x3 cube in a __m256i vector. The following design
/// has been taken from Andrew Skalski's [vcube].
///
/// Low 128 bits:
///
/// ```text
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ----11--
/// ----11-1
/// ----111-
/// ----1111
/// ```
///
/// dash = unused (zero) \
/// E = edge index (0-11) \
/// O = edge orientation (0-1)
///
/// High 128 bits:
///
/// ```text
/// --OO-CCC
/// --OO-CCC
/// --OO-CCC
/// --OO-CCC
/// --OO-CCC
/// --OO-CCC
/// --OO-CCC
/// --OO-CCC
/// ----1---
/// ----1--1
/// ----1-1-
/// ----1-11
/// ----11--
/// ----11-1
/// ----111-
/// ----1111
/// ```
///
/// dash = unused (zero) \
/// C = corner index (0-7) \
/// O = corner orientation (0-2)
///
/// It is important for the unused bytes to correspond to their index for the
/// `_mm256_shuffle_epi8` instruction to work correctly.
///
/// [vcube]: https://github.com/Voltara/vcube
pub struct Cube3(u8x32);
/// Extract the orientation bits from the cube state.
const ORI_MASK: u8x32 = u8x32::splat(0b0011_0000);
/// The carry constant used to fix orientation bits after permutation.
const ORI_CARRY: u8x32 = u8x32::from_array([
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
0x30, 0x30, 0x30, 0x30,
]);
/// Extract the permutation bits from the cube state.
const PERM_MASK: u8x32 = u8x32::splat(0b0000_1111);
/// The carry constant used to fix orientation bits after inversion.
const ORI_CARRY_INVERSE: u8x32 = u8x32::from_array([
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
]);
impl Cube3 {
/// Compose two Rubik's cube states (apply `b` to `a`)
///
/// In practice the three vmovqda instructions and the vzeroupper
/// instruction would be optimized away by the "vectorcall" ABI.
/// See https://learn.microsoft.com/en-us/cpp/cpp/vectorcall?view=msvc-170
pub extern "vectorcall" fn replace_compose(&mut self, a: &Self, b: &Self) {
// First use _mm256_shuffle_epi8 to compose the permutation. Note
// that the SIMD instruction shuffles its argument bytes by the
// lower four bits of the second argument, meaning orientation will
// not interfere.
let mut composed = avx2_swizzle_lo(a.0, b.0);
// Composing permutation composes the orientation bits too. "The
// Cubie Level" of Kociemba's [website] explains that orientation
// during composition changes like so: (A*B)(x).o=A(B(x).c).o+B(x).o
// We've just done the first part, so we now need to add the add
// the orientation bits of the second argument to the first.
//
// [website]: https://kociemba.org/cube.htm
composed += b.0 & ORI_MASK;
// Once added, the orientation bits may be in an invalid state. Each
// corner orientation index is defined as 0, 1, or 2, but it may be
// 4 or 5 after the addition. We subtract the value 0b0011_0000 or 3
// from each orientation value and minimize it with the original
// value. This will do nothing to values that are already 0, 1, or 2
// because of overflow, but it will set the values 3 and 4 to 0 and
// 1 respectively.
composed = composed.simd_min(composed - ORI_CARRY);
self.0 = composed;
}
/// Inverse a Rubik's cube state
///
/// In practice the two vmovqda instructions and the vzeroupper
/// instruction would be optimized away by the "vectorcall" ABI.
/// See https://learn.microsoft.com/en-us/cpp/cpp/vectorcall?view=msvc-170
pub extern "vectorcall" fn replace_inverse(&mut self, a: &Self) {
// Permutation inversion taken from Andrew Skalski's [vcube].
//
// 27720 (11*9*8*7*5) is the LCM of all possible cycle
// decompositions, so we can invert the permutation by raising it to
// the 27719th power. The addition chain for 27719 was generated
// using the calculator provided by Achim Flammenkamp on his
// [website].
//
// [vcube]: https://github.com/Voltara/vcube
// [website]: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
// Extract the permutation bits from the cube state
let perm = a.0 & PERM_MASK;
let pow_3 = avx2_swizzle_lo(avx2_swizzle_lo(perm, perm), perm);
let mut inverse = avx2_swizzle_lo(pow_3, pow_3);
inverse = avx2_swizzle_lo(inverse, inverse);
inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), pow_3);
inverse = avx2_swizzle_lo(inverse, inverse);
inverse = avx2_swizzle_lo(inverse, inverse);
inverse = avx2_swizzle_lo(inverse, inverse);
inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), perm);
inverse = avx2_swizzle_lo(inverse, inverse);
inverse = avx2_swizzle_lo(inverse, inverse);
inverse = avx2_swizzle_lo(inverse, inverse);
inverse = avx2_swizzle_lo(inverse, inverse);
inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), pow_3);
inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), perm);
// Compose orientation as explained in `replace_compose`
// xoring this with `perm` does not make this faster
let mut added_ori = a.0 & ORI_MASK;
added_ori += added_ori;
// The orientation for edges remain the same during inversion, so
// we slightly modify the carry constant
added_ori = added_ori.simd_min(added_ori - ORI_CARRY_INVERSE);
// Use the inverse permutation to permute the already inversed
// orientation bits
added_ori = avx2_swizzle_lo(added_ori, inverse);
*self = Cube3(inverse | added_ori);
}
}
#![feature(portable_simd)]
use std::simd::cmp::SimdOrd;
use std::simd::u8x8;
use std::simd::u8x16;
/// A compressed 3x3 cube representation. The byte layout is as follows.
///
/// `edges`
///
/// ```text
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ---OEEEE
/// ----11--
/// ----11-1
/// ----111-
/// ----1111
/// ```
///
/// dash = unused (zero) \
/// E = edge index (0-11) \
/// O = edge orientation (0-1)
///
/// `corners`
///
/// ```text
/// ---OOCCC
/// ---OOCCC
/// ---OOCCC
/// ---OOCCC
/// ---OOCCC
/// ---OOCCC
/// ---OOCCC
/// ---OOCCC
/// ```
///
/// dash = unused (zero) \
/// C = corner index (0-7) \
/// O = corner orientation (0-2)
///
/// It is important for the unused bytes to correspond to their index for the
/// swizzling to permute the values in place.
pub struct Cube3 {
edges: u8x16,
corners: u8x8,
}
/// Masks for edge and corner orientations and permutations.
const EDGE_ORI_MASK: u8x16 = u8x16::splat(0b0001_0000);
const EDGE_PERM_MASK: u8x16 = u8x16::splat(0b0000_1111);
const CORNER_ORI_MASK: u8x8 = u8x8::splat(0b0011_0000);
const CORNER_PERM_MASK: u8x8 = u8x8::splat(0b0000_0111);
/// A lookup table used to inverse a corner orientation.
const CO_INV_SWIZZLE: u8x8 = u8x8::from_array([0, 2, 1, 0, 0, 0, 0, 0]);
impl Cube3 {
/// Compose two Rubik's cube states (apply `b` to `a`)
pub fn replace_compose(&mut self, a: &Self, b: &Self) {
// We do the same thing as before
let mut edges_composed = a.edges.swizzle_dyn(b.edges & EDGE_PERM_MASK);
edges_composed ^= b.edges & EDGE_ORI_MASK; // Xor is modulo two
let mut corners_composed = a.corners.swizzle_dyn(b.corners & CORNER_PERM_MASK);
corners_composed += b.corners & CORNER_ORI_MASK;
corners_composed = corners_composed.simd_min(corners_composed - CORNER_ORI_MASK);
self.edges = edges_composed;
self.corners = corners_composed;
}
/// Inverse a Rubik's cube state
pub fn replace_inverse(&mut self, a: &Self) {
// We do the same thing as before for edges and corners separately
let ep = a.edges & EDGE_PERM_MASK;
let pow_3_ep = ep.swizzle_dyn(ep).swizzle_dyn(ep);
let mut inverse_ep = pow_3_ep.swizzle_dyn(pow_3_ep);
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(pow_3_ep);
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(ep);
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(pow_3_ep);
inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(ep);
let mut inverse_eo = a.edges & EDGE_ORI_MASK;
// eo doesn't change during inversion; all we need to do is permute it
inverse_eo = inverse_eo.swizzle_dyn(inverse_ep);
self.edges = inverse_eo | inverse_ep;
let cp = a.corners & CORNER_PERM_MASK;
// Since there are only 8 corners we take the 839th power, or
// LCM(1..8) - 1
let pow_3_cp = cp.swizzle_dyn(cp).swizzle_dyn(cp);
let pow_6_cp = pow_3_cp.swizzle_dyn(pow_3_cp);
let pow_7_cp = pow_6_cp.swizzle_dyn(cp);
let mut inverse_cp = pow_7_cp.swizzle_dyn(pow_6_cp);
inverse_cp = inverse_cp.swizzle_dyn(inverse_cp);
inverse_cp = inverse_cp.swizzle_dyn(inverse_cp);
inverse_cp = inverse_cp.swizzle_dyn(inverse_cp);
inverse_cp = inverse_cp.swizzle_dyn(inverse_cp);
inverse_cp = inverse_cp.swizzle_dyn(inverse_cp);
inverse_cp = inverse_cp.swizzle_dyn(inverse_cp).swizzle_dyn(pow_7_cp);
// Move the target bits to LSB to make them valid indexes
let mut inverse_co = a.corners >> 3;
// Like the edge orientation, corner orientation is defined as
// either 0, 1, or 2. Adding two corner orientations together may result
// in 3 or 4. It was found fastest to use a lookup table to perform
// this correction
inverse_co = CO_INV_SWIZZLE
.swizzle_dyn(inverse_co)
.swizzle_dyn(inverse_cp);
// Restore the original position
inverse_co <<= 3;
self.corners = inverse_co | inverse_cp;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment