ArhanChaudhary · August 14, 2025 14:07
diff --git a/_fastest_rubiks_operations b/_fastest_rubiks_operations
 NOTE: all instruction counts ignore the initial and final SIMD register load instructions

 AVX2 composition:  5  instructions, averaging 700M/sec on an Intel Xeon E5-2667 v3
 AVX2 inversion:    26 instructions, averaging 175M/sec on an Intel Xeon E5-2667 v3
 ARM64 composition: 19 instructions, averaging 870M/sec on an Apple M4
 ARM64 inversion:   48 instructions, averaging 400M/sec on an Apple M4
diff --git a/avx2_composition.rs b/avx2_composition.rs
 #![feature(abi_vectorcall, portable_simd)]

 use std::simd::cmp::SimdOrd;
 use std::simd::u8x32;
 use std::arch::x86_64::_mm256_shuffle_epi8;

 fn avx2_swizzle_lo(a: u8x32, b: u8x32) -> u8x32 {
    unsafe {
        _mm256_shuffle_epi8(a.into(), b.into()).into()
    }
 }

 /// A representation of a 3x3 cube in a __m256i vector. The following design
 /// has been taken from Andrew Skalski's [vcube].
 ///
 /// Low 128 bits:
 ///
 /// ```text
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ----11--
 /// ----11-1
 /// ----111-
 /// ----1111
 /// ```
 ///
 /// dash = unused (zero) \
 /// E = edge index (0-11) \
 /// O = edge orientation (0-1)
 ///
 /// High 128 bits:
 ///
 /// ```text
 /// --OO-CCC
 /// --OO-CCC
 /// --OO-CCC
 /// --OO-CCC
 /// --OO-CCC
 /// --OO-CCC
 /// --OO-CCC
 /// --OO-CCC
 /// ----1---
 /// ----1--1
 /// ----1-1-
 /// ----1-11
 /// ----11--
 /// ----11-1
 /// ----111-
 /// ----1111
 /// ```
 ///
 /// dash = unused (zero) \
 /// C = corner index (0-7) \
 /// O = corner orientation (0-2)
 ///
 /// It is important for the unused bytes to correspond to their index for the
 /// `_mm256_shuffle_epi8` instruction to work correctly.
 ///
 /// [vcube]: https://github.com/Voltara/vcube
 pub struct Cube3(u8x32);

 /// Extract the orientation bits from the cube state.
 const ORI_MASK: u8x32 = u8x32::splat(0b0011_0000);
 /// The carry constant used to fix orientation bits after permutation.
 const ORI_CARRY: u8x32 = u8x32::from_array([
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
    0x30, 0x30, 0x30, 0x30,
 ]);
 /// Extract the permutation bits from the cube state.
 const PERM_MASK: u8x32 = u8x32::splat(0b0000_1111);
 /// The carry constant used to fix orientation bits after inversion.
 const ORI_CARRY_INVERSE: u8x32 = u8x32::from_array([
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
    0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
 ]);

 impl Cube3 {
    /// Compose two Rubik's cube states (apply `b` to `a`)
    ///
    /// In practice the three vmovqda instructions and the vzeroupper
    /// instruction would be optimized away by the "vectorcall" ABI.
    /// See https://learn.microsoft.com/en-us/cpp/cpp/vectorcall?view=msvc-170
    pub extern "vectorcall" fn replace_compose(&mut self, a: &Self, b: &Self) {
        // First use _mm256_shuffle_epi8 to compose the permutation. Note
        // that the SIMD instruction shuffles its argument bytes by the
        // lower four bits of the second argument, meaning orientation will
        // not interfere.
        let mut composed = avx2_swizzle_lo(a.0, b.0);
        // Composing permutation composes the orientation bits too. "The
        // Cubie Level" of Kociemba's [website] explains that orientation
        // during composition changes like so: (A*B)(x).o=A(B(x).c).o+B(x).o
        // We've just done the first part, so we now need to add the add
        // the orientation bits of the second argument to the first.
        //
        // [website]: https://kociemba.org/cube.htm
        composed += b.0 & ORI_MASK;
        // Once added, the orientation bits may be in an invalid state. Each
        // corner orientation index is defined as 0, 1, or 2, but it may be
        // 4 or 5 after the addition. We subtract the value 0b0011_0000 or 3
        // from each orientation value and minimize it with the original
        // value. This will do nothing to values that are already 0, 1, or 2
        // because of overflow, but it will set the values 3 and 4 to 0 and
        // 1 respectively.
        composed = composed.simd_min(composed - ORI_CARRY);

        self.0 = composed;
    }

    /// Inverse a Rubik's cube state
    ///
    /// In practice the two vmovqda instructions and the vzeroupper
    /// instruction would be optimized away by the "vectorcall" ABI.
    /// See https://learn.microsoft.com/en-us/cpp/cpp/vectorcall?view=msvc-170
    pub extern "vectorcall" fn replace_inverse(&mut self, a: &Self) {
        // Permutation inversion taken from Andrew Skalski's [vcube].
        //
        // 27720 (11*9*8*7*5) is the LCM of all possible cycle
        // decompositions, so we can invert the permutation by raising it to
        // the 27719th power. The addition chain for 27719 was generated
        // using the calculator provided by Achim Flammenkamp on his
        // [website].
        //
        // [vcube]: https://github.com/Voltara/vcube
        // [website]: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html

        // Extract the permutation bits from the cube state
        let perm = a.0 & PERM_MASK;

        let pow_3 = avx2_swizzle_lo(avx2_swizzle_lo(perm, perm), perm);
        let mut inverse = avx2_swizzle_lo(pow_3, pow_3);
        inverse = avx2_swizzle_lo(inverse, inverse);
        inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), pow_3);
        inverse = avx2_swizzle_lo(inverse, inverse);
        inverse = avx2_swizzle_lo(inverse, inverse);
        inverse = avx2_swizzle_lo(inverse, inverse);
        inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), perm);
        inverse = avx2_swizzle_lo(inverse, inverse);
        inverse = avx2_swizzle_lo(inverse, inverse);
        inverse = avx2_swizzle_lo(inverse, inverse);
        inverse = avx2_swizzle_lo(inverse, inverse);
        inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), pow_3);
        inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), perm);

        // Compose orientation as explained in `replace_compose`
        // xoring this with `perm` does not make this faster
        let mut added_ori = a.0 & ORI_MASK;
        added_ori += added_ori;
        // The orientation for edges remain the same during inversion, so
        // we slightly modify the carry constant
        added_ori = added_ori.simd_min(added_ori - ORI_CARRY_INVERSE);
        // Use the inverse permutation to permute the already inversed
        // orientation bits
        added_ori = avx2_swizzle_lo(added_ori, inverse);
        *self = Cube3(inverse | added_ori);
    }
 }
diff --git a/simd8and16_composition.rs b/simd8and16_composition.rs
 #![feature(portable_simd)]

 use std::simd::cmp::SimdOrd;
 use std::simd::u8x8;
 use std::simd::u8x16;

 /// A compressed 3x3 cube representation. The byte layout is as follows.
 ///
 /// `edges`
 ///
 /// ```text
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ---OEEEE
 /// ----11--
 /// ----11-1
 /// ----111-
 /// ----1111
 /// ```
 ///
 /// dash = unused (zero) \
 /// E = edge index (0-11) \
 /// O = edge orientation (0-1)
 ///
 /// `corners`
 ///
 /// ```text
 /// ---OOCCC
 /// ---OOCCC
 /// ---OOCCC
 /// ---OOCCC
 /// ---OOCCC
 /// ---OOCCC
 /// ---OOCCC
 /// ---OOCCC
 /// ```
 ///
 /// dash = unused (zero) \
 /// C = corner index (0-7) \
 /// O = corner orientation (0-2)
 ///
 /// It is important for the unused bytes to correspond to their index for the
 /// swizzling to permute the values in place.
 pub struct Cube3 {
    edges: u8x16,
    corners: u8x8,
 }

 /// Masks for edge and corner orientations and permutations.
 const EDGE_ORI_MASK: u8x16 = u8x16::splat(0b0001_0000);
 const EDGE_PERM_MASK: u8x16 = u8x16::splat(0b0000_1111);
 const CORNER_ORI_MASK: u8x8 = u8x8::splat(0b0011_0000);
 const CORNER_PERM_MASK: u8x8 = u8x8::splat(0b0000_0111);
 /// A lookup table used to inverse a corner orientation.
 const CO_INV_SWIZZLE: u8x8 = u8x8::from_array([0, 2, 1, 0, 0, 0, 0, 0]);

 impl Cube3 {
    /// Compose two Rubik's cube states (apply `b` to `a`)
    pub fn replace_compose(&mut self, a: &Self, b: &Self) {
        // We do the same thing as before

        let mut edges_composed = a.edges.swizzle_dyn(b.edges & EDGE_PERM_MASK);
        edges_composed ^= b.edges & EDGE_ORI_MASK; // Xor is modulo two

        let mut corners_composed = a.corners.swizzle_dyn(b.corners & CORNER_PERM_MASK);
        corners_composed += b.corners & CORNER_ORI_MASK;
        corners_composed = corners_composed.simd_min(corners_composed - CORNER_ORI_MASK);

        self.edges = edges_composed;
        self.corners = corners_composed;
    }

    /// Inverse a Rubik's cube state
    pub fn replace_inverse(&mut self, a: &Self) {
        // We do the same thing as before for edges and corners separately

        let ep = a.edges & EDGE_PERM_MASK;

        let pow_3_ep = ep.swizzle_dyn(ep).swizzle_dyn(ep);
        let mut inverse_ep = pow_3_ep.swizzle_dyn(pow_3_ep);
        inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
        inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(pow_3_ep);
        inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
        inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
        inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
        inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(ep);
        inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
        inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
        inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
        inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
        inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(pow_3_ep);
        inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(ep);
        let mut inverse_eo = a.edges & EDGE_ORI_MASK;
        // eo doesn't change during inversion; all we need to do is permute it
        inverse_eo = inverse_eo.swizzle_dyn(inverse_ep);
        self.edges = inverse_eo | inverse_ep;

        let cp = a.corners & CORNER_PERM_MASK;

        // Since there are only 8 corners we take the 839th power, or
        // LCM(1..8) - 1
        let pow_3_cp = cp.swizzle_dyn(cp).swizzle_dyn(cp);
        let pow_6_cp = pow_3_cp.swizzle_dyn(pow_3_cp);
        let pow_7_cp = pow_6_cp.swizzle_dyn(cp);
        let mut inverse_cp = pow_7_cp.swizzle_dyn(pow_6_cp);
        inverse_cp = inverse_cp.swizzle_dyn(inverse_cp);
        inverse_cp = inverse_cp.swizzle_dyn(inverse_cp);
        inverse_cp = inverse_cp.swizzle_dyn(inverse_cp);
        inverse_cp = inverse_cp.swizzle_dyn(inverse_cp);
        inverse_cp = inverse_cp.swizzle_dyn(inverse_cp);
        inverse_cp = inverse_cp.swizzle_dyn(inverse_cp).swizzle_dyn(pow_7_cp);

        // Move the target bits to LSB to make them valid indexes
        let mut inverse_co = a.corners >> 3;
        // Like the edge orientation, corner orientation is defined as
        // either 0, 1, or 2. Adding two corner orientations together may result
        // in 3 or 4. It was found fastest to use a lookup table to perform
        // this correction
        inverse_co = CO_INV_SWIZZLE
            .swizzle_dyn(inverse_co)
            .swizzle_dyn(inverse_cp);
        // Restore the original position
        inverse_co <<= 3;
        self.corners = inverse_co | inverse_cp;
    }
 }
	NOTE: all instruction counts ignore the initial and final SIMD register load instructions

	AVX2 composition: 5 instructions, averaging 700M/sec on an Intel Xeon E5-2667 v3
	AVX2 inversion: 26 instructions, averaging 175M/sec on an Intel Xeon E5-2667 v3
	ARM64 composition: 19 instructions, averaging 870M/sec on an Apple M4
	ARM64 inversion: 48 instructions, averaging 400M/sec on an Apple M4
	#![feature(abi_vectorcall, portable_simd)]

	use std::simd::cmp::SimdOrd;
	use std::simd::u8x32;
	use std::arch::x86_64::_mm256_shuffle_epi8;

	fn avx2_swizzle_lo(a: u8x32, b: u8x32) -> u8x32 {
	unsafe {
	_mm256_shuffle_epi8(a.into(), b.into()).into()
	}
	}

	/// A representation of a 3x3 cube in a __m256i vector. The following design
	/// has been taken from Andrew Skalski's [vcube].
	///
	/// Low 128 bits:
	///
	/// ```text
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ----11--
	/// ----11-1
	/// ----111-
	/// ----1111
	/// ```
	///
	/// dash = unused (zero) \
	/// E = edge index (0-11) \
	/// O = edge orientation (0-1)
	///
	/// High 128 bits:
	///
	/// ```text
	/// --OO-CCC
	/// --OO-CCC
	/// --OO-CCC
	/// --OO-CCC
	/// --OO-CCC
	/// --OO-CCC
	/// --OO-CCC
	/// --OO-CCC
	/// ----1---
	/// ----1--1
	/// ----1-1-
	/// ----1-11
	/// ----11--
	/// ----11-1
	/// ----111-
	/// ----1111
	/// ```
	///
	/// dash = unused (zero) \
	/// C = corner index (0-7) \
	/// O = corner orientation (0-2)
	///
	/// It is important for the unused bytes to correspond to their index for the
	/// `_mm256_shuffle_epi8` instruction to work correctly.
	///
	/// [vcube]: https://github.com/Voltara/vcube
	pub struct Cube3(u8x32);

	/// Extract the orientation bits from the cube state.
	const ORI_MASK: u8x32 = u8x32::splat(0b0011_0000);
	/// The carry constant used to fix orientation bits after permutation.
	const ORI_CARRY: u8x32 = u8x32::from_array([
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
	0x30, 0x30, 0x30, 0x30,
	]);
	/// Extract the permutation bits from the cube state.
	const PERM_MASK: u8x32 = u8x32::splat(0b0000_1111);
	/// The carry constant used to fix orientation bits after inversion.
	const ORI_CARRY_INVERSE: u8x32 = u8x32::from_array([
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
	]);

	impl Cube3 {
	/// Compose two Rubik's cube states (apply `b` to `a`)
	///
	/// In practice the three vmovqda instructions and the vzeroupper
	/// instruction would be optimized away by the "vectorcall" ABI.
	/// See https://learn.microsoft.com/en-us/cpp/cpp/vectorcall?view=msvc-170
	pub extern "vectorcall" fn replace_compose(&mut self, a: &Self, b: &Self) {
	// First use _mm256_shuffle_epi8 to compose the permutation. Note
	// that the SIMD instruction shuffles its argument bytes by the
	// lower four bits of the second argument, meaning orientation will
	// not interfere.
	let mut composed = avx2_swizzle_lo(a.0, b.0);
	// Composing permutation composes the orientation bits too. "The
	// Cubie Level" of Kociemba's [website] explains that orientation
	// during composition changes like so: (A*B)(x).o=A(B(x).c).o+B(x).o
	// We've just done the first part, so we now need to add the add
	// the orientation bits of the second argument to the first.
	//
	// [website]: https://kociemba.org/cube.htm
	composed += b.0 & ORI_MASK;
	// Once added, the orientation bits may be in an invalid state. Each
	// corner orientation index is defined as 0, 1, or 2, but it may be
	// 4 or 5 after the addition. We subtract the value 0b0011_0000 or 3
	// from each orientation value and minimize it with the original
	// value. This will do nothing to values that are already 0, 1, or 2
	// because of overflow, but it will set the values 3 and 4 to 0 and
	// 1 respectively.
	composed = composed.simd_min(composed - ORI_CARRY);

	self.0 = composed;
	}

	/// Inverse a Rubik's cube state
	///
	/// In practice the two vmovqda instructions and the vzeroupper
	/// instruction would be optimized away by the "vectorcall" ABI.
	/// See https://learn.microsoft.com/en-us/cpp/cpp/vectorcall?view=msvc-170
	pub extern "vectorcall" fn replace_inverse(&mut self, a: &Self) {
	// Permutation inversion taken from Andrew Skalski's [vcube].
	//
	// 27720 (119875) is the LCM of all possible cycle
	// decompositions, so we can invert the permutation by raising it to
	// the 27719th power. The addition chain for 27719 was generated
	// using the calculator provided by Achim Flammenkamp on his
	// [website].
	//
	// [vcube]: https://github.com/Voltara/vcube
	// [website]: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html

	// Extract the permutation bits from the cube state
	let perm = a.0 & PERM_MASK;

	let pow_3 = avx2_swizzle_lo(avx2_swizzle_lo(perm, perm), perm);
	let mut inverse = avx2_swizzle_lo(pow_3, pow_3);
	inverse = avx2_swizzle_lo(inverse, inverse);
	inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), pow_3);
	inverse = avx2_swizzle_lo(inverse, inverse);
	inverse = avx2_swizzle_lo(inverse, inverse);
	inverse = avx2_swizzle_lo(inverse, inverse);
	inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), perm);
	inverse = avx2_swizzle_lo(inverse, inverse);
	inverse = avx2_swizzle_lo(inverse, inverse);
	inverse = avx2_swizzle_lo(inverse, inverse);
	inverse = avx2_swizzle_lo(inverse, inverse);
	inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), pow_3);
	inverse = avx2_swizzle_lo(avx2_swizzle_lo(inverse, inverse), perm);

	// Compose orientation as explained in `replace_compose`
	// xoring this with `perm` does not make this faster
	let mut added_ori = a.0 & ORI_MASK;
	added_ori += added_ori;
	// The orientation for edges remain the same during inversion, so
	// we slightly modify the carry constant
	added_ori = added_ori.simd_min(added_ori - ORI_CARRY_INVERSE);
	// Use the inverse permutation to permute the already inversed
	// orientation bits
	added_ori = avx2_swizzle_lo(added_ori, inverse);
	*self = Cube3(inverse \| added_ori);
	}
	}
	#![feature(portable_simd)]

	use std::simd::cmp::SimdOrd;
	use std::simd::u8x8;
	use std::simd::u8x16;

	/// A compressed 3x3 cube representation. The byte layout is as follows.
	///
	/// `edges`
	///
	/// ```text
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ---OEEEE
	/// ----11--
	/// ----11-1
	/// ----111-
	/// ----1111
	/// ```
	///
	/// dash = unused (zero) \
	/// E = edge index (0-11) \
	/// O = edge orientation (0-1)
	///
	/// `corners`
	///
	/// ```text
	/// ---OOCCC
	/// ---OOCCC
	/// ---OOCCC
	/// ---OOCCC
	/// ---OOCCC
	/// ---OOCCC
	/// ---OOCCC
	/// ---OOCCC
	/// ```
	///
	/// dash = unused (zero) \
	/// C = corner index (0-7) \
	/// O = corner orientation (0-2)
	///
	/// It is important for the unused bytes to correspond to their index for the
	/// swizzling to permute the values in place.
	pub struct Cube3 {
	edges: u8x16,
	corners: u8x8,
	}

	/// Masks for edge and corner orientations and permutations.
	const EDGE_ORI_MASK: u8x16 = u8x16::splat(0b0001_0000);
	const EDGE_PERM_MASK: u8x16 = u8x16::splat(0b0000_1111);
	const CORNER_ORI_MASK: u8x8 = u8x8::splat(0b0011_0000);
	const CORNER_PERM_MASK: u8x8 = u8x8::splat(0b0000_0111);
	/// A lookup table used to inverse a corner orientation.
	const CO_INV_SWIZZLE: u8x8 = u8x8::from_array([0, 2, 1, 0, 0, 0, 0, 0]);

	impl Cube3 {
	/// Compose two Rubik's cube states (apply `b` to `a`)
	pub fn replace_compose(&mut self, a: &Self, b: &Self) {
	// We do the same thing as before

	let mut edges_composed = a.edges.swizzle_dyn(b.edges & EDGE_PERM_MASK);
	edges_composed ^= b.edges & EDGE_ORI_MASK; // Xor is modulo two

	let mut corners_composed = a.corners.swizzle_dyn(b.corners & CORNER_PERM_MASK);
	corners_composed += b.corners & CORNER_ORI_MASK;
	corners_composed = corners_composed.simd_min(corners_composed - CORNER_ORI_MASK);

	self.edges = edges_composed;
	self.corners = corners_composed;
	}

	/// Inverse a Rubik's cube state
	pub fn replace_inverse(&mut self, a: &Self) {
	// We do the same thing as before for edges and corners separately

	let ep = a.edges & EDGE_PERM_MASK;

	let pow_3_ep = ep.swizzle_dyn(ep).swizzle_dyn(ep);
	let mut inverse_ep = pow_3_ep.swizzle_dyn(pow_3_ep);
	inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
	inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(pow_3_ep);
	inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
	inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
	inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
	inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(ep);
	inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
	inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
	inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
	inverse_ep = inverse_ep.swizzle_dyn(inverse_ep);
	inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(pow_3_ep);
	inverse_ep = inverse_ep.swizzle_dyn(inverse_ep).swizzle_dyn(ep);
	let mut inverse_eo = a.edges & EDGE_ORI_MASK;
	// eo doesn't change during inversion; all we need to do is permute it
	inverse_eo = inverse_eo.swizzle_dyn(inverse_ep);
	self.edges = inverse_eo \| inverse_ep;

	let cp = a.corners & CORNER_PERM_MASK;

	// Since there are only 8 corners we take the 839th power, or
	// LCM(1..8) - 1
	let pow_3_cp = cp.swizzle_dyn(cp).swizzle_dyn(cp);
	let pow_6_cp = pow_3_cp.swizzle_dyn(pow_3_cp);
	let pow_7_cp = pow_6_cp.swizzle_dyn(cp);
	let mut inverse_cp = pow_7_cp.swizzle_dyn(pow_6_cp);
	inverse_cp = inverse_cp.swizzle_dyn(inverse_cp);
	inverse_cp = inverse_cp.swizzle_dyn(inverse_cp);
	inverse_cp = inverse_cp.swizzle_dyn(inverse_cp);
	inverse_cp = inverse_cp.swizzle_dyn(inverse_cp);
	inverse_cp = inverse_cp.swizzle_dyn(inverse_cp);
	inverse_cp = inverse_cp.swizzle_dyn(inverse_cp).swizzle_dyn(pow_7_cp);

	// Move the target bits to LSB to make them valid indexes
	let mut inverse_co = a.corners >> 3;
	// Like the edge orientation, corner orientation is defined as
	// either 0, 1, or 2. Adding two corner orientations together may result
	// in 3 or 4. It was found fastest to use a lookup table to perform
	// this correction
	inverse_co = CO_INV_SWIZZLE
	.swizzle_dyn(inverse_co)
	.swizzle_dyn(inverse_cp);
	// Restore the original position
	inverse_co <<= 3;
	self.corners = inverse_co \| inverse_cp;
	}
	}