Skip to content

Instantly share code, notes, and snippets.

@sandersaares
Last active December 31, 2024 05:22
Show Gist options
  • Save sandersaares/e0f059e7ae553e85ee64c34c334f50e6 to your computer and use it in GitHub Desktop.
Save sandersaares/e0f059e7ae553e85ee64c34c334f50e6 to your computer and use it in GitHub Desktop.
vectorized-rust
pub fn criterion_benchmark(c: &mut Criterion) {
const X_A: u64 = 94;
const X_B: u64 = 22;
const X: u64 = 8400 + 94 * 123456;
const Y_A: u64 = 34;
const Y_B: u64 = 67;
const Y: u64 = 5400 + 34 * 123456;
c.bench_function("naive", |b| {
b.iter(|| {
_ = black_box(naive_solver::solve(
black_box(X_A),
black_box(X_B),
black_box(X),
black_box(Y_A),
black_box(Y_B),
black_box(Y),
));
});
});
c.bench_function("faster_4_u64", |b| {
b.iter(|| {
_ = black_box(faster_solver_u64::solve::<4>(
black_box(X_A),
black_box(X_B),
black_box(X),
black_box(Y_A),
black_box(Y_B),
black_box(Y),
));
});
});
}
pub fn solve<const CHUNK_SIZE: usize>(x_a: u64, x_b: u64, x: u64, y_a: u64, y_b: u64, y: u64,
) -> Option<Solution>
where
LaneCount<CHUNK_SIZE>: SupportedLaneCount,
{
// .. snip ..
let max_a = (x / x_a).min(y / y_a);
let full_chunk_count = max_a / CHUNK_SIZE as u64;
for chunk_index in 0..full_chunk_count {
// .. snip ..
}
// .. snip ..
None
}
pub fn solve<const CHUNK_SIZE: usize>(x_a: u64, x_b: u64, x: u64, y_a: u64, y_b: u64, y: u64,
) -> Option<Solution>
where
LaneCount<CHUNK_SIZE>: SupportedLaneCount,
{
// .. snip ..
let max_a = (x / x_a).min(y / y_a);
let full_chunk_count = max_a / CHUNK_SIZE as u64;
// .. snip ..
// Values for A that fall into the final partial chunk. May be an empty range.
let mut partial_chunk_candidates = (CHUNK_SIZE as u64 * full_chunk_count)..=max_a;
// If there was any part of the sequence that didn't fit into a full chunk,
// we process it with the naive algorithm.
partial_chunk_candidates.find_map(|a| evaluate_naive(a, x_a, x_b, x, y_a, y_b, y))
}
pub fn solve<const CHUNK_SIZE: usize>(x_a: u64, x_b: u64, x: u64, y_a: u64, y_b: u64, y: u64,
) -> Option<Solution>
where
LaneCount<CHUNK_SIZE>: SupportedLaneCount,
{
// We will prepare vector versions of our constants
// as vector operations require vectors for input.
let x_a_vec = Simd::splat(x_a);
let x_b_vec = Simd::splat(x_b);
let x_vec = Simd::splat(x);
let y_a_vec = Simd::splat(y_a);
let y_b_vec = Simd::splat(y_b);
let y_vec = Simd::splat(y);
// .. snip ..
}
pub fn solve(x_a: u64, x_b: u64, x: u64, y_a: u64, y_b: u64, y: u64) -> Option<Solution> {
let max_a = (x / x_a).min(y / y_a);
(0..=max_a).find_map(|a| evaluate_naive(a, x_a, x_b, x, y_a, y_b, y))
}
pub(crate) fn evaluate_naive(
a_candidate: u64,
x_a: u64,
x_b: u64,
x: u64,
y_a: u64,
y_b: u64,
y: u64,
) -> Option<Solution> {
// If we assume the given A, what is the expected value of Xb * B?
let b_component_x = x - x_a * a_candidate;
// If not evenly divisible, there is no solution with this A.
if b_component_x % x_b != 0 {
return None;
}
let b_component_y = y - y_a * a_candidate;
// If not evenly divisible, there is no solution with this A.
if b_component_y % y_b != 0 {
return None;
}
let b_x = b_component_x / x_b;
let b_y = b_component_y / y_b;
// If values for B do not match, there is no solution with this A.
if b_x != b_y {
return None;
}
Some(Solution {
a: a_candidate,
b: b_x,
})
}
#[derive(Debug, PartialEq, Eq)]
pub struct Solution {
pub a: u64,
pub b: u64,
}
fn main() {
let start = Instant::now();
let solution = faster_solver_u64::solve::<4>(
black_box(26),
black_box(67),
black_box(12748 + 10000000000000),
black_box(66),
black_box(21),
black_box(12176 + 10000000000000),
);
assert_eq!(
solution,
Some(Solution {
a: 118679050709,
b: 103199174542
})
);
let duration = start.elapsed();
println!("Time elapsed: {} milliseconds", duration.as_millis());
}
fn evaluate_chunk<const CHUNK_SIZE: usize>(
a_candidates: Simd<u64, CHUNK_SIZE>,
x_a: Simd<u64, CHUNK_SIZE>,
x_b: Simd<u64, CHUNK_SIZE>,
x: Simd<u64, CHUNK_SIZE>,
y_a: Simd<u64, CHUNK_SIZE>,
y_b: Simd<u64, CHUNK_SIZE>,
y: Simd<u64, CHUNK_SIZE>,
) -> Mask<i64, CHUNK_SIZE>
where
LaneCount<CHUNK_SIZE>: SupportedLaneCount,
{
// If we assume the given A, what is the expected value of Xb * B?
let b_component_x = x - x_a * a_candidates;
// Probe what a matching value for B might be for our current A.
let b_x = b_component_x / x_b;
// If not evenly divisible to yield a B, there is no solution with this A.
let is_evenly_divisible_x = (b_x * x_b).simd_eq(b_component_x);
// Do the same for Y.
let b_component_y = y - y_a * a_candidates;
let b_y = b_component_y / y_b;
let is_evenly_divisible_y = (b_y * y_b).simd_eq(b_component_y);
// Both the X and Y equations must yield the same value for B.
let is_equal_x_y = b_x.simd_eq(b_y);
// Combine all the conditions to yield an "is valid solution" boolean.
is_evenly_divisible_x & is_evenly_divisible_y & is_equal_x_y
}
pub fn solve<const CHUNK_SIZE: usize>(x_a: u64, x_b: u64, x: u64, y_a: u64, y_b: u64, y: u64
) -> Option<Solution>
where
LaneCount<CHUNK_SIZE>: SupportedLaneCount,
{
// We will prepare vector versions of our constants
// as vector operations require vectors for input.
let x_a_vec = Simd::splat(x_a);
let x_b_vec = Simd::splat(x_b);
let x_vec = Simd::splat(x);
let y_a_vec = Simd::splat(y_a);
let y_b_vec = Simd::splat(y_b);
let y_vec = Simd::splat(y);
// This is a vector with the offsets used to go from a vector with the first
// candidate A (splatted to all lanes) to all the candidates in a chunk.
let mut candidate_offsets = Simd::splat(0);
for lane_index in 0..CHUNK_SIZE {
candidate_offsets[lane_index] = lane_index as u64;
}
let max_a = (x / x_a).min(y / y_a);
let full_chunk_count = max_a / CHUNK_SIZE as u64;
for chunk_index in 0..full_chunk_count {
let first_candidate_in_chunk = chunk_index * CHUNK_SIZE as u64;
// This gives us a vector with all the candidate A values in this chunk.
// e.g. [0, 1, 2, 3] for the first chunk.
let a_candidates =
Simd::<_, CHUNK_SIZE>::splat(first_candidate_in_chunk) + candidate_offsets;
let result = evaluate_chunk(a_candidates, x_a_vec, x_b_vec, x_vec, y_a_vec, y_b_vec, y_vec);
// We get booleans as output of the evaluation, answering "is this A a valid solution".
// Notably, the evaluation result does not give the actual numeric value for B.
// Therefore, if we found a valid solution, we still need to calculate B,
// which we do by falling back to the naive algorithm.
if result.any() {
let solution_index = result
.to_array()
.iter()
.position(|&x| x)
.expect("we verified that at least one element is true");
let a = a_candidates.as_array()[solution_index];
return Some(
evaluate_naive(a as u64, x_a, x_b, x, y_a, y_b, y)
.expect("we verified that this is a valid solution"),
);
}
}
// Values for A that fall into the final partial chunk. May be an empty range.
let mut partial_chunk_candidates = (CHUNK_SIZE as u64 * full_chunk_count)..=max_a;
// If there was any part of the sequence that didn't fit into a full chunk,
// we process it with the naive algorithm.
partial_chunk_candidates.find_map(|a| evaluate_naive(a, x_a, x_b, x, y_a, y_b, y))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment