Skip to content

Instantly share code, notes, and snippets.

@findepi
Last active February 12, 2025 15:43
Show Gist options
  • Save findepi/89497d13a3a249a1d2d1b6d7c2f8b927 to your computer and use it in GitHub Desktop.
Save findepi/89497d13a3a249a1d2d1b6d7c2f8b927 to your computer and use it in GitHub Desktop.
Allocation elimination, loops and benchmark results

This is collected information related to

See 20_bench_code.rs for the code under study. See 21_bench_results.md for benchmark results (detailed results from a laptop and baseline results from EC2 Graviton). The benchmarks were run with a release version of rustc.

See 30_rustc_patch.diff for the compiler patch addressing the rust-lang/rust#128854 issue. With the patch, the benchmarked functions compile to the same assembly. Yet they perform very differently when embedded in a loop (as the benchmark does and as calling code might do).

#[macro_use]
extern crate criterion;
use crate::criterion::Criterion;
use std::time::Duration;
use itertools::izip;
use rand::Rng;
type Result<T> = std::result::Result<T, String>;
// #[no_mangle]
// #[inline(always)]
// #[inline(never)]
/// Baseline variant: adds the four arguments directly, with no closures involved.
///
/// The sum is wrapped in `Ok` so the signature lines up with the curried and
/// continuation-passing variants it is benchmarked against.
fn simple_sum(a: i32, b: i32, c: i32, d: i32) -> Result<i32> {
    let total = a + b + c + d;
    Ok(total)
}
// #[no_mangle]
// #[inline(always)]
// #[inline(never)]
/// Curried variant: sums four integers by feeding them one at a time through the
/// chain of boxed closures produced by `fn_fn_fn_fn`.
///
/// # Errors
///
/// Returns the first `Err` produced by any stage of the chain.
fn curried_sum(a: i32, b: i32, c: i32, d: i32) -> Result<i32> {
    let takes_b = fn_fn_fn_fn(a)?;
    let takes_c = takes_b(b)?;
    let takes_d = takes_c(c)?;
    let total = takes_d(d)?;
    Ok(total)
}
/// Curried four-argument addition.
///
/// Takes `a` and returns a boxed closure awaiting `b`, which returns a boxed
/// closure awaiting `c`, which returns a boxed closure awaiting `d` and finally
/// yields `a + b + c + d`. Every stage is wrapped in `Ok`.
fn fn_fn_fn_fn(
    a: i32,
) -> Result<Box<dyn Fn(i32) -> Result<Box<dyn Fn(i32) -> Result<Box<dyn Fn(i32) -> Result<i32>>>>>>>
{
    // Local names for the stages of the chain, innermost first.
    type TakesD = Box<dyn Fn(i32) -> Result<i32>>;
    type TakesC = Box<dyn Fn(i32) -> Result<TakesD>>;
    type TakesB = Box<dyn Fn(i32) -> Result<TakesC>>;

    let takes_b: TakesB = Box::new(move |b| {
        let takes_c: TakesC = Box::new(move |c| {
            let takes_d: TakesD = Box::new(move |d| Ok(a + b + c + d));
            Ok(takes_d)
        });
        Ok(takes_c)
    });
    Ok(takes_b)
}
// #[no_mangle]
// #[inline(always)]
// #[inline(never)]
/// Continuation-passing variant: sums four integers via `fn_continuation`.
///
/// Each stage arrives as a `Result` holding a borrowed closure (`&dyn Fn`).
/// `?` unwraps it (returning early on `Err`) before the next argument is applied
/// together with the callback that will receive the following stage.
fn continuation_sum(a: i32, b: i32, c: i32, d: i32) -> Result<i32> {
// Hand over `a`; `after_a` is the stage that still needs `b`, `c` and `d`.
fn_continuation(a, &|after_a| /*_*/ {
// Apply `b`; `after_b` still needs `c` and `d`.
after_a?(b, &|after_b| /*_*/ {
// Apply `c`; `after_c` only needs `d`.
after_b?(c, &|after_c| /*_*/ {
// Apply `d`, producing the final `Result<i32>`.
after_c?(d)
})
})
})
}
/// Continuation-passing counterpart of `fn_fn_fn_fn`.
///
/// Instead of returning a closure for the next argument, each stage is handed to a
/// caller-supplied callback as a borrowed `&dyn Fn` wrapped in `Result`:
///
/// * `after_a` receives the stage that accepts `b` plus an `after_b` callback,
/// * `after_b` receives the stage that accepts `c` plus an `after_c` callback,
/// * `after_c` receives the final stage, which accepts `d` and returns
///   `Ok(a + b + c + d)`.
///
/// Every stage is passed as `Ok(..)`; this function never constructs an `Err`
/// itself, so any error in the result comes from the callbacks.
fn fn_continuation(
a: i32,
after_a: &dyn Fn(
Result<
&dyn Fn(
/*b*/ i32,
/* after_b */
&dyn Fn(
Result<
&dyn Fn(
/*c*/ i32,
/* after_c */
&dyn Fn(
// calculate final result
Result<&dyn Fn(/*d*/ i32) -> Result<i32>>,
) -> Result<i32>,
) -> Result<i32>,
>,
) -> Result<i32>,
) -> Result<i32>,
>,
) -> Result<i32>,
) -> Result<i32> {
// Stage awaiting `b`: lent to `after_a` by reference.
after_a(Ok(&move |b, after_b| /*-> Result<i32> */ {
// Stage awaiting `c`: lent to `after_b` by reference.
after_b(Ok(&move |c, after_c| /*-> Result<i32> */ {
// Final stage awaiting `d`: lent to `after_c` by reference.
after_c(Ok(&move |d| /*-> Result<i32> */ {
Ok(a + b + c + d)
}))
}))
}))
}
/// Registers the `function-calls` benchmark group.
///
/// The three cases zip the same four generated input vectors, evaluate one
/// summation variant per row and accumulate the per-row results, so they differ
/// only in the function being called.
fn criterion_benchmark(c: &mut Criterion) {
    let a_vals = generate_array();
    let b_vals = generate_array();
    let c_vals = generate_array();
    let d_vals = generate_array();

    let mut group = c.benchmark_group("function-calls");
    group.measurement_time(Duration::from_secs(10));
    // quick
    // group.warm_up_time(Duration::from_secs(1));
    // group.measurement_time(Duration::from_secs(2));

    group.bench_function("simple_sum", |bencher| {
        bencher.iter(|| {
            let total: i32 = izip!(&a_vals, &b_vals, &c_vals, &d_vals)
                .map(|(&a, &b, &c, &d)| simple_sum(a, b, c, d).unwrap())
                .sum();
            // Pass the accumulated value through `black_box` so it counts as used.
            criterion::black_box(total);
        })
    });

    group.bench_function("curried_sum", |bencher| {
        bencher.iter(|| {
            let total: i32 = izip!(&a_vals, &b_vals, &c_vals, &d_vals)
                .map(|(&a, &b, &c, &d)| curried_sum(a, b, c, d).unwrap())
                .sum();
            criterion::black_box(total);
        })
    });

    group.bench_function("continuation_sum", |bencher| {
        bencher.iter(|| {
            let total: i32 = izip!(&a_vals, &b_vals, &c_vals, &d_vals)
                .map(|(&a, &b, &c, &d)| continuation_sum(a, b, c, d).unwrap())
                .sum();
            criterion::black_box(total);
        })
    });
}
/// Builds one benchmark input column: 10 000 random values, each taken from the
/// half-open range `0..100`.
fn generate_array() -> Vec<i32> {
    let mut rng = rand::rng();
    std::iter::repeat_with(|| rng.random_range(0..100))
        .take(10_000)
        .collect()
}
// Benchmark harness wiring: declare the `benches` group containing
// `criterion_benchmark`, then pass that group to `criterion_main!`.
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

Using

benchmark results on (Apple M2)

rustc release build version rustc 1.83.0 (90b35a623 2024-11-26)

All values in µs/op. The lower the better.

variant simple_sum curried_sum continuation_sum notes
no attributes ~1.3 ~9.5 ~1.3
no attributes, with nightly compiler patched with 30_rustc_patch.diff ~1.3 ~9 ~1.3
#[inline(never)] ~9 ~9.5 ~9 the curried variant usually performs slightly worse, the difference is small though
#[inline(always)] 1.3 330 32 simple variant looks like it's always inlined. forced inlining of curried variant gives dramatic slowdown, similar for continuation variant

benchmark results on AWS graviton (c8g.medium, 1 vCPU 2 GB mem)

With a relatively modern rustc version (1.8x); the exact version was not recorded.

$ cargo bench -- function-calls/
[...]

     Running benches/box_dyn_fn.rs (target/release/deps/box_dyn_fn-f6199173228c07bd)
Gnuplot not found, using plotters backend
function-calls/just_do_it
                        time:   [2.8012 µs 2.8016 µs 2.8022 µs]
Found 10 outliers among 100 measurements (10.00%)
  1 (1.00%) low severe
  2 (2.00%) low mild
  3 (3.00%) high mild
  4 (4.00%) high severe
function-calls/do_convoluted
                        time:   [11.176 µs 11.225 µs 11.311 µs]
Found 9 outliers among 100 measurements (9.00%)
  1 (1.00%) high mild
  8 (8.00%) high severe
#[macro_use]
extern crate criterion;
use crate::criterion::Criterion;
use std::time::Duration;
use itertools::izip;
use rand::Rng;
// #[no_mangle]
// #[inline(always)]
// #[inline(never)]
/// Baseline variant: plain two-argument addition with no closures involved.
fn simple_sum(a: i32, b: i32) -> i32 {
    a + b
}
// #[no_mangle]
// #[inline(always)]
// #[inline(never)]
/// Curried variant: obtains a boxed "add `a`" closure from `fn_curry` and applies
/// it to `b`.
fn curried_sum(a: i32, b: i32) -> i32 {
    let add_to_a = fn_curry(a);
    add_to_a(b)
}
/// Partially applies addition: returns a boxed closure that adds `a` to its
/// argument.
fn fn_curry(a: i32) -> Box<dyn Fn(i32) -> i32> {
    Box::new(move |b: i32| -> i32 { a + b })
}
// #[no_mangle]
// #[inline(always)]
// #[inline(never)]
/// Continuation-passing variant: `fn_continuation` lends an "add `a`" closure to
/// the callback, which applies it to `b`.
fn continuation_sum(a: i32, b: i32) -> i32 {
    fn_continuation(a, &|add_to_a| add_to_a(b))
}
/// Continuation-passing counterpart of `fn_curry`.
///
/// Rather than returning a closure, it builds the "add `a`" closure locally and
/// passes a reference to it into `after_a`, returning whatever `after_a` returns.
fn fn_continuation(a: i32, after_a: &dyn Fn(&dyn Fn(/*b*/ i32) -> i32) -> i32) -> i32 {
    // The adder is a local value that is only lent to the callback.
    let add_to_a = move |b: i32| a + b;
    after_a(&add_to_a)
}
/// Registers the `simpler-function-calls` benchmark group.
///
/// The three cases zip the same two generated input vectors, evaluate one
/// two-argument summation variant per row and accumulate the per-row results, so
/// they differ only in the function being called.
fn criterion_benchmark(c: &mut Criterion) {
    let a_vals = generate_array();
    let b_vals = generate_array();

    let mut group = c.benchmark_group("simpler-function-calls");
    group.measurement_time(Duration::from_secs(10));
    // quick
    // group.warm_up_time(Duration::from_secs(1));
    // group.measurement_time(Duration::from_secs(2));

    group.bench_function("simple_sum", |bencher| {
        bencher.iter(|| {
            let total: i32 = izip!(&a_vals, &b_vals)
                .map(|(&a, &b)| simple_sum(a, b))
                .sum();
            // Pass the accumulated value through `black_box` so it counts as used.
            criterion::black_box(total);
        })
    });

    group.bench_function("curried_sum", |bencher| {
        bencher.iter(|| {
            let total: i32 = izip!(&a_vals, &b_vals)
                .map(|(&a, &b)| curried_sum(a, b))
                .sum();
            criterion::black_box(total);
        })
    });

    group.bench_function("continuation_sum", |bencher| {
        bencher.iter(|| {
            let total: i32 = izip!(&a_vals, &b_vals)
                .map(|(&a, &b)| continuation_sum(a, b))
                .sum();
            criterion::black_box(total);
        })
    });
}
/// Builds one benchmark input column: 10 000 random values, each taken from the
/// half-open range `0..100`.
fn generate_array() -> Vec<i32> {
    let mut rng = rand::rng();
    std::iter::repeat_with(|| rng.random_range(0..100))
        .take(10_000)
        .collect()
}
// Benchmark harness wiring: declare the `benches` group containing
// `criterion_benchmark`, then pass that group to `criterion_main!`.
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

Benchmark results for 22_simpler_bench_code.rs run on Apple M2 laptop with rustc release build version rustc 1.83.0 (90b35a623 2024-11-26).

     Running benches/box_dyn_fn_simpler.rs (target/release/deps/box_dyn_fn_simpler-66e5d90928d49c7a)
simpler-function-calls/simple_sum
                        time:   [663.66 ns 664.26 ns 664.88 ns]
Found 4 outliers among 100 measurements (4.00%)
  1 (1.00%) low severe
  2 (2.00%) low mild
  1 (1.00%) high severe
simpler-function-calls/curried_sum
                        time:   [5.3519 µs 5.3554 µs 5.3587 µs]
Found 3 outliers among 100 measurements (3.00%)
  1 (1.00%) low severe
  1 (1.00%) low mild
  1 (1.00%) high mild
simpler-function-calls/continuation_sum
                        time:   [662.81 ns 663.43 ns 664.06 ns]
Found 5 outliers among 100 measurements (5.00%)
  1 (1.00%) low severe
  3 (3.00%) low mild
  1 (1.00%) high severe
diff --git library/alloc/src/alloc.rs library/alloc/src/alloc.rs
index e686a02f29b..9053d9a52c8 100644
--- library/alloc/src/alloc.rs
+++ library/alloc/src/alloc.rs
@@ -91,7 +91,7 @@ pub unsafe fn alloc(layout: Layout) -> *mut u8 {
unsafe {
// Make sure we don't accidentally allow omitting the allocator shim in
// stable code until it is actually stabilized.
- core::ptr::read_volatile(&__rust_no_alloc_shim_is_unstable);
+ //core::ptr::read_volatile(&__rust_no_alloc_shim_is_unstable);
__rust_alloc(layout.size(), layout.align())
}
@@ -174,7 +174,7 @@ pub unsafe fn alloc_zeroed(layout: Layout) -> *mut u8 {
unsafe {
// Make sure we don't accidentally allow omitting the allocator shim in
// stable code until it is actually stabilized.
- core::ptr::read_volatile(&__rust_no_alloc_shim_is_unstable);
+ //core::ptr::read_volatile(&__rust_no_alloc_shim_is_unstable);
__rust_alloc_zeroed(layout.size(), layout.align())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment