Skip to content

Instantly share code, notes, and snippets.

@cynecx
Last active August 27, 2024 17:16
Show Gist options
  • Select an option

  • Save cynecx/c9e3c6547b6abf41f6768113eec9d930 to your computer and use it in GitHub Desktop.

Select an option

Save cynecx/c9e3c6547b6abf41f6768113eec9d930 to your computer and use it in GitHub Desktop.
#![no_std]
#![feature(link_llvm_intrinsics)]
extern "C" {
/// Raw binding to the LLVM intrinsic named by the `link_name` below
/// (`llvm.memcpy.inline`, pointer/pointer/`i64`-length overload). Linking
/// directly against an `llvm.*` symbol is what the crate-level
/// `#![feature(link_llvm_intrinsics)]` gate is for.
///
/// Parameters, in order: destination pointer, source pointer, byte count,
/// and the intrinsic's volatile flag.
///
/// NOTE(review): the `.i64` suffix means the length operand is a 64-bit
/// integer, so declaring it as `usize` presumably only matches on targets
/// where `usize` is 64 bits wide — confirm before building for anything else.
/// NOTE(review): per the LLVM LangRef the length is expected to be a
/// compile-time constant and the two regions must not partially overlap —
/// confirm against the LLVM version in use.
#[link_name = "llvm.memcpy.inline.p0.p0.i64"]
fn llvm_memcpy_inline(dst: *mut (), src: *const (), len: usize, is_volatile: bool);
}
/// Copies exactly `N` bytes from `src` to `dst` through `llvm_memcpy_inline`.
///
/// The byte count is a const generic, so each instantiation hands the
/// intrinsic a fixed length. The copy is always requested as volatile.
///
/// # Safety
///
/// `src` must be valid for reads of `N` bytes and `dst` must be valid for
/// writes of `N` bytes.
#[inline(always)]
unsafe fn memcpy_inline<const N: usize>(dst: *mut (), src: *const ()) {
    // Every copy issued by this wrapper sets the intrinsic's volatile flag.
    // NOTE(review): presumably this keeps LLVM from merging or dropping the
    // deliberately overlapping copies made by the callers — confirm.
    const IS_VOLATILE: bool = true;

    // SAFETY: the caller guarantees that `dst` and `src` each cover `N` bytes.
    unsafe { llvm_memcpy_inline(dst, src, N, IS_VOLATILE) }
}
/// Rounds `num` up to the next multiple of `ALIGN`; values that are already
/// a multiple are returned unchanged.
///
/// The result is computed with a bit mask, so it is only a true multiple of
/// `ALIGN` when `ALIGN` is a power of two.
#[inline(always)]
const fn align_up<const ALIGN: usize>(num: usize) -> usize {
    // Low bits that must be cleared for the value to be `ALIGN`-aligned.
    let low_bits = ALIGN - 1;
    // Bump past the next boundary (unless already on one), then truncate.
    let bumped = num + low_bits;
    bumped & !low_bits
}
/// Expands an ordered list of `<op> <size>` steps into the body of a
/// size-dispatching copy routine.
///
/// `$dst`, `$src` and `$len` must name *mutable* local variables holding the
/// destination pointer, the source pointer and the remaining byte count;
/// some steps advance them in place. Steps that finish the copy end with a
/// bare `return;`, so the macro has to be expanded inside a function that
/// returns `()`. The caller is responsible for `$src` being readable and
/// `$dst` being writable for `$len` bytes.
///
/// Steps (`N` is an integer literal byte count):
///
/// * `eq N` – if `len == N`, copy exactly `N` bytes and return.
/// * `branchless2x N` – if `N <= len < 2 * N`, copy the first `N` bytes and
///   the last `N` bytes and return. The two copies overlap, which covers
///   every length in that range without branching on the exact length.
/// * `align N` – if `len >= N`, copy the first `N` bytes, then advance
///   `dst`/`src` (and shrink `len`) by the distance from `dst` to the next
///   `N`-aligned address, so later copies write to an aligned destination.
/// * `repeat N` – copy `N`-byte chunks while `len >= N`, then copy the final
///   `N` bytes of the range and return. The tail copy starts `N - len` bytes
///   *before* the current position, so the range handled up to this point
///   must be at least `N` bytes long in total.
macro_rules! blocks {
    // Entry point: expand each `<op> <size>` pair, in order, into its step.
    ($dst:ident, $src:ident, $len:ident, $($op:ident $num:literal),+) => {
        $(
            blocks!(@copy $op, $dst, $src, $len, $num);
        )+
    };

    // Bulk loop plus an overlapping tail copy; always returns.
    (@copy repeat, $dst:ident, $src:ident, $len:ident, $num:literal) => {
        while $len >= $num {
            // SAFETY: at least `$num` bytes remain at `$src`/`$dst` (loop
            // condition), and the empty asm statement touches no memory.
            unsafe {
                memcpy_inline::<$num>($dst, $src);
                $len = $len.wrapping_sub($num);
                $src = $src.wrapping_byte_add($num);
                $dst = $dst.wrapping_byte_add($num);
                // Just a way to avoid auto-vectorization: routing the counter
                // through an (empty) asm statement makes its value opaque to
                // the optimizer.
                core::arch::asm!(
                    "/* {0} */",
                    inout(reg) $len => $len,
                    options(nomem, nostack, preserves_flags)
                );
            }
        }
        // Fewer than `$num` bytes are left. Step back so that one final
        // `$num`-byte copy ends exactly at the end of the range; it re-copies
        // some bytes that the loop already handled.
        $src = $src.wrapping_byte_add($len).wrapping_byte_sub($num);
        $dst = $dst.wrapping_byte_add($len).wrapping_byte_sub($num);
        // SAFETY: the pointers now address the last `$num` bytes of the
        // range, which is in bounds as long as the range is at least `$num`
        // bytes long (see the macro documentation). Wrapped in `unsafe` like
        // every other arm so the expansion does not rely on the enclosing
        // function being an `unsafe fn`.
        unsafe {
            memcpy_inline::<$num>($dst, $src);
        }
        return;
    };

    // Head + tail copy for lengths in `[$num, 2 * $num)`.
    (@copy branchless2x, $dst:ident, $src:ident, $len:ident, $num:literal) => {
        if $num <= $len && $len < $num * 2 {
            // SAFETY: `$len >= $num`, so both the first `$num` bytes and the
            // last `$num` bytes lie inside the range.
            unsafe {
                memcpy_inline::<$num>(
                    $dst,
                    $src,
                );
                memcpy_inline::<$num>(
                    $dst.wrapping_byte_add($len).wrapping_byte_sub($num),
                    $src.wrapping_byte_add($len).wrapping_byte_sub($num),
                );
            }
            return;
        }
    };

    // Exact-size copy for a single length.
    (@copy eq, $dst:ident, $src:ident, $len:ident, $num:literal) => {
        if $num == $len {
            // SAFETY: exactly `$num` bytes are available.
            unsafe {
                memcpy_inline::<$num>(
                    $dst,
                    $src,
                );
            }
            return;
        }
    };

    // Copy one unaligned head block, then skip to the next aligned `dst`.
    (@copy align, $dst:ident, $src:ident, $len:ident, $num:literal) => {
        if $len >= $num {
            // SAFETY: at least `$num` bytes are available.
            unsafe {
                memcpy_inline::<$num>(
                    $dst,
                    $src,
                );
            }
            // `offset` is in `0..$num`, so it never exceeds `$len` and every
            // skipped byte was already written by the head copy above.
            let offset = align_up::<$num>($dst as usize) - $dst as usize;
            $dst = $dst.wrapping_byte_add(offset);
            $src = $src.wrapping_byte_add(offset);
            $len = $len.wrapping_sub(offset);
        }
    };
}
/// Copies `len` bytes from `src` to `dst`, choosing a copy strategy by size.
///
/// The strategies are tried in order through the `blocks!` macro; the first
/// one whose size range contains `len` performs the copy and returns.
///
/// # Safety
///
/// `src` must be valid for reads of `len` bytes and `dst` must be valid for
/// writes of `len` bytes. Several strategies read and write the same bytes
/// more than once, so the two regions are expected not to overlap.
pub unsafe extern "C" fn memcpy_skylake(mut dst: *mut (), mut src: *const (), mut len: usize) {
    // Nothing to do for an empty range.
    if len == 0 {
        return;
    }

    // Known limitation: very large copies (e.g. larger than the L3 cache) are
    // not handled ideally, because they may pollute the CPU's cache. Optimized
    // memcpy implementations commonly switch to non-temporal stores once a
    // size threshold is reached.
    blocks!(
        dst, src, len,
        // 1..=4 bytes: one exact-size copy.
        eq 1,
        eq 2,
        eq 3,
        eq 4,
        // 5..=255 bytes: two overlapping fixed-size copies (head and tail).
        branchless2x 4,
        branchless2x 8,
        branchless2x 16,
        branchless2x 32,
        branchless2x 64,
        branchless2x 128,
        // 256 bytes and up: align `dst` to 32 bytes, then copy 32-byte
        // chunks and finish with an overlapping 32-byte tail.
        align 32,
        repeat 32
    );
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment