Last active
August 27, 2024 17:16
-
-
Save cynecx/c9e3c6547b6abf41f6768113eec9d930 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #![no_std] | |
| #![feature(link_llvm_intrinsics)] | |
| extern "C" { | |
| #[link_name = "llvm.memcpy.inline.p0.p0.i64"] | |
| fn llvm_memcpy_inline(dst: *mut (), src: *const (), len: usize, is_volatile: bool); | |
| } | |
| #[inline(always)] | |
| unsafe fn memcpy_inline<const N: usize>(dst: *mut (), src: *const ()) { | |
| llvm_memcpy_inline( | |
| dst, | |
| src, | |
| N, | |
| true | |
| ); | |
| } | |
/// Rounds `num` up to the next multiple of `ALIGN`.
///
/// Uses the add-then-mask trick, which is only meaningful when `ALIGN` is a
/// power of two; that is not checked here. A `num` that is already a multiple
/// of `ALIGN` is returned unchanged.
#[inline(always)]
const fn align_up<const ALIGN: usize>(num: usize) -> usize {
    // All bits below the alignment boundary.
    let low_bits = ALIGN - 1;
    // Step past the boundary unless `num` already sits on one...
    let bumped = num + low_bits;
    // ...then drop the low bits to land exactly on the boundary.
    bumped & !low_bits
}
/// Generates the body of a size-dispatched copy routine from a list of
/// `<strategy> <block size>` steps, emitted in the order they are written.
///
/// `$dst`, `$src` and `$len` must name mutable locals of the enclosing function
/// (destination pointer, source pointer, remaining byte count); several steps
/// update them in place. Steps `return` from the enclosing function, so the
/// macro has to be expanded inside a function that returns `()`.
///
/// Strategies:
///
/// * `eq N` -- if exactly `N` bytes remain, copy them and return.
/// * `branchless2x N` -- if `N <= len < 2 * N`, copy the first `N` bytes and the
///   last `N` bytes (the two copies may cover some of the same bytes) and return.
/// * `align N` -- if at least `N` bytes remain, copy `N` bytes, then skip ahead
///   to the next `N`-aligned destination address.
/// * `repeat N` -- copy `N` bytes at a time while at least `N` remain, finish
///   with one `N`-byte copy that ends at the last byte, and return. That final
///   copy starts `N` bytes before the end, so this step must only be reached
///   when the region being copied is at least `N` bytes long.
///
/// Every copy goes through the `unsafe` function `memcpy_inline`; pointer
/// validity is the responsibility of the code that expands this macro.
macro_rules! blocks {
    // Entry point: expand each `<strategy> <size>` pair in order.
    ($dst:ident, $src:ident, $len:ident, $($op:ident $num:literal),+) => {
        $(
            blocks!(@copy $op, $dst, $src, $len, $num);
        )+
    };

    (@copy repeat, $dst:ident, $src:ident, $len:ident, $num:literal) => {
        while $len >= $num {
            // SAFETY: at least `$num` bytes remain at `$dst`/`$src`, provided the
            // expanding code received pointers valid for the whole length.
            unsafe {
                memcpy_inline::<$num>($dst, $src);
                $len = $len.wrapping_sub($num);
                $src = $src.wrapping_byte_add($num);
                $dst = $dst.wrapping_byte_add($num);

                // Just a way to avoid auto-vectorization: the template is only an
                // assembler comment, but it takes the remaining length as an
                // in/out register operand.
                core::arch::asm!(
                    "/* {0} */",
                    inout(reg) $len => $len,
                    options(nomem, nostack, preserves_flags)
                );
            }
        }

        // Fewer than `$num` bytes are left: step back so that one final
        // `$num`-byte copy ends exactly at the last byte, re-copying a few
        // bytes that were already written.
        $src = $src.wrapping_byte_add($len).wrapping_byte_sub($num);
        $dst = $dst.wrapping_byte_add($len).wrapping_byte_sub($num);
        // SAFETY: in bounds as long as the copied region is at least `$num`
        // bytes long (see the macro documentation).
        unsafe {
            memcpy_inline::<$num>($dst, $src);
        }
        return;
    };

    (@copy branchless2x, $dst:ident, $src:ident, $len:ident, $num:literal) => {
        if $num <= $len && $len < $num * 2 {
            // SAFETY: `$num <= $len`, so both the leading and the trailing
            // `$num`-byte window lie inside the `$len`-byte region.
            unsafe {
                // Leading `$num` bytes.
                memcpy_inline::<$num>($dst, $src);
                // Trailing `$num` bytes.
                memcpy_inline::<$num>(
                    $dst.wrapping_byte_add($len).wrapping_byte_sub($num),
                    $src.wrapping_byte_add($len).wrapping_byte_sub($num),
                );
            }
            return;
        }
    };

    (@copy eq, $dst:ident, $src:ident, $len:ident, $num:literal) => {
        if $num == $len {
            // SAFETY: exactly `$num` bytes remain.
            unsafe {
                memcpy_inline::<$num>($dst, $src);
            }
            return;
        }
    };

    (@copy align, $dst:ident, $src:ident, $len:ident, $num:literal) => {
        if $len >= $num {
            // SAFETY: at least `$num` bytes remain.
            unsafe {
                memcpy_inline::<$num>($dst, $src);
            }
            // Distance to the next `$num`-aligned destination address
            // (0 when `$dst` is already aligned). It is always below `$num`, so
            // the bytes skipped here were covered by the copy above.
            let offset = align_up::<$num>($dst as usize) - $dst as usize;
            $dst = $dst.wrapping_byte_add(offset);
            $src = $src.wrapping_byte_add(offset);
            $len = $len.wrapping_sub(offset);
        }
    };
}
| pub unsafe extern "C" fn memcpy_skylake(mut dst: *mut (), mut src: *const (), mut len: usize) { | |
| if len == 0 { | |
| return; | |
| } | |
| // This implementation isn't quite ideal with large copies (eg. > L3-Size) because it might pollute the cpu's cache. | |
| // It is quite common for optimized memcpy implementations to switch over to non-temporal stores when a threshold has been | |
| // reached. | |
| blocks!( | |
| dst, src, len, | |
| eq 1, eq 2, eq 3, eq 4, | |
| branchless2x 4, | |
| branchless2x 8, | |
| branchless2x 16, | |
| branchless2x 32, | |
| branchless2x 64, | |
| branchless2x 128, | |
| align 32, repeat 32 | |
| ); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment