RISC-V misc
//! <https://rust.godbolt.org/z/z78Gh8j47>
/// # Safety
///
/// `ptr` must point to an allocation that contains at least `size_of::<u64>()` bytes
/// at `ptr`'s address.
#[inline(never)]
pub unsafe fn core_read_unaligned(ptr: *const u8) -> u64 {
    // With `target-feature=+unaligned-scalar-mem` this does a single `ld` as desired.
    // Without that feature it falls back to loading each byte one-by-one and shifting them into place,
    // so the custom impl below ends up being better.
    //
    // The standard feature `target-feature=+zicclsm` (hardware supports misaligned loads and stores)
    // does not make a difference either way.
    unsafe { ptr.cast::<u64>().read_unaligned() }
}
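
// Illustrative only (not part of the original gist): one way to enable the
// `unaligned-scalar-mem` LLVM target feature when building for a riscv64 target is
// through RUSTFLAGS, e.g.:
//
//     RUSTFLAGS='-C target-feature=+unaligned-scalar-mem' \
//         cargo build --target riscv64gc-unknown-linux-gnu
//
// The same `-C target-feature=+unaligned-scalar-mem` flag can be passed in the
// compiler options on godbolt to reproduce the single-`ld` codegen mentioned above.
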
/// # Safety
///
/// `ptr` must point to an allocation that contains at least `size_of::<u64>()` bytes
/// at `ptr`'s address.
#[inline(never)]
pub unsafe fn custom_read_unaligned(ptr: *const u8) -> u64 {
    let (first_ptr, second_ptr, shamt) = unsafe { custom_read_unaligned_inner(ptr) };
    unsafe {
        // Both pointers are guaranteed to point to valid u64-sized memory within
        // our address space (assuming the caller satisfied our safety criteria).
        // We can load them, then shift the two u64s to keep just the parts we care about,
        // then OR them to get the result.
        //
        // However, dereferencing the pointers in Rust code might read partially outside
        // the allocation that `ptr` was from, which is UB. The official alternative is
        // to use inline assembly to dereference the pointers instead.
        // This alternative is also used by compiler_builtins' x86_64 SWAR impl of `strlen`,
        // for the same reason.
        //
        // This code ends up compiling to a functionally equivalent version of
        // <https://old.reddit.com/r/RISCV/comments/1ezbyr4/performance_of_misaligned_loads/ljkbx95/>.
        // As mentioned there, it is possible to do only one load by checking
        // if the pointer is already aligned, but doing two loads is also cheap and
        // avoids branching. (A sketch of that branching alternative follows this function.)
        let first: u64;
        core::arch::asm!(
            "ld {value}, ({ptr})",
            ptr = in(reg) first_ptr,
            value = lateout(reg) first,
        );
        let second: u64;
        core::arch::asm!(
            "ld {value}, ({ptr})",
            ptr = in(reg) second_ptr,
            value = lateout(reg) second,
        );
        if cfg!(target_endian = "little") {
            // E.g. shamt == 2 * 8
            // __abcdef:gh______
            // -> (fedcba__, ______hg)
            // -> hgfedcba
            (second << ((64 - shamt) % 64)) | (first >> shamt)
        }
        else {
            // E.g. shamt == 2 * 8
            // __abcdef:gh______
            // -> (__abcdef, gh______)
            // -> abcdefgh
            (first << shamt) | (second >> ((64 - shamt) % 64))
        }
    }
}
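
// A minimal sketch (not part of the original gist) of the branching alternative
// mentioned in the comment above: when `ptr` happens to be aligned a single `ld`
// suffices, otherwise fall back to the two-load path. The function name is made up
// for illustration; it has the same safety contract as `custom_read_unaligned`.
///
/// # Safety
///
/// `ptr` must point to an allocation that contains at least `size_of::<u64>()` bytes
/// at `ptr`'s address.
#[inline(never)]
pub unsafe fn custom_read_unaligned_branching(ptr: *const u8) -> u64 {
    if ptr.addr() % core::mem::size_of::<u64>() == 0 {
        // Aligned: one `ld` is enough.
        let value: u64;
        unsafe {
            core::arch::asm!(
                "ld {value}, ({ptr})",
                ptr = in(reg) ptr,
                value = lateout(reg) value,
            );
        }
        value
    } else {
        // Misaligned: reuse the branch-free two-load implementation above.
        unsafe { custom_read_unaligned(ptr) }
    }
}
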
#[inline(always)]
unsafe fn custom_read_unaligned_inner(ptr: *const u8) -> (*const u64, *const u64, usize) {
    unsafe {
        // Number of bytes `ptr` is past the previous 8-byte boundary.
        let shamt = ptr.addr() % core::mem::size_of::<u64>();
        // The aligned word containing the first bytes of the read.
        let first_ptr = ptr.byte_sub(shamt);
        // The aligned word containing the remaining bytes
        // (equal to `first_ptr` when `ptr` is already aligned).
        let second_ptr = ptr.map_addr(|addr| addr.next_multiple_of(core::mem::size_of::<u64>()));
        // The shift amount is returned in bits.
        (first_ptr.cast::<u64>(), second_ptr.cast::<u64>(), shamt * 8)
    }
}
#[cfg(test)]
mod tests {
    use super::custom_read_unaligned_inner;

    #[test]
    fn test_custom_read_unaligned_inner() {
        let ptr: *const u8 = 8_usize as _;
        let result = unsafe { custom_read_unaligned_inner(ptr) };
        assert_eq!(result.0.addr(), 8);
        assert_eq!(result.1.addr(), 8);
        assert_eq!(result.2, 0 * 8);

        let ptr: *const u8 = 11_usize as _;
        let result = unsafe { custom_read_unaligned_inner(ptr) };
        assert_eq!(result.0.addr(), 8);
        assert_eq!(result.1.addr(), 16);
        assert_eq!(result.2, 3 * 8);
    }
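
    // An additional sanity check (a sketch, not part of the original gist): compare
    // both read implementations against `u64::from_ne_bytes` for every offset within
    // an 8-byte window. Like the rest of this file it only builds for riscv64 targets,
    // because of the RV64 inline assembly used by `custom_read_unaligned`.
    #[test]
    fn test_read_unaligned_matches_from_ne_bytes() {
        let buf: [u8; 16] = core::array::from_fn(|i| i as u8);
        for offset in 0..=8 {
            let mut bytes = [0_u8; 8];
            bytes.copy_from_slice(&buf[offset..offset + 8]);
            let expected = u64::from_ne_bytes(bytes);

            let ptr = buf[offset..].as_ptr();
            assert_eq!(unsafe { super::core_read_unaligned(ptr) }, expected);
            assert_eq!(unsafe { super::custom_read_unaligned(ptr) }, expected);
        }
    }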
}