Skip to content

Instantly share code, notes, and snippets.

@lundman
Last active November 18, 2022 08:48
Show Gist options
  • Save lundman/f322b5be05d2c016feb5cc0df3ccd6be to your computer and use it in GitHub Desktop.
Save lundman/f322b5be05d2c016feb5cc0df3ccd6be to your computer and use it in GitHub Desktop.
inline assembly to call sysv_abi from ms_abi
/* Update 3 - experimenting with explicit first, all args look correct */
void XXXzfs_blake3_compress_xof_sse2(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
uint64_t counter, uint8_t flags, uint8_t out[64])
{
long ret;
__asm__ (
"mov %[p6], %%r9 \n\t"
"mov %[p5], %%r8 \n\t"
"mov %[p4], %%rcx \n\t"
"mov %[p3], %%rdx \n\t"
"mov %[p2], %%rsi \n\t"
"mov %[p1], %%rdi \n\t"
"call zfs_blake3_compress_xof_sse2 \n\t"
: "=a"(ret)
: [p1]"m"(cv), [p2]"m"(block), [p3]"m"(block_len), [p4]"m"(counter), [p5]"m"(flags), [p6]"m"(out)
: "%xmm6"
)
}
/* Although r8 has 0x1a only in final byte, might need extending. */
/* ****************************************** */
/* Update 2 - r8,r9 was specified as output registers, not input */
void XXXzfs_blake3_compress_xof_sse2(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
uint64_t counter, uint8_t flags, uint8_t out[64])
{
int foo = 10, bar = 15;
// RDI, RSI, RDX, RCX, R8, R9
register long long _r8 __asm__("r8") = flags;
register long long _r9 __asm__("r9") = out;
__asm__ __volatile__(
"call zfs_blake3_compress_xof_sse2 \n\t"
:
: "D"(cv), "S"(block), "d"(block_len), "c"(counter),
"r"(_r8), "r"(_r9)
:
);
}
/*
* we want this:
0xffffe801`3816b520 rdi
0xffffe801`3816b548 rsi
0x40 rdx
0x0 rcx
0x1a r8
0xffffe801`3816b280 r9
* we get this:
rax=ffffe8013816b280 rbx=0000000000000000 rcx=0000000000000000
rdx=ffffe8013816b540 rsi=ffffe8013816b548 rdi=ffffe8013816b520
rip=fffff80730829e80 rsp=ffffe8013816b158 rbp=0000000000000000
r8=000000000000001a r9=ffffe8013816b280 r10=ffffe8013816b21a
r11=ffffe8013816b51a r12=ffff8a89c47ec2d0 r13=ffffffff80002e4c
r14=fffff80731d4daf8 r15=ffffd78393d68e30
Now rdi, rsi, "rcx", r8, r9 are all correct
rdx is not 0x40 though, otherwise close.
*/
/* ******************************** */
void WRAPPER_blake3_compress_xof_sse2(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
uint64_t counter, uint8_t flags, uint8_t out[64])
{
// RDI, RSI, RDX, RCX, R8, R9
register long long _r8 __asm__("r8") = flags;
register long long _r9 __asm__("r9") = out;
__asm__ __volatile__(
"call zfs_blake3_compress_xof_sse2 \n\t"
:"=r"(_r8), "=r"(_r9)
: "D"(cv), "S"(block), "d"(block_len), "c"(counter)
: /* add clobbers */
);
}
/*
As called to the WRAPPER.
0xffffe801`38e36520 cv -> rdi
0xffffe801`38e36548 block -> rsi
0x40 block_len -> rdx
0x0 counter -> rcx
0x1a flags -> r8
0xffffe801`38e36280 out -> r9
Registers as we enter the sysv function
rax=ffffe80138e36280 rbx=0000000000000000 rcx=0000000000000000
rdx=ffffe80138e36540 rsi=ffffe80138e36548 rdi=ffffe80138e36520
rip=fffff80730359e80 rsp=ffffe80138e36158 rbp=0000000000000000
r8=0000000000000040 r9=0000000000000000 r10=ffffe80138e3621a
r11=ffffe80138e3651a r12=ffff8a89d1119a70 r13=ffffffff80003ed0
r14=fffff8073187daf8 r15=ffffd78395a7c2b0
Seems rdi, rsi as set correctly!
r8, r9 are P3 and P4 for some reason? when we wanted P5 and P6 there.
No clobbers yet, wanted to see which I need to add when it works (or is that a bad idea)
This one is void, will have to handle that rax is already the return value, for other functions.
Should it be volatile?
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment