Last active
November 18, 2022 08:48
-
-
Save lundman/f322b5be05d2c016feb5cc0df3ccd6be to your computer and use it in GitHub Desktop.
inline assembly to call sysv_abi from ms_abi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Update 3 - experimenting with explicit first, all args look correct */ | |
void XXXzfs_blake3_compress_xof_sse2(const uint32_t cv[8], | |
const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, | |
uint64_t counter, uint8_t flags, uint8_t out[64]) | |
{ | |
long ret; | |
__asm__ ( | |
"mov %[p6], %%r9 \n\t" | |
"mov %[p5], %%r8 \n\t" | |
"mov %[p4], %%rcx \n\t" | |
"mov %[p3], %%rdx \n\t" | |
"mov %[p2], %%rsi \n\t" | |
"mov %[p1], %%rdi \n\t" | |
"call zfs_blake3_compress_xof_sse2 \n\t" | |
: "=a"(ret) | |
: [p1]"m"(cv), [p2]"m"(block), [p3]"m"(block_len), [p4]"m"(counter), [p5]"m"(flags), [p6]"m"(out) | |
: "%xmm6" | |
) | |
} | |
/* r8 holds 0x1a only in its low byte; the 8-bit arguments (block_len, flags) may need zero-extending to 64 bits. */
/* ****************************************** */ | |
/* Update 2 - r8,r9 was specified as output registers, not input */ | |
void XXXzfs_blake3_compress_xof_sse2(const uint32_t cv[8], | |
const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, | |
uint64_t counter, uint8_t flags, uint8_t out[64]) | |
{ | |
int foo = 10, bar = 15; | |
// RDI, RSI, RDX, RCX, R8, R9 | |
register long long _r8 __asm__("r8") = flags; | |
register long long _r9 __asm__("r9") = out; | |
__asm__ __volatile__( | |
"call zfs_blake3_compress_xof_sse2 \n\t" | |
: | |
: "D"(cv), "S"(block), "d"(block_len), "c"(counter), | |
"r"(_r8), "r"(_r9) | |
: | |
); | |
} | |
/* | |
* we want this: | |
0xffffe801`3816b520 rdi | |
0xffffe801`3816b548 rsi | |
0x40 rdx | |
0x0 rcx | |
0x1a r8 | |
0xffffe801`3816b280 r9 | |
* we get this: | |
rax=ffffe8013816b280 rbx=0000000000000000 rcx=0000000000000000 | |
rdx=ffffe8013816b540 rsi=ffffe8013816b548 rdi=ffffe8013816b520 | |
rip=fffff80730829e80 rsp=ffffe8013816b158 rbp=0000000000000000 | |
r8=000000000000001a r9=ffffe8013816b280 r10=ffffe8013816b21a | |
r11=ffffe8013816b51a r12=ffff8a89c47ec2d0 r13=ffffffff80002e4c | |
r14=fffff80731d4daf8 r15=ffffd78393d68e30 | |
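Now rdi, rsi, rcx, r8 and r9 are all correct.
rdx is not 0x40, but its low byte (dl) is: block_len is a uint8_t, so presumably only dl was loaded and the upper bits of rdx kept stale data. Otherwise close.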
*/ | |
/* ******************************** */ | |
void WRAPPER_blake3_compress_xof_sse2(const uint32_t cv[8], | |
const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, | |
uint64_t counter, uint8_t flags, uint8_t out[64]) | |
{ | |
// RDI, RSI, RDX, RCX, R8, R9 | |
register long long _r8 __asm__("r8") = flags; | |
register long long _r9 __asm__("r9") = out; | |
__asm__ __volatile__( | |
"call zfs_blake3_compress_xof_sse2 \n\t" | |
:"=r"(_r8), "=r"(_r9) | |
: "D"(cv), "S"(block), "d"(block_len), "c"(counter) | |
: /* add clobbers */ | |
); | |
} | |
/* | |
As called to the WRAPPER. | |
0xffffe801`38e36520 cv -> rdi | |
0xffffe801`38e36548 block -> rsi | |
0x40 block_len -> rdx | |
0x0 counter -> rcx | |
0x1a flags -> r8 | |
0xffffe801`38e36280 out -> r9 | |
Registers as we enter the sysv function | |
rax=ffffe80138e36280 rbx=0000000000000000 rcx=0000000000000000 | |
rdx=ffffe80138e36540 rsi=ffffe80138e36548 rdi=ffffe80138e36520 | |
rip=fffff80730359e80 rsp=ffffe80138e36158 rbp=0000000000000000 | |
r8=0000000000000040 r9=0000000000000000 r10=ffffe80138e3621a | |
r11=ffffe80138e3651a r12=ffff8a89d1119a70 r13=ffffffff80003ed0 | |
r14=fffff8073187daf8 r15=ffffd78395a7c2b0 | |
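rdi and rsi are set correctly.
r8 and r9 hold P3 and P4 when we wanted P5 and P6 there: they still contained the incoming ms_abi arguments 3 and 4 (ms_abi passes those in r8 and r9), presumably because output-only "=r" operands never get their initial values loaded.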
No clobbers yet, wanted to see which I need to add when it works (or is that a bad idea) | |
This one is void, will have to handle that rax is already the return value, for other functions. | |
Should it be volatile? | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment