Skip to content

Instantly share code, notes, and snippets.

@rrbutani
Last active January 8, 2021 06:23
Show Gist options
  • Save rrbutani/64614f4527a29f28eccd9ae5dcadd000 to your computer and use it in GitHub Desktop.
Save rrbutani/64614f4527a29f28eccd9ae5dcadd000 to your computer and use it in GitHub Desktop.
varargs
// runtime-length-varargs.c
#include <stddef.h>
#include <stdint.h>
#include <stdarg.h>
#include <stdio.h>
// In C:
// Prints `len` on its own line, then the first `len` variadic arguments (each
// read as an `int`) separated by spaces, then a newline.
//
// Returns `len`. The return value is load-bearing: `foo_wrapped` recovers the
// argument count from it after the call, so it must not change.
int foo(size_t len, ...) {
    va_list args;
    va_start(args, len);

    printf("len: %zu\n", len);
    // `size_t` index so the comparison against `len` is not signed/unsigned.
    for (size_t i = 0; i < len; i++) {
        printf("%d ", va_arg(args, int));
    }
    printf("\n");

    // Every `va_start` has to be matched by a `va_end` in the same function.
    va_end(args);
    return (int) len;
}
#ifndef __x86_64__
#error "These hacks are x86_64 specific; sorry!"
#endif
// Note: we're assuming that the target function (`foo`) uses the SYS-V ABI
// calling convention. If it does not the below will probably result in a
// corrupted stack.
__attribute__((noinline))
int abi_crimes(size_t len, const int* buf) {
// The SYS-V ABI (for x86_64) will pass int sized args in these registers
// before using the stack:
// - rdi (used for len)
// - rsi
// - rdx
// - rcx
// - r8d
// - r9d
// I'm sure it's possible to be more clever than this (i.e. don't run the
// instructions that clobber registers that we don't need to use) but this
// will do for now.
long rsi = (len >= 1) ? buf[0] : 0,
rdx = (len >= 2) ? buf[1] : 0,
rcx = (len >= 3) ? buf[2] : 0,
r8 = (len >= 4) ? buf[3] : 0,
r9 = (len >= 5) ? buf[4] : 0;
const size_t NUM_REG_PASSED_ARGS = 5;
// GCC supports VLAs in structs (inside functions)!
//
// Not that we needed more reasons _not_ to do this but Clang hates this:
// » variable length array in structure' extension will never be supported
__attribute((packed))
struct stack_frame {
// Gotta pad to a multiple of 16 bytes; that means len needs to be
// rounded up to the next multiple of 2:
//
// We subtract 5 from len so that we skip things we're going to pass via
// the stack.
//
// 8 bytes per val for the SYS-V ABI so we use `long`:
long arr[((len - NUM_REG_PASSED_ARGS) + 1) & (~1)];
};
// the SYS-V ABI calls for a 16 byte alignment for the stack pointer
__attribute__ ((aligned (16))) struct stack_frame frame;
for (int i = NUM_REG_PASSED_ARGS; i < len; i++) {
frame.arr[i - NUM_REG_PASSED_ARGS] = buf[i];
}
uint64_t backup_sp;
void* stack_frame_ptr = &frame;
int ret;
// This is dangerous for a lot of reasons but one of them is that we're
// assuming that there's nothing after `frame` on our stack that's of any
// value.
//
// We have no real way to make sure the compiler won't (for example) stick
// the local variable `i` after frame on our stack but since we've asked to
// never be inlined this is okay; we can guarantee that we will never access
// anything on our own stack frame after calling the inner function.
asm (
"mov %[len], %%rdi\n\t"
"mov %[rsi], %%rsi\n\t"
"mov %[rdx], %%rdx\n\t"
"mov %[rcx], %%rcx\n\t"
"mov %[r8], %%r8\n\t"
"mov %[r9], %%r9\n\t"
"mov %%rsp, %[backup_sp]\n\t"
"mov %[frame], %%rsp\n\t"
"call foo\n\t"
"mov %[backup_sp], %%rsp\n\t"
// backup_sp is early clobber
: [backup_sp] "=&r" (backup_sp)
, [ret] "=a" (ret)
: [len] "g" (len) // we put this in rdi
, [frame] "g" (stack_frame_ptr)
, [rsi] "g" (rsi)
, [rdx] "g" (rdx)
, [rcx] "g" (rcx)
, [r8] "g" (r8)
, [r9] "g" (r9)
: "rdi"
// From the SYS-V ABI:
, "rsi", "rdx", "rcx", "r8", "r9", "r10", "r11"
);
return ret;
}
// If you just pass in `struct args` gcc produces code that doesn't do the right
// thing for the args that are passed in registers (i.e. the corresponding
// registers just aren't set correctly before `foo` is called; instead the
// entirety of `args` is put on the stack).
//
// Not completely sure but I think the SYS-V ABI says that if you try to pass a
// struct by value, its fields can be put in registers; the entire thing doesn't
// have to be passed at once on the stack (which is what happens here). Maybe it
// has something to do with the array?
//
// Update: seems like it. https://godbolt.org/z/13snMj
//
// In any case, we just go ahead and handle the first 5 args ourselves.
// Number of values handed over in registers; it is the `[N]` immediate that
// `foo_wrapped` subtracts from `len` to get the count of stack-passed values.
const size_t NUM_REG_PASSED_ARGS = 5;
// Trampoline that sits between `less_criminal_but_probably_actually_worse` and
// `foo`. It pops the three 8-byte words on top of the stack into the caller's
// backup buffer, calls `foo`, pushes the three words back in reverse order and
// returns, leaving `foo`'s return value in rax.
//
// Very tightly coupled to `less_criminal_but_probably_actually_worse` and very
// likely to break.
__attribute((naked))
// not specifying void in the arg list is intentional; we can't list the actual
// args since gcc will (annoyingly) go generate a prelude for them if we do even
// though the point of a "naked" function is that it won't
int foo_wrapped() {
// this is a little wasteful; we could patch the return value and use a
// jump here instead of a call but it's fine
//
// `less_criminal_but_probably_actually_worse` pushes 3 registers after the
// actual stack params; we've got to save these somewhere:
asm(
// The pointer to the backup location is _after_ the args.
//
// So we need to get len, multiply it by 8, add it to 0x18 and the stack
// pointer and then the address at _that_ location will be our backup
// location.
// i.e.: `*(%rsp + 0x18 + 8 * len)`
// "mov 0x18(%rsp,%rdi,8), %r10\t\n"
//
// Actually it's: `*(%rsp + 0x18 + 8 * max((len - 5), 0))`
//
// len, the first arg, is stored in rdi
//
// r10 is zeroed so it can serve as the "0" source for the conditional move;
// `sub` sets the sign flag when `len - N` is negative and `cmovs` then
// replaces the negative difference with that zero.
"xor %%r10, %%r10\t\n"
"mov %%rdi, %%rax\t\n"
"sub %[N], %%rax\t\n"
// NOTE(review): the original author was not sure why the `cmov 0, %rax` form
// segfaults. Presumably it is because a bare `0` in AT&T syntax is a memory
// operand at address 0 rather than an immediate (cmov has no immediate form);
// confirm against the ISA reference.
"cmovs %%r10, %%rax\t\n" // Not sure why the `cmov 0, %rax` form segfaults...
// r10 now holds the backup pointer read from the slot after the stack args.
"mov 0x18(%%rsp,%%rax,8), %%r10\t\n"
// Now we can store the registers:
// (three pops, saved to backup[0], backup[1], backup[2] in that order)
"pop %%rax\n\t"
"mov %%rax, 0x00(%%r10)\n\t"
"pop %%rax\n\t"
"mov %%rax, 0x08(%%r10)\n\t"
"pop %%rax\n\t"
"mov %%rax, 0x10(%%r10)\n\t"
// And then, call foo:
"call foo\n\t"
// Finally, we want to push those three things back on the stack and
// return.
//
// Since `foo` could have clobbered r10 and definitely clobbered rax
// (return value) we need to reconstruct our pointer to the backup
// location.
//
// Note that we can't assume that `foo` did not clobber `rdi` (len) so
// we're using the fact that `foo` returns `len` to get it from `rax`.
//
// We could maybe use the frame pointer so that we don't have to rely on
// the function we're passing to returning us the length (but then we
// maybe break with -fomit-frame-pointer).
"xor %%r10, %%r10\n\t"
"mov %%rax, %%rdi\n\t"
"sub %[N], %%rdi\n\t"
"cmovs %%r10, %%rdi\n\t"
// Same lookup as before, but with offset 0 instead of 0x18: the three words
// (0x18 bytes) that used to sit above the args have been popped.
"mov 0(%%rsp,%%rdi,8), %%r10\n\t"
// Push in reverse order (backup[2] first) so the original layout is restored.
"mov 0x10(%%r10), %%rdi\n\t"
"push %%rdi\n\t"
"mov 0x08(%%r10), %%rdi\n\t"
"push %%rdi\n\t"
"mov 0x00(%%r10), %%rdi\n\t"
"push %%rdi\n\t"
// rax is untouched since the call, so `foo`'s return value is passed through.
"ret\n\t"
:
: [N] "N" (NUM_REG_PASSED_ARGS)
);
}
// Calls `foo` (through `foo_wrapped`) with `len` followed by the first `len`
// ints of `buf`, for a `len` that is only known at run time.
//
// The first five values are passed as ordinary arguments (zero when `len` is
// too small to supply them); the remaining ones are packed into a struct with a
// variable-length array member that is passed by value, followed by a pointer
// to a three-word scratch buffer that `foo_wrapped` uses as its backup area.
//
// Returns whatever `foo_wrapped` returns.
int less_criminal_but_probably_actually_worse(size_t len, const int* buf) {
const size_t NUM_REG_PASSED_ARGS = 5;
// `gcc` doesn't subtract the stack pointer by the right amount before
// calling `foo`. Maybe for alignment reasons?
//
// Still seems like a bug though; two garbage params end up being passed to
// `foo` on the stack.
//
// Actually I think it's that the code that gcc emits to copy the stuff to
// the stack doesn't communicate with the register allocator; it doesn't
// seem to know that storing things to the stack after copying over `args`
// (it literally invokes memcpy) will mess up the stack pointer.
struct args {
// One slot per value that does not fit in the five leading arguments; the
// array is empty when `len` is less than or equal to NUM_REG_PASSED_ARGS.
size_t buf[(len >= NUM_REG_PASSED_ARGS) ? (len - NUM_REG_PASSED_ARGS) : 0];
// Must stay the last member: `foo_wrapped` looks for this pointer right after
// the spilled values.
size_t* backup;
};
// Scratch space that `foo_wrapped` fills with the three words it pops; the
// initial values are only recognisable placeholders.
size_t backup[3] = { 0xDEADBEEF, 0xDEADBEEF, 0xDEADBEEF };
struct args args;
args.backup = backup;
// NOTE(review): `i` is an `int` compared against the unsigned `len`.
for (int i = NUM_REG_PASSED_ARGS; i < len; i++)
args.buf[i - NUM_REG_PASSED_ARGS] = buf[i];
// `foo_wrapped` is declared without a parameter list, so these arguments are
// passed without any prototype checking.
return foo_wrapped(
len,
(len >= 1) ? buf[0] : 0,
(len >= 2) ? buf[1] : 0,
(len >= 3) ? buf[2] : 0,
(len >= 4) ? buf[3] : 0,
(len >= 5) ? buf[4] : 0,
args
);
}
// runtime-length-varargs.cc
#include <array>
#include <cstdint>
#include <type_traits>
#include <vector>
extern "C" int abi_crimes(std::size_t len, const int* buf);
extern "C" int less_criminal_but_probably_actually_worse(std::size_t len, const int* buf);
// Adapts a container to the `(length, pointer)` calling shape.
//
// `a` can be anything that offers `size()` and `data()`; `func` is invoked once
// as `func(a.size(), a.data())` and its result is returned as an `int`. The
// element type of `a` is deliberately left unconstrained.
template <typename Func, typename Arr>
inline int foo_forwarder(Func func, const Arr& a) {
    return func(a.size(), a.data());
}
// Sends the contents of `a` to `foo` by way of `abi_crimes` and returns its
// result.
template <typename Arr>
int foo_1(const Arr& a) {
    const auto target = &abi_crimes;
    return foo_forwarder(target, a);
}
// Sends the contents of `a` to `foo` by way of
// `less_criminal_but_probably_actually_worse` and returns its result.
template <typename Arr>
int foo_2(const Arr& a) {
    const auto target = &less_criminal_but_probably_actually_worse;
    return foo_forwarder(target, a);
}
#define LEN 30
#define foo foo_2
// Driver: exercises the selected `foo` implementation with one fixed-size
// array and then with vectors of every length from 0 up to LEN - 1, each
// holding the values 0, 1, 2, ...
int main() {
    const std::array<int, 12> fixed = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
    foo(fixed);

    for (int count = 0; count < LEN; ++count) {
        std::vector<int> seq(count);
        int next = 0;
        for (int& slot : seq) {
            slot = next++;
        }
        foo(seq);
    }
}
// If the length is known at compile-time:
#include <cassert>
#include <cstdarg>
#include <iostream>
#include <utility>
#include <vector>
// The variadic target. Writes "len: <len>" on one line to std::cout, then the
// first `len` variadic arguments (each read as an `int`), every one followed by
// a space, then a newline. Nothing is read when `len` is zero or negative.
inline void foo_inner(int len, ...) {
    va_list args;
    va_start(args, len);

    std::cout << "len: " << len << std::endl;
    for (int i = 0; i < len; i++) {
        const int next = va_arg(args, int);
        std::cout << next << " ";
    }
    std::cout << std::endl;

    // Every `va_start` has to be matched by a `va_end` in the same function.
    va_end(args);
}
// Downside is you need wrapper functions:
// Hand-written wrapper for exactly ten values: forwards the first ten elements
// of `arr` to `foo_inner` as individual arguments.
void foo_ten(int arr[10]) {
    const int* const p = arr;
    foo_inner(10,
              p[0], p[1], p[2], p[3], p[4],
              p[5], p[6], p[7], p[8], p[9]);
}
// But we can kind of generate this with some template foo:
// (mostly lifted from https://stackoverflow.com/a/34994591)
// Expands an index pack into a call: invokes `func(N, arr[I0], arr[I1], ...)`
// for the indices carried by the `std::index_sequence` argument and returns
// the result by value.
template <std::size_t N, typename F, typename T, std::size_t... Is>
inline auto splat_inner(F func, const T* arr, std::index_sequence<Is...>) {
    return func(N, *(arr + Is)...);
}
// for vectors:
// Calls `func(N, v[0], ..., v[N - 1])` and returns its result.
//
// `v` must hold at least N elements; this is only checked by `assert`.
template <std::size_t N, typename F, typename T>
inline auto splat(F func, const std::vector<T>& v) {
    assert(v.size() >= N);

    // `data()` is not available for vector specializations like vector<bool>.
    const T* const elems = v.data();
    using Indices = std::make_index_sequence<N>;
    return splat_inner<N>(func, elems, Indices{});
}
// for arrays:
// Calls `func(N, arr[0], ..., arr[N - 1])` for a built-in array of exactly N
// elements and returns its result.
template <std::size_t N, typename F, typename T>
inline auto splat(F func, const T (&arr)[N]) {
    // Array-to-pointer decay yields the address of the first element.
    const T* const elems = arr;
    using Indices = std::make_index_sequence<N>;
    return splat_inner<N>(func, elems, Indices{});
}
// Wrapper:
template <std::size_t N, typename Arr = int(&)[N]>
void foo(const Arr& arr) {
return splat<N, decltype(foo_inner), int>(foo_inner, arr);
}
// Driver: shows the compile-time-length wrapper with a built-in array and with
// a vector.
int main() {
    // A named array instead of a `(int[4]){...}` compound literal: compound
    // literals are a C feature that C++ compilers only accept as an extension.
    const int four[4] = {1, 2, 3, 4};
    foo<4>(four);

    const std::vector<int> v = {10, 9, 8, 7, 6, 5};
    foo<6>(v);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment