-
-
Save marsgpl/ae4d20f976cc79617709d98fad10736b to your computer and use it in GitHub Desktop.
lightweight thread-friendly coroutines for arm64 and x86_64
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdlib.h> | |
#include <inttypes.h> | |
// log stub | |
#include <stdio.h> | |
#include <errno.h> | |
#include <string.h> | |
// Print a printf-style message, prefixed with "Warning: " and terminated by a
// newline, to stderr. Execution continues afterwards.
// fmt must be a string literal: it is concatenated with the prefix and "\n".
// The body is wrapped in do { } while (0) so the macro expands to a single
// statement and stays correct inside an unbraced if/else.
#define log_warning(fmt, ...) do { \
    fprintf(stderr, "Warning: " fmt "\n" __VA_OPT__(,) __VA_ARGS__); \
} while (0)
// Print a printf-style message, prefixed with "Error: " and terminated by a
// newline, to stderr, then terminate the process with exit status 1.
// fmt must be a string literal: it is concatenated with the prefix and "\n".
// The body is wrapped in do { } while (0) so the macro expands to a single
// statement and stays correct inside an unbraced if/else.
#define log_error(fmt, ...) do { \
    fprintf(stderr, "Error: " fmt "\n" __VA_OPT__(,) __VA_ARGS__); \
    exit(1); \
} while (0)
// Like an error log line, but also appends the current errno as
// "; errno: <strerror text> (<number>)", then terminates the process with
// exit status 1.
// fmt must be a string literal: it is concatenated with the surrounding text.
// The body is wrapped in do { } while (0) so the macro expands to a single
// statement and stays correct inside an unbraced if/else.
#define log_error_errno(fmt, ...) do { \
    fprintf(stderr, "Error: " fmt "; errno: %s (%d)\n" \
        __VA_OPT__(,) __VA_ARGS__, strerror(errno), errno); \
    exit(1); \
} while (0)
// end of log stub | |
// Size in bytes of the private stack embedded in every coroutine.
// Tune it to your estimated stack usage; keep it at least one page (2-4k).
#define CO_STACK_SIZE (20 * 1024)

// The lowest CO_RED_ZONE bytes of the stack are filled with the CO_CANARY
// pattern so that co_yield can detect a stack overflow.
#define CO_RED_ZONE 256 // bytes
#define CO_CANARY 0xDEADBEEFCAFEBABE // 8 bytes
// Lifecycle state of a coroutine.
typedef enum co_state {
    CO_STATE_DEAD,      // 0: set by co_finish; co_resume refuses to resume it
    CO_STATE_SUSPENDED, // 1: freshly created or parked in co_yield
    CO_STATE_RUNNING,   // 2: set by co_resume before switching into it
} co_state;
#if defined(__x86_64__) || defined(__amd64__) | |
// Saved x86_64 execution context: the callee-saved registers plus the stack
// pointer (far fewer registers than on arm64).
// The co_switch assembly addresses these fields by fixed byte offset, so the
// order and size of the fields must not change.
typedef struct co_ctx {
    uint64_t rbp;        // offset  0
    uint64_t rbx;        // offset  8
    uint64_t r12;        // offset 16
    uint64_t r13;        // offset 24
    uint64_t r14;        // offset 32
    uint64_t r15;        // offset 40
    uint64_t rsp;        // offset 48
    uint64_t _padding_1; // offset 56
} __attribute__((packed)) co_ctx;
// Prepare the saved context of a not-yet-started coroutine (x86_64).
//   co - pointer to the coroutine; evaluated several times, so pass an
//        expression without side effects
//   fn - entry function of the coroutine
// After this, the first co_switch into the context "returns" into co_start:
//   rbx - address of co->resume_value (co_start loads fn's argument from it)
//   r12 - fn (co_start calls it)
//   rsp - 16-byte aligned top of co->stack, minus one 8-byte slot that holds
//         the address of co_start; co_switch's final `ret` pops that slot
// The remaining callee-saved registers start as 0.
// Arguments are parenthesized and the body is a do { } while (0) so the macro
// behaves like a single statement for any argument expression.
#define CO_SETUP_CONTEXT(co, fn) do { \
    (co)->ctx.rbx = (uint64_t)(uintptr_t)&(co)->resume_value; \
    (co)->ctx.r12 = (uint64_t)(uintptr_t)(fn); \
    (co)->ctx.rsp = \
        (uint64_t)((uintptr_t)((co)->stack + CO_STACK_SIZE) & ~(uintptr_t)0xF); \
    (co)->ctx.rbp = 0; \
    (co)->ctx.r13 = 0; \
    (co)->ctx.r14 = 0; \
    (co)->ctx.r15 = 0; \
    *((uint64_t*)((co)->ctx.rsp - 8)) = (uint64_t)(uintptr_t)co_start; \
    (co)->ctx.rsp -= 8; \
} while (0)
#elif defined(__aarch64__) || defined(__arm64__) | |
// Saved arm64 execution context.
// The co_switch assembly addresses these fields by fixed byte offset
// (multiples of 16), so the order and size of the fields must not change.
typedef struct co_ctx {
    // callee-saved general-purpose registers
    uint64_t x19, x20;    // offset   0
    uint64_t x21, x22;    // offset  16
    uint64_t x23, x24;    // offset  32
    uint64_t x25, x26;    // offset  48
    uint64_t x27, x28;    // offset  64
    uint64_t x29, x30;    // offset  80; 29=fp 30=lr
    uint64_t sp;          // offset  96; stack pointer
    uint64_t _padding_1;  // offset 104

    // callee-saved floating point registers
    double d8, d9;        // offset 112
    double d10, d11;      // offset 128
    double d12, d13;      // offset 144
    double d14, d15;      // offset 160

    uint64_t fpcr;        // offset 176; floating-point control register
    uint32_t fpsr;        // offset 184; floating-point status register
    uint32_t _padding_2;  // offset 188
} __attribute__((packed)) co_ctx;
// Prepare the saved context of a not-yet-started coroutine (arm64).
//   co - pointer to the coroutine; evaluated several times, so pass an
//        expression without side effects
//   fn - entry function of the coroutine
// The whole context is zeroed first: co_switch loads every field into a
// register (including FPCR/FPSR and the frame pointer x29), and a struct
// supplied by the caller of co_create is not guaranteed to be zero-filled.
// Then:
//   x19 - address of co->resume_value (co_start loads fn's argument from it)
//   x20 - fn (co_start calls it)
//   x30 - co_start; co_switch's final `ret` branches to the link register
//   sp  - 16-byte aligned top of co->stack
// Arguments are parenthesized and the body is a do { } while (0) so the macro
// behaves like a single statement for any argument expression.
#define CO_SETUP_CONTEXT(co, fn) do { \
    memset(&(co)->ctx, 0, sizeof((co)->ctx)); \
    (co)->ctx.x19 = (uint64_t)(uintptr_t)&(co)->resume_value; \
    (co)->ctx.x20 = (uint64_t)(uintptr_t)(fn); \
    (co)->ctx.x30 = (uint64_t)(uintptr_t)co_start; \
    (co)->ctx.sp = \
        (uint64_t)((uintptr_t)((co)->stack + CO_STACK_SIZE) & ~(uintptr_t)0xF); \
} while (0)
#else | |
#error "Unsupported architecture. Only ARM64 and x86_64 are supported." | |
#endif | |
typedef struct co { | |
char stack[CO_STACK_SIZE]; | |
uint32_t id; | |
co_state state; | |
co_ctx ctx; | |
struct co *resumer; // NULL - resumed from the thread root | |
void *resume_value; | |
void *yield_value; | |
} co; | |
// Coroutine currently executing on this thread (one pointer per thread).
__thread co *co_current = NULL;

// Id that the next co_create call on this thread will hand out; ids exist
// only to make log messages easier to read.
__thread uint32_t co_next_id = 1;

// Both routines below are implemented in assembly.
void co_switch(co_ctx *from, co_ctx *to);
void co_start(void);
// Fill the red zone with the canary pattern.
// The red zone sits at the LOW END of the stack array (co->stack[0] upward):
// on both x86_64 and arm64 the stack grows downward toward co->stack[0], so
// that is where an overflow shows up.
static void co_setup_red_zone(co *co) {
    const uint64_t canary = CO_CANARY;

    // memcpy rather than a uint64_t* cast: the stack is declared as char[],
    // and storing to it through a uint64_t lvalue violates strict aliasing.
    for (size_t off = 0; off < CO_RED_ZONE; off += sizeof canary) {
        memcpy(co->stack + off, &canary, sizeof canary);
    }
}
// Verify that every 8-byte word of the red zone (the lowest CO_RED_ZONE bytes
// of co->stack) still holds the canary. A mismatch is reported through
// log_error, which terminates the process.
static void co_check_red_zone(co *co) {
    for (size_t off = 0; off < CO_RED_ZONE; off += sizeof(uint64_t)) {
        uint64_t word;

        // memcpy rather than a uint64_t* cast: the stack is declared as
        // char[], and reading it through a uint64_t lvalue violates strict
        // aliasing.
        memcpy(&word, co->stack + off, sizeof word);
        if (word != CO_CANARY) {
            log_error("red zone stack overflow");
        }
    }
}
// Create a coroutine in the suspended state; it starts running fn on the
// first co_resume.
// arg#1 - function to be run as coroutine (must not be NULL)
// arg#2 - storage for the coroutine, or NULL if you want co_create to
//         allocate struct co for you (with calloc)
// Returns the initialized coroutine. Allocation failure and a NULL fn are
// reported through log_error*, which terminates the process.
co *co_create(void (*fn)(void *), co *co) {
    if (fn == NULL) {
        log_error("co_create: fn is NULL");
    }

    if (co == NULL) {
        // sizeof *co, not sizeof(co): inside this function `co` names the
        // pointer parameter (it shadows the typedef), so sizeof(co) would be
        // the size of a pointer instead of the size of the struct.
        co = calloc(1, sizeof *co);
        if (co == NULL) {
            log_error_errno("calloc failed");
        }
    }

    co->id = co_next_id++;
    if (co_next_id == 0) {
        log_warning("co_create: uint32_t id overflow");
        co_next_id++; // 0 is reserved
    }

    co->state = CO_STATE_SUSPENDED;
    co->resumer = NULL; // caller-provided storage may not be zeroed
    co->yield_value = NULL;
    co->resume_value = NULL;

    CO_SETUP_CONTEXT(co, fn);
    co_setup_red_zone(co);

    return co;
}
// void *value_from_co = co_resume(co, value_for_co); | |
// when coroutine yields, caller of co_resume will get value_from_co | |
// when coroutine resumes, coroutine fn will get value_for_co from co_yield | |
void *co_resume(co *co, void *value) { | |
if (co->state != CO_STATE_SUSPENDED) { | |
log_error("attempt to resume %s coroutine; co id: %u", | |
co->state == CO_STATE_DEAD ? "dead" : "running", | |
co->id); | |
} | |
co->state = CO_STATE_RUNNING; | |
co->resumer = co_current; | |
co->resume_value = value; | |
co_current = co; | |
co_switch(&co->resumer->ctx, &co->ctx); | |
return co->yield_value; | |
} | |
void *co_yield(void *value) { | |
co *co = co_current; | |
if (co->id == 0) { | |
log_error("cannot yield root thread"); | |
} | |
if (co->state != CO_STATE_RUNNING) { | |
log_error("attempt to yield %s coroutine; co id: %u", | |
co->state == CO_STATE_DEAD ? "dead" : "suspended", | |
co->id); | |
} | |
co_check_red_zone(co); | |
co->state = CO_STATE_SUSPENDED; | |
co->yield_value = value; | |
co_current = co->resumer; | |
co_switch(&co->ctx, &co->resumer->ctx); | |
return co->resume_value; | |
} | |
void co_finish(void) { | |
co *co = co_current; | |
co->state = CO_STATE_DEAD; | |
co->yield_value = NULL; | |
co_current = co->resumer; | |
co_switch(&co->ctx, &co->resumer->ctx); | |
// coroutine is dead at this point | |
// this line should never be reached since co_resume checks co->status | |
} | |
#if defined(__x86_64__) || defined(__amd64__) | |
// void co_switch(co_ctx *from, co_ctx *to)
//   rdi - from: where the running context is stored
//   rsi - to:   the context to continue with
// The byte offsets below are the field offsets of the x86_64 struct co_ctx.
__asm__(
    ".text\n"
    ".globl co_switch\n"
    "co_switch:\n"
    // store the callee-saved registers and the stack pointer into *from
    "\tmovq %rbp, 0(%rdi)\n"
    "\tmovq %rbx, 8(%rdi)\n"
    "\tmovq %r12, 16(%rdi)\n"
    "\tmovq %r13, 24(%rdi)\n"
    "\tmovq %r14, 32(%rdi)\n"
    "\tmovq %r15, 40(%rdi)\n"
    "\tmovq %rsp, 48(%rdi)\n"
    // reload the same set of registers from *to
    "\tmovq 0(%rsi), %rbp\n"
    "\tmovq 8(%rsi), %rbx\n"
    "\tmovq 16(%rsi), %r12\n"
    "\tmovq 24(%rsi), %r13\n"
    "\tmovq 32(%rsi), %r14\n"
    "\tmovq 40(%rsi), %r15\n"
    "\tmovq 48(%rsi), %rsp\n"
    // pop the return address from the stack just switched to and jump there
    "\tret\n"
);
// Entry trampoline of every coroutine (x86_64). The first co_switch into a
// fresh context "returns" here with:
//   rbx - address of the coroutine's resume_value
//   r12 - address of the coroutine function
__asm__(
    ".text\n"
    ".globl co_start\n"
    "co_start:\n"
    // pass value from rbx as first arg to co fn (rdi)
    "    movq (%rbx), %rdi\n"
    // call - indirect function call: jumps to the address stored in r12
    "    call *%r12\n"
    // when co fn returns, execution continues here
    "    call co_finish\n"
    // co_finish is not expected to return; trap instead of running into
    // whatever bytes follow this routine if it ever does
    "    ud2\n"
);
#elif defined(__aarch64__) || defined(__arm64__) | |
// void co_switch(co_ctx *from, co_ctx *to)
//   x0 - from: where the running context is stored
//   x1 - to:   the context to continue with
// Every offset is a multiple of 16 and matches the field layout of the arm64
// struct co_ctx. x2 and x3 are used as scratch registers.
__asm__(
    ".text\n"
    ".globl co_switch\n"
    "co_switch:\n"
    // ---- store the running context into *from ----
    // general-purpose registers x19-x30
    "\tstp x19, x20, [x0, #(0*16)]\n"
    "\tstp x21, x22, [x0, #(1*16)]\n"
    "\tstp x23, x24, [x0, #(2*16)]\n"
    "\tstp x25, x26, [x0, #(3*16)]\n"
    "\tstp x27, x28, [x0, #(4*16)]\n"
    "\tstp x29, x30, [x0, #(5*16)]\n"
    // sp cannot be stored directly, go through x2
    "\tmov x2, sp\n"
    "\tstr x2, [x0, #(6*16)]\n"
    // floating-point registers d8-d15
    "\tstp d8, d9, [x0, #(7*16)]\n"
    "\tstp d10, d11, [x0, #(8*16)]\n"
    "\tstp d12, d13, [x0, #(9*16)]\n"
    "\tstp d14, d15, [x0, #(10*16)]\n"
    // floating-point control and status registers
    "\tmrs x2, fpcr\n"
    "\tmrs x3, fpsr\n"
    "\tstp x2, x3, [x0, #(11*16)]\n"
    // ---- load the next context from *to ----
    // general-purpose registers x19-x30
    "\tldp x19, x20, [x1, #(0*16)]\n"
    "\tldp x21, x22, [x1, #(1*16)]\n"
    "\tldp x23, x24, [x1, #(2*16)]\n"
    "\tldp x25, x26, [x1, #(3*16)]\n"
    "\tldp x27, x28, [x1, #(4*16)]\n"
    "\tldp x29, x30, [x1, #(5*16)]\n"
    // only sp is loaded from this 16-byte slot
    "\tldr x2, [x1, #(6*16)]\n"
    "\tmov sp, x2\n"
    // floating-point registers d8-d15
    "\tldp d8, d9, [x1, #(7*16)]\n"
    "\tldp d10, d11, [x1, #(8*16)]\n"
    "\tldp d12, d13, [x1, #(9*16)]\n"
    "\tldp d14, d15, [x1, #(10*16)]\n"
    // floating-point control and status registers
    "\tldp x2, x3, [x1, #(11*16)]\n"
    "\tmsr fpcr, x2\n"
    "\tmsr fpsr, x3\n"
    // continue at the address in the freshly loaded x30 (lr)
    "\tret\n"
);
// Entry trampoline of every coroutine (arm64). The first co_switch into a
// fresh context branches here through x30 with:
//   x19 - address of the coroutine's resume_value
//   x20 - address of the coroutine function (see co_create)
__asm__(
    ".text\n"
    ".globl co_start\n"
    "co_start:\n"
    // pass value from x19 as first arg to co fn (x0)
    "    ldr x0, [x19]\n"
    // blr - branch with link to register
    // indirect function call: jumps to the address stored in x20 and calls it
    // saves the return address in x30 (lr)
    "    blr x20\n"
    // bl - branch with link
    // direct function call: calls co_finish
    // when co fn returns (finishes), execution continues here
    "    bl co_finish\n"
    // co_finish is not expected to return; trap instead of running into
    // whatever bytes follow this routine if it ever does
    "    brk #0x1\n"
);
#else | |
#error "Unsupported architecture. Only ARM64 and x86_64 are supported." | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment