@marsgpl
Created August 3, 2025 13:09
lightweight thread-friendly coroutines for arm64 and x86_64
#include <stdlib.h>
#include <inttypes.h>
// log stub
#include <stdio.h>
#include <errno.h>
#include <string.h>
#define log_warning(fmt, ...) do { \
    fprintf(stderr, "Warning: " fmt "\n" __VA_OPT__(,) __VA_ARGS__); \
} while (0)
#define log_error(fmt, ...) do { \
    fprintf(stderr, "Error: " fmt "\n" __VA_OPT__(,) __VA_ARGS__); \
    exit(1); \
} while (0)
#define log_error_errno(fmt, ...) do { \
    fprintf(stderr, "Error: " fmt "; errno: %s (%d)\n" \
        __VA_OPT__(,) __VA_ARGS__, strerror(errno), errno); \
    exit(1); \
} while (0)
// end of log stub
// adjust to your estimated stack usage
// keep it at least one page (typically 4 KiB; 16 KiB on Apple Silicon)
#define CO_STACK_SIZE (1024 * 20)
// the low end of the stack buffer is filled with a canary sequence
// to detect overflow on co_yield
#define CO_RED_ZONE 256 // bytes
#define CO_CANARY 0xDEADBEEFCAFEBABEULL // 8 bytes
typedef enum co_state {
    CO_STATE_DEAD = 0,
    CO_STATE_SUSPENDED = 1,
    CO_STATE_RUNNING = 2,
} co_state;
#if defined(__x86_64__) || defined(__amd64__)
typedef struct co_ctx {
    // callee-saved registers (far fewer than on arm64)
    uint64_t rbp, rbx;
    uint64_t r12, r13;
    uint64_t r14, r15;
    uint64_t rsp;
    uint64_t _padding_1; // pad the struct to 64 bytes
} __attribute__((packed)) co_ctx;
// rbx holds &co->resume_value and r12 holds the coroutine fn; co_start
// reads both (see the asm below); co_start's address is placed on the new
// stack so that the first co_switch into this context "returns" into it
#define CO_SETUP_CONTEXT(co, fn) { \
    co->ctx.rbx = (uint64_t)(uintptr_t)&co->resume_value; \
    co->ctx.r12 = (uint64_t)(uintptr_t)fn; \
    co->ctx.rsp = \
        (uint64_t)((uintptr_t)(co->stack + CO_STACK_SIZE) & ~0xF); \
    co->ctx.rbp = 0; \
    co->ctx.r13 = 0; \
    co->ctx.r14 = 0; \
    co->ctx.r15 = 0; \
    *((uint64_t*)(co->ctx.rsp - 8)) = (uint64_t)(uintptr_t)co_start; \
    co->ctx.rsp -= 8; \
}
#elif defined(__aarch64__) || defined(__arm64__)
typedef struct co_ctx {
    // callee-saved general-purpose registers
    uint64_t x19, x20, x21, x22;
    uint64_t x23, x24, x25, x26;
    uint64_t x27, x28, x29, x30; // 29=fp 30=lr
    uint64_t sp; // stack pointer
    uint64_t _padding_1;
    // callee-saved floating-point registers
    double d8, d9, d10, d11;
    double d12, d13, d14, d15;
    uint64_t fpcr; // floating-point control register
    uint32_t fpsr; // floating-point status register
    uint32_t _padding_2;
} __attribute__((packed)) co_ctx;
// x19 holds &co->resume_value and x20 holds the coroutine fn; co_start
// reads both (see the asm below); x30 (lr) is preloaded with co_start so
// that the first co_switch into this context "returns" into it
#define CO_SETUP_CONTEXT(co, fn) { \
    co->ctx.x19 = (uint64_t)(uintptr_t)&co->resume_value; \
    co->ctx.x20 = (uint64_t)(uintptr_t)fn; \
    co->ctx.x30 = (uint64_t)(uintptr_t)co_start; \
    co->ctx.sp = \
        (uint64_t)((uintptr_t)(co->stack + CO_STACK_SIZE) & ~0xF); \
}
#else
#error "Unsupported architecture. Only ARM64 and x86_64 are supported."
#endif
typedef struct co {
    char stack[CO_STACK_SIZE];
    uint32_t id;
    co_state state;
    co_ctx ctx;
    struct co *resumer; // the co that resumed us (the thread-root co when
                        // resumed from the thread root); NULL until first resumed
    void *resume_value;
    void *yield_value;
} co;
// thread-local pointer to the currently running coroutine; before the first
// co_resume on a thread it must point to a thread-root co (id 0), see below
__thread co *co_current = NULL;
// monotonically increasing id for human-readable logging; 0 is reserved
// for the thread root
__thread uint32_t co_next_id = 1;
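// minimal sketch of the per-thread setup this API expects (illustrative,
// not part of the gist; the name 'root' is hypothetical): a zero-initialized
// struct already has id 0 and a clear resumer, which is all the root needs
//
//     co root = {0};      // thread-root co: id 0
//     co_current = &root; // co_resume/co_yield save the thread context here
//
// after this, co_resume can be called from the thread as usual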
void co_start(void); // defined in asm below
void co_switch(co_ctx *from, co_ctx *to); // defined in asm below
// set up the red zone at the LOW END of the stack (where overflow may occur):
// on both x86_64 and arm64 the stack grows downward toward co->stack[0],
// so the red zone sits at the beginning of the stack array (low addresses)
static void co_setup_red_zone(co *co) {
    for (int i = 0; i < CO_RED_ZONE; i += 8) {
        *(uint64_t*)(co->stack + i) = CO_CANARY;
    }
}
static void co_check_red_zone(co *co) {
    for (int i = 0; i < CO_RED_ZONE; i += 8) {
        if (*(uint64_t*)(co->stack + i) != CO_CANARY) {
            log_error("red zone stack overflow");
        }
    }
}
// arg#1 - function to be run as a coroutine
// arg#2 - pass NULL if you want co_create to allocate struct co for you
co *co_create(void (*fn)(void *), co *co) {
    if (co == NULL) {
        // sizeof(*co): the struct, not the pointer (the parameter shadows the type)
        co = calloc(1, sizeof(*co));
        if (co == NULL) {
            log_error_errno("calloc failed");
        }
    }
    co->id = co_next_id++;
    if (co_next_id == 0) {
        log_warning("co_create: uint32_t id overflow");
        co_next_id++; // 0 is reserved for the thread root
    }
    co->state = CO_STATE_SUSPENDED;
    co->yield_value = NULL;
    co->resume_value = NULL;
    CO_SETUP_CONTEXT(co, fn);
    co_setup_red_zone(co);
    return co;
}
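// usage sketch for both call forms (illustrative; 'worker_fn' and
// 'worker_storage' are hypothetical names, not part of this gist):
//
//     co *a = co_create(worker_fn, NULL);            // allocated by co_create
//     static co worker_storage;
//     co *b = co_create(worker_fn, &worker_storage); // caller-provided storage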
// void *value_from_co = co_resume(co, value_for_co);
// when the coroutine yields, the caller of co_resume gets value_from_co
// when the coroutine is resumed, its fn gets value_for_co back from co_yield
// (see the usage sketch at the end of this gist)
void *co_resume(co *co, void *value) {
    if (co_current == NULL) {
        // the thread-root co must be installed first (see co_current above)
        log_error("co_resume without a thread-root co; co id: %u", co->id);
    }
    if (co->state != CO_STATE_SUSPENDED) {
        log_error("attempt to resume %s coroutine; co id: %u",
            co->state == CO_STATE_DEAD ? "dead" : "running",
            co->id);
    }
    co->state = CO_STATE_RUNNING;
    co->resumer = co_current;
    co->resume_value = value;
    co_current = co;
    co_switch(&co->resumer->ctx, &co->ctx);
    return co->yield_value;
}
void *co_yield(void *value) {
    co *co = co_current;
    if (co->id == 0) {
        log_error("cannot yield root thread");
    }
    if (co->state != CO_STATE_RUNNING) {
        log_error("attempt to yield %s coroutine; co id: %u",
            co->state == CO_STATE_DEAD ? "dead" : "suspended",
            co->id);
    }
    co_check_red_zone(co);
    co->state = CO_STATE_SUSPENDED;
    co->yield_value = value;
    co_current = co->resumer;
    co_switch(&co->ctx, &co->resumer->ctx);
    return co->resume_value;
}
void co_finish(void) {
    co *co = co_current;
    co->state = CO_STATE_DEAD;
    co->yield_value = NULL;
    co_current = co->resumer;
    co_switch(&co->ctx, &co->resumer->ctx);
    // the coroutine is dead at this point;
    // this line is never reached since co_resume checks co->state
}
#if defined(__x86_64__) || defined(__amd64__)
// rdi - first argument: co_ctx *from
// rsi - second argument: co_ctx *to
__asm__(
".text\n"
".globl co_switch\n"
"co_switch:\n"
// save current context to struct
" movq %rbp, 0(%rdi)\n" // save rbp
" movq %rbx, 8(%rdi)\n" // save rbx
" movq %r12, 16(%rdi)\n" // save r12
" movq %r13, 24(%rdi)\n" // save r13
" movq %r14, 32(%rdi)\n" // save r14
" movq %r15, 40(%rdi)\n" // save r15
" movq %rsp, 48(%rdi)\n" // save rsp
// load new context from struct
" movq 0(%rsi), %rbp\n" // load rbp
" movq 8(%rsi), %rbx\n" // load rbx
" movq 16(%rsi), %r12\n" // load r12
" movq 24(%rsi), %r13\n" // load r13
" movq 32(%rsi), %r14\n" // load r14
" movq 40(%rsi), %r15\n" // load r15
" movq 48(%rsi), %rsp\n" // load rsp
" ret\n" // return to address on top of stack
);
__asm__(
".text\n"
".globl co_start\n"
"co_start:\n"
// rbx holds &co->resume_value (see CO_SETUP_CONTEXT);
// dereference it into rdi, the first argument of the coroutine fn
" movq (%rbx), %rdi\n"
// call - indirect function call: calls the coroutine fn whose address is in r12
" call *%r12\n"
// when co fn returns, execution continues here
" call co_finish\n"
);
#elif defined(__aarch64__) || defined(__arm64__)
// x0 - first argument: co_ctx *from
// x1 - second argument: co_ctx *to
__asm__(
".text\n"
".globl co_switch\n"
"co_switch:\n"
// save current context - general purpose registers
" stp x19, x20, [x0, #(0*16)]\n"
" stp x21, x22, [x0, #(1*16)]\n"
" stp x23, x24, [x0, #(2*16)]\n"
" stp x25, x26, [x0, #(3*16)]\n"
" stp x27, x28, [x0, #(4*16)]\n"
" stp x29, x30, [x0, #(5*16)]\n"
" mov x2, sp\n"
" str x2, [x0, #(6*16)]\n" // save sp
// save floating-point registers
" stp d8, d9, [x0, #(7*16)]\n"
" stp d10, d11, [x0, #(8*16)]\n"
" stp d12, d13, [x0, #(9*16)]\n"
" stp d14, d15, [x0, #(10*16)]\n"
// save floating-point control and status registers
" mrs x2, fpcr\n"
" mrs x3, fpsr\n"
" stp x2, x3, [x0, #(11*16)]\n"
// load new context - general purpose registers
" ldp x19, x20, [x1, #(0*16)]\n"
" ldp x21, x22, [x1, #(1*16)]\n"
" ldp x23, x24, [x1, #(2*16)]\n"
" ldp x25, x26, [x1, #(3*16)]\n"
" ldp x27, x28, [x1, #(4*16)]\n"
" ldp x29, x30, [x1, #(5*16)]\n"
" ldr x2, [x1, #(6*16)]\n" // only load sp
" mov sp, x2\n"
// load floating-point registers
" ldp d8, d9, [x1, #(7*16)]\n"
" ldp d10, d11, [x1, #(8*16)]\n"
" ldp d12, d13, [x1, #(9*16)]\n"
" ldp d14, d15, [x1, #(10*16)]\n"
// load floating-point control and status registers
" ldp x2, x3, [x1, #(11*16)]\n"
" msr fpcr, x2\n"
" msr fpsr, x3\n"
" ret\n"
);
__asm__(
".text\n"
".globl co_start\n"
"co_start:\n"
// x19 holds &co->resume_value (see CO_SETUP_CONTEXT);
// dereference it into x0, the first argument of the coroutine fn
" ldr x0, [x19]\n"
// blr - branch with link to register
// indirect function call: jumps to the address stored in x20 and calls it
// saves the return address in x30 (lr)
// x20 contains co fn address (see co_create)
" blr x20\n"
// bl - branch with link
// direct function call: calls co_finish
// when co fn returns (finishes), execution continues here
" bl co_finish\n"
);
#else
#error "Unsupported architecture. Only ARM64 and x86_64 are supported."
#endif
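Usage sketch (illustrative, not part of the gist). Assuming the snippet below is appended to the file above, it installs a thread-root co (id 0), creates one coroutine, and drives it with co_resume / co_yield, passing values in both directions; the names worker and root are hypothetical.
void worker(void *arg) {
    // the first resume_value arrives as 'arg'
    long n = (long)(intptr_t)arg;
    for (long i = 0; i < n; i++) {
        // yield i to the resumer, receive the next resume_value back
        void *got = co_yield((void *)(intptr_t)i);
        printf("worker resumed with %ld\n", (long)(intptr_t)got);
    }
    // returning ends the coroutine: co_start then calls co_finish
}
int main(void) {
    co root = {0};      // thread-root co: id 0
    co_current = &root; // required before the first co_resume
    co *w = co_create(worker, NULL);
    void *v = co_resume(w, (void *)(intptr_t)3); // start it, pass 3 in
    while (w->state == CO_STATE_SUSPENDED) {
        printf("main got %ld\n", (long)(intptr_t)v);
        v = co_resume(w, (void *)(intptr_t)100); // hand a value back in
    }
    free(w); // allocated by co_create because NULL was passed
    return 0;
}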