pervognsen · March 11, 2025 15:38
diff --git a/linear_gc.c b/linear_gc.c
 // Linear-scan mark-compact collector for causal data structures, i.e. the reference graph is acyclic and the objects
 // are ordered by age in the heap (which happens if you use a linear allocator) so they are automatically topologically sorted.
 // This algorithm has very high memory-level parallelism which can be exploited by modern out-of-order processors, and should
 // be memory throughput limited.
 void collect(void) {
    // Initialize marks from roots.
    memset(marks, 0, num_nodes * sizeof(uint32_t));
    int newest = 0, oldest = num_nodes;
    for (int i = 0; i < num_roots; i++) {
        marks[roots[i]] = 1;
        newest = max(roots[i], newest); // No live object can be newer than the newest root.
        oldest = min(roots[i], oldest); // The oldest live object seen so far is the oldest root.
    }

    // Pass 1: Propagate liveness from newest to oldest potentially live objects.
    int sum = 0;
    Stack live;
    for (int p = newest; p >= oldest; p--) {
        // If you anticipate dealing with a lot of dead objects you can make this a bitmap scan. That is, use
        // a mark bitmap, scan for nonzero words (when a word is 0 you effectively skip 64 dead objects in one step)
        // and use bit-scan-reverse instructions to enumerate set bit positions from a nonzero mark word.
        if (marks[p]) {
            push(live, p);
            // Note that we only have to load data from live objects. You can make this pass entirely branchless as
            // pass 2 is but it would generate more memory traffic for dead objects. Depending on the density
            // and clustering of live vs dead objects, the branch mispredicts from 'if (marks[r])' could be worth avoiding.
            // A priori, I would assume that live and dead objects are heavily clustered in which case even a naive branch
            // predictor would do really well (no mispredicts within a homogeneous cluster).    
            Node *n = nodes + p;
            for (int i = 0; i < NUM_KIDS; i++) {
                oldest = min(n->kids[i], oldest); // Push the liveness frontier backward in time.
                marks[n->kids[i]] = 1; // This is the only random access to memory in pass 1, but stores are fire-and-forget.
            }
            marks[p] = sum++; // Overwrite marks with their prefix sum.
        }
    }
    num_nodes = sum--;

    // You don't need a stack (just check the mark) but it helps pass 2 when you have a lot of dead objects.
    // If you use a compact data structure for the mark bitmap with fast rank and select queries you would use select
    // to iterate through just the live objects (the purpose of the stack) and rank to determine the relocated position
    // of a live object (the purpose of the prefix sums in the overwritten marks array). That's ideal for memory compactness
    // since the standard rank bitmap only takes ~1.25 bits per element in the universe.

    // Pass 2: Compact from oldest to newest actually live objects.
    for (int dest = 0; dest < num_nodes; dest++) {
        int src = pop(live); // select(dest)
        // The nodes[src] loads are linear but sparse. With reasonable clustering they should be limited by throughput, not latency.
        // The mark reads are random access but in practice (especially after surviving compaction once) children and parents
        // have very good locality. And the loads for different children can proceed in parallel and indeed the loads across
        // several iterations of the outer dest loop can fully overlap.
        for (int i = 0; i < NUM_KIDS; i++)
            nodes[dest].kids[i] = sum - marks[nodes[src].kids[i]]; // rank(nodes[src].kids[i])
    }
    
    // We also have to update the roots to point to the new positions.
    for (int i = 0; i < num_roots; i++)
        roots[i] = sum - marks[roots[i]];
 }
	// Linear-scan mark-compact collector for causal data structures, i.e. the reference graph is acyclic and the objects
	// are ordered by age in the heap (which happens if you use a linear allocator) so they are automatically topologically sorted.
	// This algorithm has very high memory-level parallelism which can be exploited by modern out-of-order processors, and should
	// be memory throughput limited.
	void collect(void) {
	// Initialize marks from roots.
	memset(marks, 0, num_nodes * sizeof(uint32_t));
	int newest = 0, oldest = num_nodes;
	for (int i = 0; i < num_roots; i++) {
	marks[roots[i]] = 1;
	newest = max(roots[i], newest); // No live object can be newer than the newest root.
	oldest = min(roots[i], oldest); // The oldest live object seen so far is the oldest root.
	}

	// Pass 1: Propagate liveness from newest to oldest potentially live objects.
	int sum = 0;
	Stack live;
	for (int p = newest; p >= oldest; p--) {
	// If you anticipate dealing with a lot of dead objects you can make this a bitmap scan. That is, use
	// a mark bitmap, scan for nonzero words (when a word is 0 you effectively skip 64 dead objects in one step)
	// and use bit-scan-reverse instructions to enumerate set bit positions from a nonzero mark word.
	if (marks[p]) {
	push(live, p);
	// Note that we only have to load data from live objects. You can make this pass entirely branchless as
	// pass 2 is but it would generate more memory traffic for dead objects. Depending on the density
	// and clustering of live vs dead objects, the branch mispredicts from 'if (marks[r])' could be worth avoiding.
	// A priori, I would assume that live and dead objects are heavily clustered in which case even a naive branch
	// predictor would do really well (no mispredicts within a homogeneous cluster).
	Node *n = nodes + p;
	for (int i = 0; i < NUM_KIDS; i++) {
	oldest = min(n->kids[i], oldest); // Push the liveness frontier backward in time.
	marks[n->kids[i]] = 1; // This is the only random access to memory in pass 1, but stores are fire-and-forget.
	}
	marks[p] = sum++; // Overwrite marks with their prefix sum.
	}
	}
	num_nodes = sum--;

	// You don't need a stack (just check the mark) but it helps pass 2 when you have a lot of dead objects.
	// If you use a compact data structure for the mark bitmap with fast rank and select queries you would use select
	// to iterate through just the live objects (the purpose of the stack) and rank to determine the relocated position
	// of a live object (the purpose of the prefix sums in the overwritten marks array). That's ideal for memory compactness
	// since the standard rank bitmap only takes ~1.25 bits per element in the universe.

	// Pass 2: Compact from oldest to newest actually live objects.
	for (int dest = 0; dest < num_nodes; dest++) {
	int src = pop(live); // select(dest)
	// The nodes[src] loads are linear but sparse. With reasonable clustering they should be limited by throughput, not latency.
	// The mark reads are random access but in practice (especially after surviving compaction once) children and parents
	// have very good locality. And the loads for different children can proceed in parallel and indeed the loads across
	// several iterations of the outer dest loop can fully overlap.
	for (int i = 0; i < NUM_KIDS; i++)
	nodes[dest].kids[i] = sum - marks[nodes[src].kids[i]]; // rank(nodes[src].kids[i])
	}

	// We also have to update the roots to point to the new positions.
	for (int i = 0; i < num_roots; i++)
	roots[i] = sum - marks[roots[i]];
	}