Last active
February 13, 2018 19:23
-
-
Save DmitrySoshnikov/4736334 to your computer and use it in GitHub Desktop.
Stop and Copy GC technique.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Stop and Copy Garbage Collection technique. | |
* | |
* See also previous lessons on | |
* | |
* - Mark and Sweep GC: https://gist.github.com/DmitrySoshnikov/4391763 | |
* - Reference Counting (ARC) https://gist.github.com/DmitrySoshnikov/4646658 | |
* | |
* by Dmitry Soshnikov <[email protected]> | |
* MIT Style License | |
*/ | |
// --------------------------------------------------------------------------- | |
// 1. Heap structure, allocation, objects reachability. | |
// --------------------------------------------------------------------------- | |
// For simplicity represent the "heap" (the memory where all our objects live) | |
// as simple JS array. Let's take the "size" of the memory as 20 "abstract points" | |
// (e.g. gigabytes). Address at which objects are allocated is the array index. | |
var heap = Array(20); // heap of size 20 | |
// In Stop and Copy GC, the heap is divided into two spaces: the "Old space" | |
// and the "New Space". The Old space is where our *current objects* live. | |
// The New space in contrast is initially reserved for GC needs. | |
// For simplicity assuming the half of the heap array as a divider of | |
// the Old and New spaces: initially first half (0-9 indices) is Old, | |
// the second half (indices 10-19) is New (mark it with two bars): | |
// | |
// [ Old space || New space ] | |
var OLD_SPACE_BOUND = 0; // beginning of the Old (working) space. | |
var NEW_SPACE_BOUND = 10; // beginning of the New space. | |
// The actual objects allocation happens on the *left* of the *Old space*: | |
// We just reserve needed space and move the allocation pointer . | |
// Everything on the *right* of the allocation pointer is the "Free space" | |
// from where we can allocate further. Still the New space is untouched until GC: | |
// | |
// Old space = Allocated objects + Free space. | |
// | |
// [ Allocated | Free || New space ] | |
// At runtime, initially the allocation pointer is set to the beginning | |
// of the Old space (to zero). | |
var ALLOCATION_POINTER = OLD_SPACE_BOUND; | |
// Helper function to allocate objects, put the object on the heap at allocation | |
// pointer and returns it back (for explanatory purpose we also explicitly keep | |
// the address on the object (the heap array index), where the object is allocated). | |
// It also automatically increases the allocation pointer. | |
function alloc(object) { | |
// Put the object on the heap. | |
heap[ALLOCATION_POINTER] = object; | |
object.address = ALLOCATION_POINTER; | |
// In real practice the allocation pointer is increased by the byte size | |
// of the object, here we have simplified version which just increases it | |
// (move to the next element of the heap array). | |
ALLOCATION_POINTER++; | |
return object; | |
} | |
// Let's add some objects on the heap. | |
// Object "a" is allocated at address 0 and is reachable from the "root": | |
var a = alloc({name: 'a'}); | |
// (Note: "roots" are initial places (usually global variables) from where | |
// we start GC analysis. Read about "roots" in the Mark and Sweep diff: | |
// https://gist.github.com/DmitrySoshnikov/4391763) | |
// Object "b" is allocated at address 1 and is reachable from "a" object: | |
var b = alloc({name: 'b'}); | |
a.b = b.address; // address (array index) where "b" is allocated on the heap | |
// Object "c" is allocated at address 2 and is also reachable from "a": root -> a -> c | |
var c = alloc({name: 'c'}); | |
a.c = c.address; // a.c = c in JS | |
// However, later it's reference from "a" is removed: | |
delete a.c; | |
// Now "c" is a candidate for GC. | |
// Object "d" is allocated at address 3 and is reachable from "b" (which in turn is | |
// reachable from "a": root -> a -> b -> d | |
var d = alloc({name: 'd'}); | |
b.d = d.address; // like b.d = d in JS | |
// But then the "b" reference is removed from "a". | |
delete a.b; | |
// This means that "d" still has the reference to it from "b", but it's | |
// not reachable (since the "b" itself is not reachable anymore). | |
// root -> a --X--> b -> d | |
// Notice the important point: that an object has some references to it | |
// doesn't mean it cannot be GC'ed. The criteria is "reachability", but not the | |
// reference counting here. | |
// Object "e" is allocated at address 4 and is reachable from "a" | |
var e = alloc({name: 'e'}); | |
a.e = e.address; | |
// And "e" also has back-reference to "a". | |
e.a = a.address; | |
// After these manipulations the heap still contains five objects: | |
// [{a}, {b}, {c}, {d}, {e}], but only "a" and "e" objects are reachable | |
// (starting from the root, the "a" object). | |
// --------------------------------------------------------------------------- | |
// 2. Stop and Copy GC overall algorithm | |
// --------------------------------------------------------------------------- | |
// As in the Mark and Sweep GC (https://gist.github.com/DmitrySoshnikov/4391763) | |
// we need to traverse the graph of all reachable objects on the heap. | |
// However, good advantage in contrast with Mark and Sweep is that we don't have to | |
// traverse the whole heap after that to clean the garbage. Although, | |
// we should do some another manipulations -- as the name assumes, Copy the | |
// objects to reorder them in the Old and New space. | |
// When the Stop and Copy GC starts, here when the reserved "New space" | |
// takes palce. Overall idea of S&C GC is very simple: | |
// | |
// 1. Every reachable object is *copied* from the Old space to the New space | |
// | |
// 2. And then Old and New spaces are *swapped* in their roles. Now the New | |
// space becomes the working program runtime space (becomes "Old") and the | |
// previous Old space becomes reserved for the next GC cycle | |
// Eventually after the GC, the heap from the state: | |
// | |
// [{a}, {b}, {c}, {d}, {e} | Free space || New space ] | |
// | |
// should be transformed into: | |
// | |
// [ New space || {a}, {e} | Free space ] | |
// --------------------------------------------------------------------------- | |
// 3. Fixing the pointers issue | |
// --------------------------------------------------------------------------- | |
// The copying itself wouldn't be any sort of problem. In real practice the | |
// bit-for-bit copy is used, so the shape of the objects is kept the same. | |
// Some properties of objects may be references to other objects. However, | |
// because of this bit-for-bit copy, the *pointers* in the objects are copied | |
// as well. Which means the pointers still refer *old* addresses (from the | |
// previous old space!) on the heap. | |
// So after the copying we should *fix all pointers* in the objects to | |
// point to the new space where the objects were copied. | |
// The problem is that at copying an object it's *not obvious* which other | |
// objects refer this object (and we should fix exactly the pointers of those | |
// object to point to the new place where our object is moved: | |
// | |
// E.g.: | |
// | |
// - Object "foo" is copied from address 1 (from Old space) to New address 10. | |
// var foo = {name: 'foo', address: 1}; | |
// var bar = {name: 'bar', address: 2}; | |
// var baz = {name: 'baz', address: 3}; | |
// bar.foo = foo.address; | |
// baz.foo = foo.address; | |
// copyToNewSpace(foo); // copy "foo" at GC to e.g. address 10 | |
// - But "bar.foo" and "baz.foo" still have the old addresses 1 and we can't | |
// tell at copying of "foo" object which objects ("bar" and "baz") refer it | |
// in order to fix their ".foo" pointers to 10. | |
// --------------------------------------------------------------------------- | |
// 4. Solution for the fixing pointers issue, the Forwarding pointers. | |
// --------------------------------------------------------------------------- | |
// A technique which may help us to solve this issue is a *forwarding address* -- | |
// the special marker value which we can put on the object when copy it. | |
// At processing the objects, the trick how we mark the copied objects with the | |
// forwarding addresses and update other objects pointers to it is achieved | |
// by *partitioning* the New space into three parts: | |
// | |
// - Copied and Scanned objects | |
// - Just copied objects | |
// - And the Free space | |
// | |
// The first partition ("Copied and Scanned") contains the objects which we | |
// copied and also *analyzed all its pointer sub-properties* (i.e. updated their | |
// addresses to the new memory locations). | |
// | |
// The "Just copied" section holds the object we've just copied and haven't | |
// processed yet (this is the actual working list of GC algorithm as we'll see). | |
// It appears when we scanning sub-objects of the parent object and | |
// just copy them. The sub-objects themselves will be scanned too, but on the next step. | |
// | |
// And the Free space is obviously the space from where we allocate copied objects. | |
// | |
// [ Old space || Copied & Scanned | Just copied | Free space ] | |
// | |
// The bars separating the sections are the "Scan pointer" (separates the "copied | |
// and scanned" from "just scanned") and the "Allocation pointer" (separates the "Free | |
// space"). | |
// | |
// We should process any object (scan their sub-properties) in the "Just copied" | |
// section (advancing the scan and allocation pointers). If an object is processed, | |
// it's automatically moves to the "Copied and Scanned" section by moving the "Scan" | |
// pointer. And once the scan and allocation pointer are equal -- we're done. | |
// Initially Scan and Allocation pointers are set to the boundary of the | |
// Old and New space (that is, to the middle of our heap array). | |
ALLOCATION_POINTER = NEW_SPACE_BOUND; // We going to allocate from the New space now at GC. | |
var SCAN_POINTER = ALLOCATION_POINTER; // And the Scan pointer is set initially here too. | |
// Helper function to copy objects to new locations. Allocates a copy of the object | |
// in the New space (automatically increasing the allocation pointer), and marks | |
// the old object as copied setting the forwarding address. | |
function copyToNewSpace(object) { | |
// Make a copy | |
var newObject = {}; | |
Object.keys(object).forEach(function(prop) { | |
// Copy all properties except the system addresses. | |
if (prop != 'address' && prop != 'forwardingAddress') { | |
newObject[prop] = object[prop]; | |
} | |
}); | |
// Mark the old object as copied (add the forwarding address) | |
object.forwardingAddress = ALLOCATION_POINTER; | |
// Put on the heap (which increases the allocation pointer). | |
alloc(newObject); | |
return newObject; | |
} | |
// Helper function checking whether a value is an address. We use simplified version | |
// where the address is the heap array index (which are numbers). E.g. a.b = 1. | |
function isPointer(name, value) { | |
// Don't consider "system" addresses fields | |
// (like "address" and "forwardinAddress", check only | |
// user-level pointers) | |
return typeof value == 'number' && name != 'address' | |
&& name != 'forwardingAddress'; | |
} | |
// --------------------------------------------------------------------------- | |
// 5. Actual Stop and Copy GC algorithm. | |
// --------------------------------------------------------------------------- | |
function gc() { | |
// Start copy "root" objects to the New space. For simplicity, | |
// we have only one reachable root object -- the "a" object. Let's copy it | |
// to the new space, by automatically increasing the allocation pointer, but | |
// still keeping the scan pointer at its position. | |
var copiedA = copyToNewSpace(a); | |
a.forwardingAddress = copiedA.address; | |
// [ {a}, {b}, {c}, {d}, {e} || |{a} | Free space ] | |
// Now, when we have copied something, the Scan and Allocation pointers | |
// are different, which means we have something to process (to scan the properties). | |
// For now we have only "a" object there. At scanning we copy all these sub-objects | |
// and mark them as copied too (set the "forwarding address" flag) | |
while (SCAN_POINTER != ALLOCATION_POINTER) { | |
// Get the next object at Scan pointer | |
var objectToScan = heap[SCAN_POINTER]; | |
// And traverse it checking all its reference properties | |
for (var p in objectToScan) if (isPointer(p, objectToScan[p])) { | |
var address = objectToScan[p]; | |
// Get the object to which this reference points to: | |
var objectAtAddress = heap[address]; | |
// If that object hasn't been copied yet (i.e. hasn't forwarding address set) | |
if (!('forwardingAddress' in objectAtAddress)) { | |
// Then we copy this sub-object too and mark it specifying forwarding address. | |
var copiedObjectAtAddress = copyToNewSpace(objectAtAddress); | |
// And we also *fix* the pointer value on the scanning object to | |
// refer to the new location. | |
objectToScan[p] = copiedObjectAtAddress.address; | |
// Else, the object which this sub-property refers to, was already copied at | |
// some previous scan of some other objects (an object can be referred by many | |
// properties in different objects) | |
} else { | |
// Then we just update the pointer to the forwarding address | |
objectToScan[p] = objectAtAddress.forwardingAddress; | |
} | |
} | |
// And then we move to the next object to scan (the sub-objects which were copied | |
// at scanning of their parent object, and which not scanned yet. | |
SCAN_POINTER++; | |
} | |
// Now we can swap Old and New space, making the New space our working | |
// runtime memory, and the Old space reserved for GC. | |
for (var k = OLD_SPACE_BOUND; k < NEW_SPACE_BOUND; k++) { | |
// Just clean old space for the debug purpose. In real practice it's not | |
// necessary, this addresses can be just overridden by later allocations. | |
delete heap[k]; | |
} | |
var tmp = NEW_SPACE_BOUND; | |
NEW_SPACE_BOUND = OLD_SPACE_BOUND; // 0 | |
OLD_SPACE_BOUND = tmp; // 5 | |
} | |
// Heap before GC: | |
// [{a}, {b}, {c}, {d}, {e} | Free space || New Space ] | |
// | |
// Notice, that the address of "e" in the "a" object is 4, and | |
// the back-reference address of "a" on "e" is 0. | |
console.log('Heap before GC:', heap); | |
// Run GC | |
gc(); | |
// After GC: | |
// [ New Space || {a}, {e} | Free space ] | |
// | |
// Notice, that the address of "e" object in the "a" is correctly | |
// updated to the new location, 11, and the the back-reference | |
// address of "a" on "e" is 10. | |
console.log('Heap after GC:', heap); | |
// PS Notes: Stop and Copy considered the fasters GC algorithm (that's said, in contrast | |
// with Mark and Sweep we don't have to post-traverse the heap to remove the garbage. | |
// However, for some languages, such as C/C++, Stop and Copy cannot be applied, since | |
// since there the semantics for references and pointers are explicitly exposed and is | |
// visible for the user-level code and we cannot just move the objects and fix their pointers. | |
// | |
// For further reading I highly recommend this lecture on Stop and Copy GC by Stanford | |
// professor Alex Aiken, https://class.coursera.org/compilers-2012-002/lecture/88 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment