@cellularmitosis
Created November 11, 2018 07:35
A user-space batching malloc wrapper.
All tests passed.
benchmarking...
naive malloc: elapsed: 59.271ms, rate: 16,871,657m/s, n: 1,000,000 (1)
pp_malloc: elapsed: 28.880ms, rate: 34,626,039m/s, n: 1,000,000
pp_malloc.h:
/*
pp_malloc.h: a batching malloc wrapper.
See https://github.com/pepaslabs/pp_c
Copyright (c) 2018 Jason Pepas
Released under the terms of the MIT License.
See https://opensource.org/licenses/MIT
This is a single-file C library.
See https://github.com/nothings/stb/blob/master/docs/stb_howto.txt
How to use this file:
- From only one file in your codebase, include this as an implementation:
#define PP_MALLOC_IMPLEMENTATION
#include "pp_malloc.h"
- From as many files as needed, include this as a header:
#include "pp_malloc.h"
History:
- v0.1 (2018/11/10): initial release
*/
// MARK: - Header
#ifndef PP_MALLOC_HEADER
#define PP_MALLOC_HEADER
#include <stdlib.h> // for size_t, NULL, etc.
typedef struct _pp_malloc_t pp_malloc_t;
// Create / destroy a pp_malloc_t.
pp_malloc_t* pp_malloc_create(size_t object_size);
void pp_malloc_free(pp_malloc_t** ptr);
// The primary interface to the batched malloc / free implementation.
void* pp_malloc(pp_malloc_t* self);
void pp_free(pp_malloc_t* self, void** ptr);
#endif // PP_MALLOC_HEADER
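// Example usage (an illustrative sketch, not part of the library; the object size
// of 64 bytes is arbitrary, any fixed size works):
//
//     pp_malloc_t* pool = pp_malloc_create(64); // every allocation will be 64 bytes
//     void* obj = pp_malloc(pool);              // claim one 64-byte slot
//     pp_free(pool, &obj);                      // release the slot; obj is set to NULL
//     pp_malloc_free(&pool);                    // release the pool; pool is set to NULL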
// MARK: - Implementation
#ifdef PP_MALLOC_IMPLEMENTATION
#include <assert.h>
#include <stdbool.h>
#include <strings.h> // for bzero
#include <stdint.h>
#include <limits.h>
// Similar to CHAR_BIT, LONG_BIT, etc., we define BITMASK_BIT.
#define BITMASK_BIT 64
typedef uint64_t bitmask_t;
#define BITMASK_MAX UINT64_MAX
// #define BITMASK_BIT 32
// typedef uint32_t bitmask_t;
// #define BITMASK_MAX UINT32_MAX
// #define BITMASK_BIT 16
// typedef uint16_t bitmask_t;
// #define BITMASK_MAX UINT16_MAX
// #define BITMASK_BIT 8
// typedef uint8_t bitmask_t;
// #define BITMASK_MAX UINT8_MAX
// MARK: - bitmask functions
// Check if a bitmask is all binary 1's.
static bool _bitmask_is_full(bitmask_t bitmask) {
return bitmask == (bitmask_t)BITMASK_MAX;
}
// Check if an individual bit is set in a bitmask.
static bool _bit_is_set(bitmask_t bitmask, uint8_t bit_index) {
// Shift a bitmask_t-width 1 so the shift is well-defined even where 'unsigned long' is 32 bits.
return bitmask & ((bitmask_t)1 << bit_index);
}
// Return the index of the first unset bit in a bitmask.
uint8_t _bitmask_index_of_first_unset_bit(bitmask_t bitmask, uint8_t start) {
for (uint8_t i = start; i < BITMASK_BIT; i++) {
if (!_bit_is_set(bitmask, i)) {
return i;
}
}
return UINT8_MAX;
}
// Set the bit at the given index to 1.
void _set_bit(bitmask_t* bitmask, uint8_t bit_index) {
(*bitmask) |= ((bitmask_t)1 << bit_index);
}
// Clear the bit at the given index to 0.
void _clear_bit(bitmask_t* bitmask, uint8_t bit_index) {
(*bitmask) &= ~((bitmask_t)1 << bit_index);
}
// MARK: - slab_t implementation
typedef unsigned char byte_t;
// pp_slab_t: A slab of memory, sliced up into "slots".
// A slab has BITMASK_BIT slots: 64 by default (switch the typedefs above for 32, 16 or 8).
// 'bitmask' indicates the used / free status of each slot.
// A pp_slab_t is "over-allocated": slot0 is merely a placeholder
// address which marks the start of the slots.
// The size of a slab is calculated as:
// sizeof(pp_slab_t) - 1 + (BITMASK_BIT * slot_size)
struct _pp_slab_t {
bitmask_t bitmask;
uint8_t first_free_index;
size_t slot_size;
byte_t slot0[1];
};
typedef struct _pp_slab_t pp_slab_t;
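// Worked example of slot addressing (illustrative, assuming slot_size == 16 and the
// default BITMASK_BIT == 64): the slab is malloc'd as one block of
// sizeof(pp_slab_t) - 1 + (64 * 16) bytes, slot i begins at &slot0[i * 16], so slot 3
// starts 48 bytes past slot0, and bit 3 of 'bitmask' tracks whether that slot is in use.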
// Allocate, initialize and return a slab.
pp_slab_t* pp_slab_create(size_t slot_size) {
pp_slab_t* slab = (pp_slab_t*)malloc(sizeof(pp_slab_t) - 1 + BITMASK_BIT * slot_size);
assert(slab != NULL);
slab->bitmask = 0;
slab->first_free_index = 0;
slab->slot_size = slot_size;
assert(slab->slot0 != NULL);
return slab;
}
// Return a slab's memory to the operating system.
void pp_slab_free(pp_slab_t** slab) {
assert(slab != NULL && *slab != NULL);
free(*slab);
*slab = NULL;
}
// Check whether a slab is full.
bool pp_slab_is_full(pp_slab_t *slab) {
assert(slab != NULL);
return _bitmask_is_full(slab->bitmask);
}
// Return the index of the first free slot in a slab.
// Note: It is an error to call this function when the slab is full.
static uint8_t _pp_slab_next_free_index(pp_slab_t* slab) {
assert(slab != NULL);
return slab->first_free_index;
}
// Return a pointer to the slot at the given index in a slab.
static void* _pp_slab_slot(pp_slab_t* slab, uint8_t slot_index) {
assert(slab != NULL);
byte_t* addr = &(slab->slot0[slot_index * slab->slot_size]);
return (void*)addr;
}
// Mark the slot at the given index in a slab as used.
static void _pp_slab_mark_used(pp_slab_t* slab, uint8_t slot_index) {
assert(slab != NULL);
_set_bit(&(slab->bitmask), slot_index);
slab->first_free_index = _bitmask_index_of_first_unset_bit(slab->bitmask, slot_index);
}
// Mark the slot at the given index in a slab as available.
static void _pp_slab_mark_free(pp_slab_t* slab, uint8_t slot_index) {
assert(slab != NULL);
_clear_bit(&(slab->bitmask), slot_index);
if (slot_index < slab->first_free_index) {
slab->first_free_index = slot_index;
}
}
// Find the next available slot in a slab, mark it as used, and return its address.
// Note: It is an error to call this function on a full slab.
void* pp_slab_claim_next(pp_slab_t* slab) {
assert(slab != NULL);
assert(pp_slab_is_full(slab) == false);
uint8_t i = _pp_slab_next_free_index(slab);
_pp_slab_mark_used(slab, i);
return _pp_slab_slot(slab, i);
}
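// For example (illustrative only): on a fresh slab, three consecutive calls to
// pp_slab_claim_next() return the addresses of slots 0, 1 and 2, leaving
// bitmask == 0b111 and first_free_index == 3.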
// Return the slot index of the given pointer (assuming it lies within this slab).
static uint8_t _pp_slab_slot_index_of_ptr(pp_slab_t* slab, void* ptr) {
assert(slab != NULL);
assert(ptr >= (void*)(slab->slot0));
assert(ptr < (void*)(slab->slot0) + (slab->slot_size * BITMASK_BIT));
// The byte offset of ptr from the start of the slots (slot0).
size_t byte_offset = (size_t)((byte_t*)ptr - slab->slot0);
assert(byte_offset % slab->slot_size == 0);
return (uint8_t)(byte_offset / slab->slot_size);
}
// If the given pointer lies within this slab, mark its slot free and return true.
bool pp_slab_release(pp_slab_t* slab, void** ptr) {
assert(slab != NULL);
void* first = (void*)(slab->slot0);
void* last = (void*)(slab->slot0) + (slab->slot_size * BITMASK_BIT);
if (*ptr < first || *ptr >= last) {
return false;
}
uint8_t i = _pp_slab_slot_index_of_ptr(slab, *ptr);
_pp_slab_mark_free(slab, i);
*ptr = NULL;
return true;
}
// MARK: - pp_slabtree_t implementation
// A union to access the children of a slabtree as either slabtrees or slabs.
// "leaf" nodes (height == 1) will use 'as_slabs', while "internal" nodes
// (height > 1) will use 'as_slabtrees'.
union _pp_slabtree_children_t {
struct _pp_slabtree_t* as_slabtrees[BITMASK_BIT];
struct _pp_slab_t* as_slabs[BITMASK_BIT];
};
typedef union _pp_slabtree_children_t pp_slabtree_children_t;
// pp_slabtree_t: a tree to organize slabs.
struct _pp_slabtree_t {
bitmask_t bitmask;
// The index of the first slot which isn't full.
// (This is a cache to avoid scanning the bitmask).
uint8_t first_free_index;
// The address range of all slabs within this (sub)tree.
// Note: this is going to cause a performance problem because
// slab ranges will overlap as a tree grows, eventually forcing
// a search of (most of) the entire tree to find a pointer's slab.
void* addr_range_first;
void* addr_range_last;
// The height of this tree node.
// slabtrees which have slabs as children have a height of 1.
// "internal" slabtrees have a height > 1.
uint16_t height;
union _pp_slabtree_children_t children;
};
typedef struct _pp_slabtree_t pp_slabtree_t;
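// Worked capacity example (illustrative, with the default BITMASK_BIT of 64): a node of
// height 1 holds up to 64 slabs, i.e. 64 * 64 = 4,096 slots; a node of height 2 holds up
// to 64 such subtrees, i.e. 64^3 = 262,144 slots; in general a tree of height h holds up
// to 64^(h+1) slots.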
// Allocate, initialize and return a slabtree.
pp_slabtree_t* pp_slabtree_create(void) {
pp_slabtree_t* tree = (pp_slabtree_t*)malloc(sizeof(pp_slabtree_t));
assert(tree != NULL);
bzero(tree, sizeof(pp_slabtree_t));
tree->height = 1;
return tree;
}
// Return a slabtree's memory to the operating system.
// Recursively frees its entire subtree, including slabs.
void pp_slabtree_free(pp_slabtree_t** ptr) {
assert(ptr != NULL && *ptr != NULL);
pp_slabtree_t* t = *ptr;
if (t->height > 1) {
for (uint8_t i = 0; i < BITMASK_BIT; i++) {
if (t->children.as_slabtrees[i] != NULL) {
pp_slabtree_free(&(t->children.as_slabtrees[i]));
}
}
} else {
for (uint8_t i = 0; i < BITMASK_BIT; i++) {
if (t->children.as_slabs[i] != NULL) {
pp_slab_free(&(t->children.as_slabs[i]));
}
}
}
// Free this node itself (its children have already been freed above).
free(t);
*ptr = NULL;
}
// Check if all of the slabs within this (sub)tree are full.
bool pp_slabtree_is_full(pp_slabtree_t* tree) {
assert(tree != NULL);
return _bitmask_is_full(tree->bitmask);
}
// Grow a slabtree up and to the right.
static void pp_slabtree_grow(pp_slabtree_t** tree) {
assert(tree != NULL && *tree != NULL);
pp_slabtree_t* t = *tree;
pp_slabtree_t* parent = pp_slabtree_create();
parent->height = t->height + 1;
parent->children.as_slabtrees[0] = t;
parent->bitmask = 1; // child 0 (the old root) is full; that is why we are growing.
parent->first_free_index = 1;
// Carry the old root's address range up into the new root.
parent->addr_range_first = t->addr_range_first;
parent->addr_range_last = t->addr_range_last;
*tree = parent;
}
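// For example (illustrative): if the root is a full height-1 node (4,096 slots),
// pp_slabtree_grow() allocates a new height-2 root, installs the old root as child 0,
// and marks that child as full, so the next pp_slabtree_claim_next() lands in child 1.
// Each grow multiplies the tree's capacity by 64.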
// Find the next available slot, mark it as used, and return its pointer.
// Note: It is an error to call this function on a full tree.
void* pp_slabtree_claim_next(pp_slabtree_t* tree, size_t object_size) {
assert(tree != NULL);
uint8_t i = tree->first_free_index;
void* ret;
// This is an internal node.
if (tree->height > 1) {
// Grab the first non-full subtree.
pp_slabtree_t* child = tree->children.as_slabtrees[i];
// ...which might not have been allocated yet.
if (child == NULL) {
// ...so create one and stitch it into the tree.
child = pp_slabtree_create();
child->height = tree->height - 1;
tree->children.as_slabtrees[i] = child;
}
// Recurse down into the subtree until you hit a leaf node.
ret = pp_slabtree_claim_next(child, object_size);
// If the subtree's bitmask has changed, update our first_free_index.
if (_bitmask_is_full(child->bitmask)) {
_set_bit(&(tree->bitmask), i);
tree->first_free_index = _bitmask_index_of_first_unset_bit(tree->bitmask, i);
}
// Update this node's address range book-keeping.
if (ret < tree->addr_range_first || ret >= tree->addr_range_last) {
// Scan our subtrees, adjusting our total address range as needed.
// FIXME: actually, this is broken. this implements a one-way
// ratcheting mechanism which never recedes. Fix this by setting
// first to MAX and last to 0 before starting the scan.
for (uint8_t i = 0; i < BITMASK_BIT; i++) {
child = tree->children.as_slabtrees[i];
if (child == NULL) {
continue;
}
// Check both ends independently: a child can extend the range in both directions.
if (tree->addr_range_first == NULL || child->addr_range_first < tree->addr_range_first) {
tree->addr_range_first = child->addr_range_first;
}
if (child->addr_range_last > tree->addr_range_last) {
tree->addr_range_last = child->addr_range_last;
}
}
}
}
// This is a leaf node.
else {
// Grab the first non-full slab.
pp_slab_t* slab = tree->children.as_slabs[i];
// ...which might not have been allocated yet.
if (slab == NULL) {
// ...so create one and stitch it into the tree.
slab = pp_slab_create(object_size);
tree->children.as_slabs[i] = slab;
// Update this node's address range book-keeping.
// Check both ends independently, treating a NULL (unset) lower bound as "always update".
if (tree->addr_range_first == NULL || (void*)slab < tree->addr_range_first) {
tree->addr_range_first = slab;
}
void* slab_end = (void*)(slab->slot0) + (slab->slot_size * BITMASK_BIT);
if (slab_end > tree->addr_range_last) {
tree->addr_range_last = slab_end;
}
}
// Claim the next available slot in the slab.
ret = pp_slab_claim_next(slab);
// If the slab's bitmask has changed, update our first_free_index.
if (_bitmask_is_full(slab->bitmask)) {
_set_bit(&(tree->bitmask), i);
tree->first_free_index = _bitmask_index_of_first_unset_bit(tree->bitmask, i);
}
}
// Bubble the recursive result back up.
return ret;
}
// Find the slot of the given pointer and mark it as available.
bool pp_slabtree_release(pp_slabtree_t* tree, void** ptr) {
assert(tree != NULL);
// If ptr was outside of this subtree's range, return false.
if (*ptr < tree->addr_range_first || *ptr > tree->addr_range_last) {
return false;
}
// If this is an internal tree node...
if (tree->height > 1) {
// Scan through all (64) subtrees, looking for ptr's subtree.
// The fact that slabs are not guaranteed to be in increasing
// order of address space is what necessitates this complete
// scan of all 64 subtrees, because the slab ranges of the
// subtrees can overlap. This is problematic for performance.
// To address this issue, I will likely need to move to a
// tree which maintains slab sort order (which isn't so bad
// if we pre-allocate all 64 slabs of a leaf node, because then
// we just have to shuffle the tree nodes, rather than shuffling
// individual slabs).
for (uint8_t i = 0; i < BITMASK_BIT; i++) {
pp_slabtree_t* child = tree->children.as_slabtrees[i];
if (child == NULL) {
continue;
}
// If ptr was in this subtree, end the scan.
bool result = pp_slabtree_release(child, ptr);
if (result) {
return result;
}
}
// The scan completed and we didn't find ptr within our subtree.
return false;
}
// Otherwise, this is a leaf node.
else {
// Scan the slabs, looking for ptr.
for (uint8_t i = 0; i < BITMASK_BIT; i++) {
pp_slab_t* slab = tree->children.as_slabs[i];
if (slab == NULL) {
continue;
}
// If ptr was in this slab, end the scan.
bool result = pp_slab_release(slab, ptr);
if (result) {
return result;
}
}
// The scan completed and we didn't find ptr in any of our slabs.
return false;
}
}
// MARK: - pp_malloc_t implementation
// pp_malloc_t: a user-space managed memory pool.
struct _pp_malloc_t {
pp_slabtree_t* tree;
size_t object_size;
};
// Allocate, initialize and return a user-space malloc pool.
pp_malloc_t* pp_malloc_create(size_t object_size) {
pp_malloc_t* self = (pp_malloc_t*)malloc(sizeof(pp_malloc_t));
assert(self != NULL);
self->object_size = object_size;
self->tree = pp_slabtree_create();
pp_slab_t* slab = pp_slab_create(object_size);
self->tree->children.as_slabs[0] = slab;
self->tree->addr_range_first = slab;
void* last = (void*)(slab->slot0) + (slab->slot_size * BITMASK_BIT);
self->tree->addr_range_last = last;
return self;
}
// Return the memory of a user-space malloc pool to the operating system.
void pp_malloc_free(pp_malloc_t** ptr) {
pp_slabtree_free(&((*ptr)->tree));
free(*ptr);
*ptr = NULL;
}
// A user-space malloc wrapper, designed to decrease malloc overhead
// by batching (64) calls to malloc.
// Initial (trivial) testing shows this to be 1.5x to 2x as fast
// as unwrapped malloc calls. However, I have not yet tested a workload
// involving heavy use of 'pp_free', so performance might not be 2x
// for long-lived processes.
void* pp_malloc(pp_malloc_t* self) {
assert(self != NULL);
if (pp_slabtree_is_full(self->tree)) {
pp_slabtree_grow(&(self->tree));
}
return pp_slabtree_claim_next(self->tree, self->object_size);
}
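// For example (illustrative): allocating 4,096 objects through pp_malloc touches libc
// malloc roughly 66 times (64 slabs, one tree node, one pp_malloc_t), versus 4,096 times
// with naive malloc, which is where the speedup comes from.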
// This is the corresponding free wrapper.
void pp_free(pp_malloc_t* self, void** ptr) {
assert(self != NULL);
assert(ptr != NULL && *ptr != NULL);
// Don't perform the release inside assert(): it would be skipped if NDEBUG were defined.
bool released = pp_slabtree_release(self->tree, ptr);
assert(released);
(void)released;
}
#endif // PP_MALLOC_IMPLEMENTATION
// MARK: - Tests
#ifdef PP_MALLOC_TESTS
static void test_bitmask_is_full() {
bitmask_t bitmask = 0;
assert(_bitmask_is_full(bitmask) == false);
bitmask--;
assert(_bitmask_is_full(bitmask) == true);
bitmask = 12345;
assert(_bitmask_is_full(bitmask) == false);
}
static void test_bit_is_set() {
bitmask_t bitmask = 0;
assert(_bit_is_set(bitmask, 0) == false);
assert(_bit_is_set(bitmask, 1) == false);
assert(_bit_is_set(bitmask, 2) == false);
bitmask = 1;
assert(_bit_is_set(bitmask, 0) == true);
assert(_bit_is_set(bitmask, 1) == false);
assert(_bit_is_set(bitmask, 2) == false);
bitmask = 2;
assert(_bit_is_set(bitmask, 0) == false);
assert(_bit_is_set(bitmask, 1) == true);
assert(_bit_is_set(bitmask, 2) == false);
bitmask = 3;
assert(_bit_is_set(bitmask, 0) == true);
assert(_bit_is_set(bitmask, 1) == true);
assert(_bit_is_set(bitmask, 2) == false);
bitmask = 4;
assert(_bit_is_set(bitmask, 0) == false);
assert(_bit_is_set(bitmask, 1) == false);
assert(_bit_is_set(bitmask, 2) == true);
}
static void test_bitmask_index_of_first_unset_bit() {
bitmask_t bitmask = 0;
assert(_bitmask_index_of_first_unset_bit(bitmask, 0) == 0);
bitmask = 1;
assert(_bitmask_index_of_first_unset_bit(bitmask, 0) == 1);
bitmask = 3;
assert(_bitmask_index_of_first_unset_bit(bitmask, 0) == 2);
bitmask = 7;
assert(_bitmask_index_of_first_unset_bit(bitmask, 0) == 3);
bitmask = -1;
assert(_bitmask_index_of_first_unset_bit(bitmask, 0) == UINT8_MAX);
}
static void test_set_bit() {
bitmask_t bitmask = 0;
_set_bit(&bitmask, 0);
assert(bitmask == 1);
_set_bit(&bitmask, 1);
assert(bitmask == 3);
_set_bit(&bitmask, 2);
assert(bitmask == 7);
bitmask = 0;
_set_bit(&bitmask, 31);
assert(bitmask == 2147483648UL);
bitmask = 0;
_set_bit(&bitmask, 63);
// hmm, "error: integer literal is too large to be represented in any integer type"
// assert(bitmask == 18446744073709551616UL);
// thanks to https://stackoverflow.com/questions/35136202/how-do-i-mute-this-error-integer-literal-is-too-large-to-be-represented-in-a#comment57995212_35136303
// turns out integer constants aren't actually unsigned?
assert(bitmask == 0x8000000000000000);
}
static void test_clear_bit() {
bitmask_t bitmask = 7;
_clear_bit(&bitmask, 0);
assert(bitmask == 6);
_clear_bit(&bitmask, 1);
assert(bitmask == 4);
_clear_bit(&bitmask, 2);
assert(bitmask == 0);
}
static void test_pp_slab_claim_next() {
size_t slot_size = 1;
pp_slab_t* slab = pp_slab_create(slot_size);
assert(slab->bitmask == 0);
void* ret = pp_slab_claim_next(slab);
assert(ret != NULL);
assert(slab->bitmask == 1);
for (int i = 1; i < BITMASK_BIT; i++) {
pp_slab_claim_next(slab);
}
assert(pp_slab_is_full(slab));
pp_slab_free(&slab);
}
static void test_pp_slab_is_full() {
size_t slot_size = 1;
pp_slab_t* slab = pp_slab_create(slot_size);
for (int i = 0; i < BITMASK_BIT; i++) {
pp_slab_claim_next(slab);
}
assert(pp_slab_is_full(slab));
pp_slab_free(&slab);
}
static void test_pp_slabtree_is_full() {
pp_malloc_t* md = pp_malloc_create(1);
size_t n = BITMASK_BIT - 1;
for (size_t i = 0; i < n; i++) {
pp_malloc(md);
}
assert(!pp_slab_is_full(md->tree->children.as_slabs[0]));
assert(md->tree->bitmask == 0);
pp_malloc(md);
assert(pp_slab_is_full(md->tree->children.as_slabs[0]));
assert(md->tree->bitmask == 1);
pp_malloc_free(&md);
}
static void test_slab_create_and_free() {
pp_slab_t* s = pp_slab_create(1);
pp_slab_free(&s);
assert(s == NULL);
}
static void test_slabtree_create_and_free() {
pp_slabtree_t* s = pp_slabtree_create();
pp_slabtree_free(&s);
assert(s == NULL);
}
static void test_malloc_create_and_free() {
pp_malloc_t* m = pp_malloc_create(1);
pp_malloc_free(&m);
assert(m == NULL);
m = pp_malloc_create(1);
// create a tree of depth 1:
size_t n = BITMASK_BIT;
for (size_t i = 0; i < n; i++) {
pp_malloc(m);
}
pp_malloc_free(&m);
assert(m == NULL);
m = pp_malloc_create(1);
// create a tree of depth 2:
n = BITMASK_BIT * BITMASK_BIT;
for (size_t i = 0; i < n; i++) {
pp_malloc(m);
}
pp_malloc_free(&m);
assert(m == NULL);
m = pp_malloc_create(1);
// create a tree of depth 3:
n = BITMASK_BIT * BITMASK_BIT * BITMASK_BIT;
for (size_t i = 0; i < n; i++) {
pp_malloc(m);
}
pp_malloc_free(&m);
assert(m == NULL);
}
static void test_malloc_and_free() {
pp_malloc_t* m = pp_malloc_create(1);
void* ptr = pp_malloc(m);
pp_free(m, &ptr);
assert(ptr == NULL);
pp_malloc_free(&m);
assert(m == NULL);
}
/*
To TDD this file, implement a main.c:
#define PP_MALLOC_IMPLEMENTATION
#define PP_MALLOC_TESTS
#include "pp_malloc.h"
int main() { test(); return 0; }
Then run this in a shell:
watch -d -n1 'gcc -std=c99 -Wall main.c && ./a.out'
Or, if you don't have 'watch' installed:
while true ; do sleep 1 ; gcc -std=c99 -Wall main.c && ./a.out ; done
Each time you edit and save, look at the terminal to see if the tests are
still passing.
*/
#include <stdio.h>
void test() {
test_bitmask_is_full();
test_bit_is_set();
test_bitmask_index_of_first_unset_bit();
test_set_bit();
test_clear_bit();
test_pp_slab_claim_next();
test_pp_slab_is_full();
test_pp_slabtree_is_full();
test_slab_create_and_free();
test_slabtree_create_and_free();
test_malloc_create_and_free();
test_malloc_and_free();
printf("All tests passed.\n");
}
#endif // PP_MALLOC_TESTS
// MARK: - Performance testing
#ifdef PP_MALLOC_BENCHMARK
#include <stdio.h>
#include <sys/time.h> // for gettimeofday()
#include <locale.h>
static unsigned long benchmark_naive_malloc(size_t object_size, size_t n) {
unsigned long sum = 0;
for (size_t i = 0; i < n; i++) {
sum = sum + (unsigned long)malloc(object_size);
}
return sum;
}
static void benchmark_batched_malloc(size_t object_size, size_t n) {
pp_malloc_t* md = pp_malloc_create(object_size);
for (size_t i = 0; i < n; i++) {
pp_malloc(md);
}
}
void benchmark() {
setlocale(LC_NUMERIC, "");
printf("benchmarking...\n");
struct timeval then;
struct timeval now;
double elapsed1;
double elapsed2;
double rate1;
double rate2;
size_t object_size = 1;
size_t n = 1000000;
// this is just to avoid having the optimizer turn the naive test into a no-op.
unsigned long sum;
gettimeofday(&then, NULL);
sum = benchmark_naive_malloc(object_size, n);
gettimeofday(&now, NULL);
// thanks to https://stackoverflow.com/a/2150334/7543271
elapsed1 = (now.tv_sec - then.tv_sec) + (now.tv_usec - then.tv_usec) / 1000000.0;
rate1 = n / elapsed1;
printf("naive malloc: elapsed: %'0.3fms, rate: %'0.0fm/s, n: %'lu (%i)\n", elapsed1 * 1000, rate1, n, (bool)sum);
gettimeofday(&then, NULL);
benchmark_batched_malloc(object_size, n);
gettimeofday(&now, NULL);
elapsed2 = (now.tv_sec - then.tv_sec) + (now.tv_usec - then.tv_usec) / 1000000.0;
rate2 = n / elapsed2;
printf("pp_malloc: elapsed: %'0.3fms, rate: %'0.0fm/s, n: %'lu\n", elapsed2 * 1000, rate2, n);
// factor = 100.0 / ((double)elapsed2 / (double)elapsed1);
// printf("speedup: %0.1f%% faster\n", factor);
}
#endif // PP_MALLOC_BENCHMARK
pp_malloc_benchmark.c:
#define PP_MALLOC_IMPLEMENTATION
#define PP_MALLOC_BENCHMARK
#include "pp_malloc.h"
int main() { benchmark(); return 0; }
pp_malloc_test.c:
#define PP_MALLOC_IMPLEMENTATION
#define PP_MALLOC_TESTS
#include "pp_malloc.h"
int main() { test(); return 0; }
build script:
#!/bin/bash
set -e
gcc -O3 -std=c99 -Wall pp_malloc_test.c && ./a.out
gcc -O3 -std=c99 -Wall pp_malloc_benchmark.c && ./a.out