KubaO · December 5, 2020 08:28
diff --git a/matrix-multiply-v1-65117221 b/matrix-multiply-v1-65117221
 // complete compileable example begins
 #include <assert.h>
 #include <math.h>
 #include <stdarg.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>

 #define TRACK_MEMORY_USE 1
 #define TRACK_LIFETIME 0
 #define TRACK_VIEWS 0
 #define TRACK_DUMPS 0
 #define TRACK_ALLOCS 1

 #define KERNEL_SIZE 16   // 16-32 is the sweet spot usually
 #define TRANSPOSE_B 1    // enabling it improves things very slightly

 #if defined(__GNUC__) || defined(__clang__)
 #define _nodiscard_ __attribute__((warn_unused_result))
 #else
 #define _nodiscard_
 #endif

 enum TrackEvent {
    TE_CREATE,  // param = size of object
    TE_USE,
    TE_DESTROY, // param = size of object
    TE_ISEMPTY,
    TE_COUNT,
    TE_ALLOC,
    TE_MAX_COUNT,
    TE_MAX_ALLOC,
 };

 #if TRACK_MEMORY_USE || TRACK_LIFETIME
 size_t obj_track(enum TrackEvent event, const void *obj, size_t param);
 #else
 size_t obj_track(enum TrackEvent event, const void *obj, size_t param) { return 1; }
 #endif

 #define mat_use_check(M) do { \
    /* A view of a view must still refer directly to the shown matrix. */ \
    assert(!M->shown || !M->shown->shown); \
    assert(obj_track(TE_USE, mat_shown(M), 0)); \
 } while (0)

 struct Matrix {
    struct Matrix *shown; // a matrix being viewed, if any
    float *ptr;
    int n; // rows and columns in this range
    int row_stride; // distance between beginnings of rows (units: elements)
    int16_t tmp_count; // number of active temporary uses of this object
 } typedef Matrix;

 int strassen_entry_count;

 //! Returns the matrix being shown if M is a view, otherwise return M itself
 inline const Matrix *mat_shown(const Matrix *M) {
    return M->shown ? M->shown : M;
 }

 Matrix *mat_create(int n);
 void free_all(Matrix **M1, ...);
 void free_temp(Matrix *M);
 void free_all_temp(Matrix *M1, ...);
 static bool mat_same_layouts(const Matrix *A, const Matrix *B);
 void mat_print(const Matrix *M);

 Matrix *mat_transpose(Matrix *M);
 Matrix mat_block_view(const Matrix *matrix, int i, int j, int nbl);
 Matrix mat_linear_block_view(const Matrix *matrix, int i, int j, int nbl);

 Matrix *mat_add_to(Matrix *C, Matrix *A);
 Matrix *mat_sub_from(Matrix *C, Matrix *A);
 Matrix *mat_sum_to(Matrix *C, Matrix *A, Matrix *B);
 Matrix *mat_diff_to(Matrix *C, Matrix *A, Matrix *B);

 //
 // Multiplication
 //

 static void mat_mul_impl_(float *restrict C, const float *restrict A, const float *restrict B,
                          int C_row_stride, int A_row_stride, int B_row_stride);

 Matrix *mat_strassen_mul_impl_(Matrix *C, Matrix *A, Matrix *B, const Matrix *T) {
    ++ strassen_entry_count;
    mat_use_check(C);
    mat_use_check(A);
    mat_use_check(B);
    if (T) mat_use_check(T);
    int const N = C->n;
    assert(N >= KERNEL_SIZE && N == A->n && N == B->n && (!T || N <= T->n));

    if (N == KERNEL_SIZE) {
        mat_mul_impl_(C->ptr, A->ptr, B->ptr, C->row_stride, A->row_stride, B->row_stride);
    } else {
        Matrix A11 = mat_block_view(A, 0, 0, 2);
        Matrix A12 = mat_block_view(A, 0, 1, 2);
        Matrix A21 = mat_block_view(A, 1, 0, 2);
        Matrix A22 = mat_block_view(A, 1, 1, 2);
        Matrix B11 = mat_block_view(B, 0, 0, 2);
    #if TRANSPOSE_B
        Matrix B12 = mat_block_view(B, 1, 0, 2); // transposed
        Matrix B21 = mat_block_view(B, 0, 1, 2); // transposed
    #else
        Matrix B12 = mat_block_view(B, 0, 1, 2);
        Matrix B21 = mat_block_view(B, 1, 0, 2);
    #endif
        Matrix B22 = mat_block_view(B, 1, 1, 2);
        Matrix C11 = mat_block_view(C, 0, 0, 2);
        Matrix C12 = mat_block_view(C, 0, 1, 2);
        Matrix C21 = mat_block_view(C, 1, 0, 2);
        Matrix C22 = mat_block_view(C, 1, 1, 2);

        //  T1 == C12, T2 == C21, T3,T4,T5 : new
        // C11 = (M7) = (A12-A22) * (B21+B22) // lease T3
        // C22 = (M6) = (A21-A11) * (B11+B12) // lease T3
        //  T4 = (M1) = (A11+A22) * (B11+B22) // lease T3
        // C11 =  M7 + M1
        // C22 =  M6 + M1

        // C12 = (M5) = (A11+A12) * B22 // lease T3
        // C11 =  M7 + M1 - M5
        // C21 = (M2) = (A21+A22) * B11 // lease T3
        // C22 =  M6 + M1 - M2

        //  T4 = (M3) = A11 * (B12-B22) // lease T3
        // C12 =  M5 + M3
        // C22 =  M6 + M1 - M2 + M3
        //  T4 = (M4) = A22 * (B21-B11) // lease T3
        // C11 =  M7 + M1 - M5 + M4
        // C21 =  M2 + M4

        Matrix T3_, T4_, T5_, *T3 = NULL, *T4 = NULL, *T5 = NULL;
        if (T) {
            T3_ = mat_linear_block_view(T, 0, 0, 2);            
            T4_ = mat_linear_block_view(T, 0, 1, 2);
            T5_ = mat_linear_block_view(T, 1, 0, 2);
            T3 = &T3_;
            T4 = &T4_;
            T5 = &T5_;
        } else {
            T3 = mat_create(A11.n);
            T4 = mat_create(A11.n);
            T5 = mat_create(A11.n);
        }
        {
        Matrix *M1 = &C12;
        /*M7*/ mat_strassen_mul_impl_(&C11, mat_diff_to(T4, &A12, &A22), mat_sum_to(T5, &B21, &B22), T3);
        /*M6*/ mat_strassen_mul_impl_(&C22, mat_diff_to(T4, &A21, &A11), mat_sum_to(T5, &B11, &B12), T3);
        /*M1*/ mat_strassen_mul_impl_(M1, mat_sum_to(T4, &A11, &A22), mat_sum_to(T5, &B11, &B22), T3);
        mat_add_to(&C11, M1);
        mat_add_to(&C22, M1);
        }
        {
        Matrix *M5 = mat_strassen_mul_impl_(&C12, mat_sum_to(T5, &A11, &A12), &B22, T3);
        mat_sub_from(&C11, M5);
        Matrix *M2 = mat_strassen_mul_impl_(&C21, mat_sum_to(T5, &A21, &A22), &B11, T3);
        mat_sub_from(&C22, M2);
        }
        {
        Matrix *M3 = mat_strassen_mul_impl_(T4, &A11, mat_diff_to(T5, &B12, &B22), T3);
        mat_add_to(&C12, M3);
        mat_add_to(&C22, M3);
        }
        {
        Matrix *M4 = mat_strassen_mul_impl_(T4, &A22, mat_diff_to(T5, &B21, &B11), T3);
        mat_add_to(&C11, M4);
        mat_add_to(&C21, M4);
        }
        free_all(&T3, &T4, &T5, NULL);
    }
    free_all_temp(A, B, T, NULL);
    return C;
 }

 static void unpack_row_major(float *const restrict B, const float *const restrict A, int const Ars);
 static void unpack_col_major(float *const restrict B, const float *const restrict A, int const Ars); 
 #if 0
 static void pack_row_major(float *const restrict B, const float *const restrict A, int const Brs);
 static void pack_col_major(float *const restrict B, const float *const restrict A, int const Brs);
 #endif

 static void mat_mul_impl_(float *restrict C, const float *restrict A, const float *restrict B,
                          int C_row_stride, int A_row_stride, int B_row_stride)
 {
    enum { N = KERNEL_SIZE };
    float AA[N*N], BB[N*N];

    unpack_row_major(AA, A, A_row_stride);

    if (TRANSPOSE_B)
        unpack_row_major(BB, B, B_row_stride);
    else
        unpack_col_major(BB, B, B_row_stride);

    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {//00
            float accum = 0;
            for (int k = 0; k < N; ++k) {
                accum += AA[i*N+k] * BB[j*N+k];
            }
            C[i*C_row_stride+j] = accum;
        }
    }
 }

 static void unpack_row_major(float *const restrict B, const float *const restrict A, int const Ars)
 {
    const int N = KERNEL_SIZE;
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            B[i*N+j] = A[i*Ars+j];
 }

 static void unpack_col_major(float *const restrict B, const float *const restrict A, int const Ars) 
 {
    const int N = KERNEL_SIZE;
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            B[i*N+j] = A[j*Ars+i];
 }

 #if 0
 static void pack_row_major(float *const restrict B, const float *const restrict A, int const Brs)
 {
    const int N = KERNEL_SIZE;
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            B[i*Brs+j] = A[i*N+j];
 }

 static void pack_col_major(float *const restrict B, const float *const restrict A, int const Brs)
 {
    const int N = KERNEL_SIZE;
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            B[j*Brs+i] = A[i*N+j];
 }
 #endif

 Matrix *mat_strassen_mul_to(Matrix *C, Matrix *A, Matrix *B) {
    mat_use_check(C);
    mat_use_check(A);
    mat_use_check(B);
    assert(C->n == A->n && C->n == B->n);
    assert(C->n >= KERNEL_SIZE);
    if (TRANSPOSE_B)
        mat_transpose(B);
    if (C->n <= 64) {
        printf("A\n");
        mat_print(A);
        printf("B\n");
        mat_print(B);
    }

    mat_strassen_mul_impl_(C, A, B, NULL);

    if (C->n <= 64) {
        printf("C\n");
        mat_print(C);
    }

    if (TRANSPOSE_B)
        mat_transpose(B);
    return C;
 }

 _nodiscard_ Matrix *mat_strassen_mul(Matrix *A, Matrix *B) {
    mat_use_check(A);
    mat_use_check(B);
    Matrix *C = mat_create(A->n);
    mat_strassen_mul_to(C, A, B);
    return C;
 }

 //
 // Addition/subtraction
 //

 Matrix *mat_sum_to(Matrix *C, Matrix *A, Matrix *B) {
    mat_use_check(C);
    mat_use_check(A);
    mat_use_check(B);
    assert(C->n == A->n && C->n == B->n);
    float *restrict c = C->ptr, * restrict a = A->ptr, * restrict b = B->ptr;
    int const N = A->n;
    int const Ars = A->row_stride, Brs = B->row_stride, Crs = C->row_stride;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j)
            c[i*Crs+j] = a[i*Ars+j] + b[i*Brs+j];
    }
    free_all_temp(A, B, NULL);
    return C;
 }

 _nodiscard_ Matrix *mat_sum(Matrix *A, Matrix *B) {
    return mat_sum_to(mat_create(A->n), A, B);
 }

 Matrix *mat_add_to(Matrix *C, Matrix *B) {
    mat_use_check(C);
    mat_use_check(B);
    assert(C->n == B->n);
    float *restrict c = C->ptr, *restrict b = B->ptr;
    int const N = C->n;
    int const Brs = B->row_stride, Crs = C->row_stride;
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            c[i*Crs+j] += b[i*Brs+j];
    free_temp(B);
    return C;
 }

 Matrix *mat_diff_to(Matrix *C, Matrix *A, Matrix *B) {
    mat_use_check(C);
    mat_use_check(A);
    mat_use_check(B);
    assert(C->n == A->n && C->n == B->n);
    int const N = A->n, Ars = A->row_stride, Brs = B->row_stride, Crs = C->row_stride;
    float *restrict c = C->ptr, *restrict a = A->ptr, *restrict b = B->ptr;
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            c[i*Crs+j] = a[i*Ars+j] - b[i*Brs+j];

    free_all_temp(A, B, NULL);
    return C;
 }

 _nodiscard_ Matrix *mat_diff(Matrix *A, Matrix *B) {
    return mat_diff_to(mat_create(A->n), A, B);
 }

 Matrix *mat_sub_from(Matrix *C, Matrix *B) {
    mat_use_check(C);
    mat_use_check(B);
    assert(C->n == B->n);
    float *restrict c = C->ptr, *restrict b = B->ptr;
    int const N = C->n, Brs = B->row_stride, Crs = C->row_stride;
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            c[i*Crs+j] -= b[i*Brs+j];

    free_temp(B);
    return C;
 }

 //
 // Misc Value Setting
 //

 _nodiscard_ size_t mat_num_bytes(const Matrix *A) {
    mat_use_check(A);
    return A ? sizeof(*A) + sizeof(float) * A->n * A->row_stride : 0;
 }

 Matrix *mat_zero(Matrix *M) {
    mat_use_check(M);
    int const N = M->n;
    float *restrict m = M->ptr;
    for (int i = 0; i < N; ++i) {
        memset(m, 0, sizeof(float) * N);
        m += M->row_stride;
    }
    return M;
 }

 _nodiscard_ Matrix *mat_zeroed(int n) { return mat_zero(mat_create(n)); }

 Matrix *mat_randomize(Matrix *M) {
    mat_use_check(M);
    float *restrict m = M->ptr;
    const int N = M->n, Mrs = M->row_stride;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) 
            m[i*Mrs+j] = -1. + 2.*((float)rand())/RAND_MAX;
    }
    return M;
 }

 _nodiscard_ Matrix *mat_randomized(int n) { return mat_randomize(mat_create(n)); }

 Matrix *mat_row_seq(Matrix *M) {
    mat_use_check(M);
    mat_zero(M);
    float *restrict m = M->ptr;
    const int N = M->n;
    for (int i = 0; i < N; ++i)
        m[i] = i;
    return M;
 }

 Matrix *mat_col_seq(Matrix *M) {
    mat_use_check(M);
    mat_zero(M);
    float *restrict m = M->ptr;
    const int N = M->n, Mrs = M->row_stride;
    for (int i = 0; i < N; ++i)
        m[i*Mrs] = i;
    return M;
 }

 Matrix *mat_transpose(Matrix *M) {
    mat_use_check(M);
    const int N = M->n, Mrs = M->row_stride;
    float *const restrict m = M->ptr;
    for (int i = 0; i < N; ++i) {
        for (int j = i+1; j < N; ++j) {
            float a = m[i*Mrs+j];
            m[i*Mrs+j] = m[j*Mrs+i];
            m[j*Mrs+i] = a;
        }
    }
    return M;
 }

 Matrix *mat_copy_to(Matrix *M, Matrix *A) {
    mat_use_check(M);
    mat_use_check(A);
    assert(M->n == A->n);
    if (mat_same_layouts(M, A)) {
        memcpy(M->ptr, A->ptr, mat_num_bytes(M));
    } else {
        float *restrict m = M->ptr, *restrict a = A->ptr;
        int const N = M->n, Ars = A->row_stride, Mrs = M->row_stride;
        for (int i = 0; i < N; ++i)
            for (int j = 0; j < N; ++j)
                m[i*Mrs+j] = a[i*Ars+j];
    }
    free_temp(A);
    return M;
 }

 //
 // Matrix Creation/Destruction
 //

 //! A modifier used to pass a temporary matrix as a matrix argument - the
 //! called function will treat this matrix as a temporary one and free it when
 //! it returns.
 Matrix *temp(Matrix *M) {
    mat_use_check(M);
    assert(M->tmp_count >= 0);
    M->tmp_count ++;
    return M;
 }

 inline size_t mat_alloc_size(const int n) {
    return sizeof(Matrix) + sizeof(float) * n * n;
 }

 __attribute__((noreturn)) static void out_of_memory(void) {
    fprintf(stderr, "Out of memory\n");
    fflush(stderr);
    abort();
 }

 _nodiscard_ Matrix *mat_create(int const n) {
    size_t const bytes = mat_alloc_size(n);
    Matrix *const M = malloc(bytes);
    if (TRACK_ALLOCS) {
        printf("Allocating (%ld) %dx%d %p\n", obj_track(TE_COUNT, NULL, 0), n, n, M);    
        fflush(stdout);
    }
    if (!M) out_of_memory();
    bool ok = obj_track(TE_CREATE, M, bytes);
    assert(ok);

    M->shown = NULL;
    M->ptr = (void*)(M+1);
    M->n = n;
    M->row_stride = n;
    M->tmp_count = 0;
    return M;
 }

 void mat_free(Matrix **M) {
    Matrix *mp = *M;
    if (!mp || mp->shown) return;
    mat_use_check(mp);
    const size_t bytes = mat_alloc_size(mat_shown(mp)->n);
    if (TRACK_ALLOCS) {
        printf("Freeing %s (%ld) %dx%d %p\n",
            mp->shown ? "View" : "Matrix", obj_track(TE_COUNT, NULL, 0), 
            mp->n, mp->n, mp);
        fflush(stdout);
    }
    free(mp);
    bool ok = obj_track(TE_DESTROY, mp, bytes);
    assert(ok);
    *M = mp = NULL;
 }

 void free_all(Matrix **M, ...) {
    va_list args;
    va_start(args, M); 
    while (M) {
        mat_free(M);
        M = va_arg(args, Matrix **);
    }
    va_end(args);
 }

 void free_temp(Matrix *M) {
    if (!M) return;
    if (!M->tmp_count) return;
    assert(M->tmp_count > 0);
    if (!--M->tmp_count)
        mat_free(&M);
 }

 void free_all_temp(Matrix *M, ...) {
    va_list args;
    va_start(args, M);
    while(M) {
        free_temp(M);
        M = va_arg(args, Matrix *);
    }
    va_end(args);
 }

 //
 // Matrix Query and Output
 //

 static _nodiscard_ bool mat_same_layouts(const Matrix *A, const Matrix *B) {
    mat_use_check(A);
    mat_use_check(B);
    return A->n == B->n && A->row_stride == B->row_stride;
 }

 void mat_print(const Matrix *M) {
    mat_use_check(M);
    float *m = M->ptr;
    for (int i = 0; i < M->n; ++i) {
        for (int j = 0; j < M->n; ++j) printf("%.0f ", m[j]);
        printf("\n");
        m += M->row_stride;
    }
    fflush(stdout);
 }

 void mat_dump(const Matrix *M) {
    mat_use_check(M);
    if (!TRACK_DUMPS) return;
    if (!M) {
        printf("Null\n");
    } else {
        const char *kind = !M->shown ? "Matrix" : "View";
        printf("%s %dx%d <->%d %p", kind, M->n, M->n, M->row_stride, M->ptr);
        if (M->shown) printf(" ..%p", M->shown->ptr);
        printf("\n");
    }
    fflush(stdout);
 }

 //
 // Views of a Matrix
 //

 static void track_view(const Matrix *V, const char *kind) {
    if (TRACK_VIEWS) {
        printf("New %s %dx%d <->%d %p\n", kind, V->n, V->n, V->row_stride, V->ptr);
        fflush(stdout);
    }
 }

 //! Returns a sub-block *view* of a given matrix. The block's index is i,j (0-based),
 //! out of an nxn square of blocks. Thew view is by-value and is meant to be
 //! kept by value. It doesn't allocate.
 _nodiscard_ Matrix mat_block_view(const Matrix *M, int i, int j, int nbl) {
    mat_use_check(M);
    const Matrix *shown = mat_shown(M);

    Matrix view = { .shown = (Matrix*)shown };
    view.n = M->n / nbl;
    view.row_stride = M->row_stride;
    view.ptr = M->ptr + ((size_t)i * view.n * view.row_stride) + ((size_t)j * view.n);
    track_view(&view, "View");
    return view;
 }

 //! Returns a sub-block linearized view of a given matrix, i.e. the sub-blocks
 //! have the smallest possible row_stride. Useful for cache locality when reusing
 //! temporary allocations. The source matrix must be contiguous, i.e. it can't be
 //! a mat_block_view.
 _nodiscard_ Matrix mat_linear_block_view(const Matrix *M, int i, int j, int nbl) {
    mat_use_check(M);
    assert(M->row_stride == M->n);
    const Matrix *shown = mat_shown(M);

    Matrix view = { .shown = (Matrix*)shown };
    view.n = M->n / nbl;
    view.row_stride = view.n;
    view.ptr = M->ptr + ((size_t)i * nbl + (size_t)j) * view.n * view.n;
    track_view(&view, "Linear View");
    return view;
 }

 //
 // Example/Test
 //

 typedef struct timeval timeval;

 _nodiscard_ timeval get_time(void) {
    timeval result;
    gettimeofday(&result, 0);
    return result;
 }

 _nodiscard_ double time_delta(const timeval *start, const timeval *end) {
    double const t1 = start->tv_sec + start->tv_usec/1e6;
    double const t2 = end->tv_sec + end->tv_usec/1e6;
    return t2-t1;
 }

 int main()
 {
    size_t const N = 2048;
 #if 1
    Matrix *A = mat_randomize(mat_create(N));
    Matrix *B = mat_randomize(mat_create(N));
 #else
    Matrix *A = mat_row_seq(mat_create(N));
    Matrix *B = mat_col_seq(mat_create(N));
 #endif
    Matrix *C = mat_create(N);
    printf("Memory used for A,B,C matrices: %lu\n", obj_track(TE_ALLOC, NULL, 0));
    
    timeval start = get_time();
    mat_strassen_mul_to(C, A, B);
    timeval end = get_time();

    free_all(&C, &B, &A, NULL);

    printf("Number of entries to Strassen multiplication: %d\n", strassen_entry_count);
    printf("Total wall time: %gs\n", time_delta(&start, &end));
    printf("Maximum allocated matrices/memory: %lu / %lu\n", 
        obj_track(TE_MAX_COUNT, NULL, 0), obj_track(TE_MAX_ALLOC, NULL, 0));
    printf("Matrices left: %lu\n", obj_track(TE_COUNT, NULL, 0));
    assert(obj_track(TE_ISEMPTY, NULL, 0));
 }

 //
 // Diagnostic Object Tracking
 //

 #if TRACK_MEMORY_USE || TRACK_LIFETIME

 struct {
    const void *ptr;
 } typedef ObjEntry;

 struct {
    ObjEntry *objects;
    size_t capacity;
    size_t count;
    size_t alloc;
    size_t max_alloc;
    size_t max_count;
    bool is_sorted;
 } typedef ObjState;

 bool obj_count(const void *obj, size_t size, ObjState *ost);
 bool obj_uncount(const void *obj, size_t size, ObjState *ost);

 bool obj_push_back(const void *obj, size_t size, ObjState *ost);
 bool obj_find(const void *obj, ObjState *ost);
 bool obj_remove(const void *obj, size_t size, ObjState *ost);

 size_t obj_track(enum TrackEvent event, const void *obj, size_t param) {
    static ObjState ost;
    switch (event) {
 #if TRACK_MEMORY_USE && !TRACK_LIFETIME
    case TE_CREATE:
        return obj_count(obj, param, &ost);
    case TE_DESTROY:
        return obj_uncount(obj, param, &ost);
    case TE_USE:
        return !!obj;
 #else
    case TE_CREATE:
        return !obj_find(obj, &ost) && obj_push_back(obj, param, &ost);
    case TE_USE:
        return obj_find(obj, &ost);
    case TE_DESTROY:
        return obj_remove(obj, param, &ost);
 #endif
    case TE_ISEMPTY:
        return !ost.count;
    case TE_COUNT:
        return ost.count;
    case TE_ALLOC:
        return ost.alloc;
    case TE_MAX_COUNT:
        return ost.max_count;
    case TE_MAX_ALLOC:
        return ost.max_alloc;
    default:
        return false;
    }
 }

 bool obj_count(const void *obj, size_t size, ObjState *ost) {
    if (!obj || !size) return false;
    ++ost->count;
    ost->alloc += size;
    if (ost->count > ost->max_count) ost->max_count = ost->count;
    if (ost->alloc > ost->max_alloc) ost->max_alloc = ost->alloc;
    return true;
 }

 bool obj_uncount(const void *obj, size_t size, ObjState *ost) {
    if (!obj || !size) return false;
    --ost->count;
    ost->alloc -= size;
    return true;
 }

 bool obj_push_back(const void *obj, size_t size, ObjState *ost) {
    if (!ost->capacity) {
        ost->capacity = 32;
        ost->objects = malloc(sizeof(ObjEntry) * ost->capacity);
    }
    else if (ost->capacity == ost->count) {
        ost->capacity *= 2;
        ost->objects = realloc(ost->objects, sizeof(ObjEntry) * ost->capacity);
        if (!ost->objects) out_of_memory();
    }
    if (!obj_count(obj, size, ost))
        return false;
    ost->objects[ost->count-1] = (ObjEntry){ .ptr = obj };
    if (ost->count == 1) {
        ost->is_sorted = true;
    } else {
        ObjEntry *second_to_last = &(ost->objects[ost->count - 2]);
        ost->is_sorted = ost->is_sorted && obj > second_to_last->ptr;
    }
    return true;
 }

 static int ptr_comp(const void *a, const void *b) {
    const ObjEntry *obj1 = a;
    const ObjEntry *obj2 = b;
    ssize_t diff = obj1->ptr - obj2->ptr;
    return diff < 0 ? - 1 : diff > 0 ? 1 : 0;
 }

 int obj_lookup(const void *obj, ObjState *ost) {
    if (!ost->is_sorted) {
        qsort(ost->objects, ost->count, sizeof(ObjEntry), ptr_comp);
        ost->is_sorted = true;
    }
    if (ost->count > 1) {
        // Sanity check: the first two objects must be sorted, at least.
        assert(ost->objects[0].ptr < ost->objects[1].ptr);
    }
    const ObjEntry *found = 
        bsearch(&obj, ost->objects, ost->count, sizeof(ObjEntry), ptr_comp);
    return (!found) ? -1 : (found - ost->objects);
 }

 bool obj_find(const void *obj, ObjState *ost) { return obj_lookup(obj, ost) >= 0; }

 bool obj_erase(int pos, size_t size, ObjState *ost) {
    assert(pos >= -1 && pos < ost->count);
    if (pos == -1) return false;
    if (!obj_uncount(ost->objects[pos].ptr, size, ost))
        return false;
    if (pos < (ost->count))
        memmove(ost->objects + pos, ost->objects + pos + 1, 
                sizeof(ObjEntry) * (ost->count - pos));    
    return true;
 }

 bool obj_remove(const void *obj, size_t size, ObjState *ost) {
    int index = obj_lookup(obj, ost);
    return obj_erase(index, size, ost);
 }

 #endif // TRACK_MEMORY_USE || TRACK_LIFETIME
 // complete compileable example ends
	// complete compileable example begins
	#include <assert.h>
	#include <math.h>
	#include <stdarg.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/time.h>

	#define TRACK_MEMORY_USE 1
	#define TRACK_LIFETIME 0
	#define TRACK_VIEWS 0
	#define TRACK_DUMPS 0
	#define TRACK_ALLOCS 1

	#define KERNEL_SIZE 16 // 16-32 is the sweet spot usually
	#define TRANSPOSE_B 1 // enabling it improves things very slightly

	#if defined(__GNUC__) \|\| defined(__clang__)
	#define _nodiscard_ __attribute__((warn_unused_result))
	#else
	#define _nodiscard_
	#endif

	enum TrackEvent {
	TE_CREATE, // param = size of object
	TE_USE,
	TE_DESTROY, // param = size of object
	TE_ISEMPTY,
	TE_COUNT,
	TE_ALLOC,
	TE_MAX_COUNT,
	TE_MAX_ALLOC,
	};

	#if TRACK_MEMORY_USE \|\| TRACK_LIFETIME
	size_t obj_track(enum TrackEvent event, const void *obj, size_t param);
	#else
	size_t obj_track(enum TrackEvent event, const void *obj, size_t param) { return 1; }
	#endif

	#define mat_use_check(M) do { \
	/* A view of a view must still refer directly to the shown matrix. */ \
	assert(!M->shown \|\| !M->shown->shown); \
	assert(obj_track(TE_USE, mat_shown(M), 0)); \
	} while (0)

	struct Matrix {
	struct Matrix *shown; // a matrix being viewed, if any
	float *ptr;
	int n; // rows and columns in this range
	int row_stride; // distance between beginnings of rows (units: elements)
	int16_t tmp_count; // number of active temporary uses of this object
	} typedef Matrix;

	int strassen_entry_count;

	//! Returns the matrix being shown if M is a view, otherwise return M itself
	inline const Matrix mat_shown(const Matrix M) {
	return M->shown ? M->shown : M;
	}

	Matrix *mat_create(int n);
	void free_all(Matrix **M1, ...);
	void free_temp(Matrix *M);
	void free_all_temp(Matrix *M1, ...);
	static bool mat_same_layouts(const Matrix A, const Matrix B);
	void mat_print(const Matrix *M);

	Matrix mat_transpose(Matrix M);
	Matrix mat_block_view(const Matrix *matrix, int i, int j, int nbl);
	Matrix mat_linear_block_view(const Matrix *matrix, int i, int j, int nbl);

	Matrix mat_add_to(Matrix C, Matrix *A);
	Matrix mat_sub_from(Matrix C, Matrix *A);
	Matrix mat_sum_to(Matrix C, Matrix A, Matrix B);
	Matrix mat_diff_to(Matrix C, Matrix A, Matrix B);

	//
	// Multiplication
	//

	static void mat_mul_impl_(float restrict C, const float restrict A, const float *restrict B,
	int C_row_stride, int A_row_stride, int B_row_stride);

	Matrix mat_strassen_mul_impl_(Matrix C, Matrix A, Matrix B, const Matrix *T) {
	++ strassen_entry_count;
	mat_use_check(C);
	mat_use_check(A);
	mat_use_check(B);
	if (T) mat_use_check(T);
	int const N = C->n;
	assert(N >= KERNEL_SIZE && N == A->n && N == B->n && (!T \|\| N <= T->n));

	if (N == KERNEL_SIZE) {
	mat_mul_impl_(C->ptr, A->ptr, B->ptr, C->row_stride, A->row_stride, B->row_stride);
	} else {
	Matrix A11 = mat_block_view(A, 0, 0, 2);
	Matrix A12 = mat_block_view(A, 0, 1, 2);
	Matrix A21 = mat_block_view(A, 1, 0, 2);
	Matrix A22 = mat_block_view(A, 1, 1, 2);
	Matrix B11 = mat_block_view(B, 0, 0, 2);
	#if TRANSPOSE_B
	Matrix B12 = mat_block_view(B, 1, 0, 2); // transposed
	Matrix B21 = mat_block_view(B, 0, 1, 2); // transposed
	#else
	Matrix B12 = mat_block_view(B, 0, 1, 2);
	Matrix B21 = mat_block_view(B, 1, 0, 2);
	#endif
	Matrix B22 = mat_block_view(B, 1, 1, 2);
	Matrix C11 = mat_block_view(C, 0, 0, 2);
	Matrix C12 = mat_block_view(C, 0, 1, 2);
	Matrix C21 = mat_block_view(C, 1, 0, 2);
	Matrix C22 = mat_block_view(C, 1, 1, 2);

	// T1 == C12, T2 == C21, T3,T4,T5 : new
	// C11 = (M7) = (A12-A22) * (B21+B22) // lease T3
	// C22 = (M6) = (A21-A11) * (B11+B12) // lease T3
	// T4 = (M1) = (A11+A22) * (B11+B22) // lease T3
	// C11 = M7 + M1
	// C22 = M6 + M1

	// C12 = (M5) = (A11+A12) * B22 // lease T3
	// C11 = M7 + M1 - M5
	// C21 = (M2) = (A21+A22) * B11 // lease T3
	// C22 = M6 + M1 - M2

	// T4 = (M3) = A11 * (B12-B22) // lease T3
	// C12 = M5 + M3
	// C22 = M6 + M1 - M2 + M3
	// T4 = (M4) = A22 * (B21-B11) // lease T3
	// C11 = M7 + M1 - M5 + M4
	// C21 = M2 + M4

	Matrix T3_, T4_, T5_, T3 = NULL, T4 = NULL, *T5 = NULL;
	if (T) {
	T3_ = mat_linear_block_view(T, 0, 0, 2);
	T4_ = mat_linear_block_view(T, 0, 1, 2);
	T5_ = mat_linear_block_view(T, 1, 0, 2);
	T3 = &T3_;
	T4 = &T4_;
	T5 = &T5_;
	} else {
	T3 = mat_create(A11.n);
	T4 = mat_create(A11.n);
	T5 = mat_create(A11.n);
	}
	{
	Matrix *M1 = &C12;
	/M7/ mat_strassen_mul_impl_(&C11, mat_diff_to(T4, &A12, &A22), mat_sum_to(T5, &B21, &B22), T3);
	/M6/ mat_strassen_mul_impl_(&C22, mat_diff_to(T4, &A21, &A11), mat_sum_to(T5, &B11, &B12), T3);
	/M1/ mat_strassen_mul_impl_(M1, mat_sum_to(T4, &A11, &A22), mat_sum_to(T5, &B11, &B22), T3);
	mat_add_to(&C11, M1);
	mat_add_to(&C22, M1);
	}
	{
	Matrix *M5 = mat_strassen_mul_impl_(&C12, mat_sum_to(T5, &A11, &A12), &B22, T3);
	mat_sub_from(&C11, M5);
	Matrix *M2 = mat_strassen_mul_impl_(&C21, mat_sum_to(T5, &A21, &A22), &B11, T3);
	mat_sub_from(&C22, M2);
	}
	{
	Matrix *M3 = mat_strassen_mul_impl_(T4, &A11, mat_diff_to(T5, &B12, &B22), T3);
	mat_add_to(&C12, M3);
	mat_add_to(&C22, M3);
	}
	{
	Matrix *M4 = mat_strassen_mul_impl_(T4, &A22, mat_diff_to(T5, &B21, &B11), T3);
	mat_add_to(&C11, M4);
	mat_add_to(&C21, M4);
	}
	free_all(&T3, &T4, &T5, NULL);
	}
	free_all_temp(A, B, T, NULL);
	return C;
	}

	static void unpack_row_major(float const restrict B, const float const restrict A, int const Ars);
	static void unpack_col_major(float const restrict B, const float const restrict A, int const Ars);
	#if 0
	static void pack_row_major(float const restrict B, const float const restrict A, int const Brs);
	static void pack_col_major(float const restrict B, const float const restrict A, int const Brs);
	#endif

	static void mat_mul_impl_(float restrict C, const float restrict A, const float *restrict B,
	int C_row_stride, int A_row_stride, int B_row_stride)
	{
	enum { N = KERNEL_SIZE };
	float AA[NN], BB[NN];

	unpack_row_major(AA, A, A_row_stride);

	if (TRANSPOSE_B)
	unpack_row_major(BB, B, B_row_stride);
	else
	unpack_col_major(BB, B, B_row_stride);

	for (int i = 0; i < N; ++i) {
	for (int j = 0; j < N; ++j) {//00
	float accum = 0;
	for (int k = 0; k < N; ++k) {
	accum += AA[iN+k] BB[j*N+k];
	}
	C[i*C_row_stride+j] = accum;
	}
	}
	}

	static void unpack_row_major(float const restrict B, const float const restrict A, int const Ars)
	{
	const int N = KERNEL_SIZE;
	for (int i = 0; i < N; ++i)
	for (int j = 0; j < N; ++j)
	B[iN+j] = A[iArs+j];
	}

	static void unpack_col_major(float const restrict B, const float const restrict A, int const Ars)
	{
	const int N = KERNEL_SIZE;
	for (int i = 0; i < N; ++i)
	for (int j = 0; j < N; ++j)
	B[iN+j] = A[jArs+i];
	}

	#if 0
	static void pack_row_major(float const restrict B, const float const restrict A, int const Brs)
	{
	const int N = KERNEL_SIZE;
	for (int i = 0; i < N; ++i)
	for (int j = 0; j < N; ++j)
	B[iBrs+j] = A[iN+j];
	}

	static void pack_col_major(float const restrict B, const float const restrict A, int const Brs)
	{
	const int N = KERNEL_SIZE;
	for (int i = 0; i < N; ++i)
	for (int j = 0; j < N; ++j)
	B[jBrs+i] = A[iN+j];
	}
	#endif

	Matrix mat_strassen_mul_to(Matrix C, Matrix A, Matrix B) {
	mat_use_check(C);
	mat_use_check(A);
	mat_use_check(B);
	assert(C->n == A->n && C->n == B->n);
	assert(C->n >= KERNEL_SIZE);
	if (TRANSPOSE_B)
	mat_transpose(B);
	if (C->n <= 64) {
	printf("A\n");
	mat_print(A);
	printf("B\n");
	mat_print(B);
	}

	mat_strassen_mul_impl_(C, A, B, NULL);

	if (C->n <= 64) {
	printf("C\n");
	mat_print(C);
	}

	if (TRANSPOSE_B)
	mat_transpose(B);
	return C;
	}

	_nodiscard_ Matrix mat_strassen_mul(Matrix A, Matrix *B) {
	mat_use_check(A);
	mat_use_check(B);
	Matrix *C = mat_create(A->n);
	mat_strassen_mul_to(C, A, B);
	return C;
	}

	//
	// Addition/subtraction
	//

	Matrix mat_sum_to(Matrix C, Matrix A, Matrix B) {
	mat_use_check(C);
	mat_use_check(A);
	mat_use_check(B);
	assert(C->n == A->n && C->n == B->n);
	float restrict c = C->ptr, restrict a = A->ptr, * restrict b = B->ptr;
	int const N = A->n;
	int const Ars = A->row_stride, Brs = B->row_stride, Crs = C->row_stride;
	for (int i = 0; i < N; ++i) {
	for (int j = 0; j < N; ++j)
	c[iCrs+j] = a[iArs+j] + b[i*Brs+j];
	}
	free_all_temp(A, B, NULL);
	return C;
	}

	_nodiscard_ Matrix mat_sum(Matrix A, Matrix *B) {
	return mat_sum_to(mat_create(A->n), A, B);
	}

	Matrix mat_add_to(Matrix C, Matrix *B) {
	mat_use_check(C);
	mat_use_check(B);
	assert(C->n == B->n);
	float restrict c = C->ptr, restrict b = B->ptr;
	int const N = C->n;
	int const Brs = B->row_stride, Crs = C->row_stride;
	for (int i = 0; i < N; ++i)
	for (int j = 0; j < N; ++j)
	c[iCrs+j] += b[iBrs+j];
	free_temp(B);
	return C;
	}

	Matrix mat_diff_to(Matrix C, Matrix A, Matrix B) {
	mat_use_check(C);
	mat_use_check(A);
	mat_use_check(B);
	assert(C->n == A->n && C->n == B->n);
	int const N = A->n, Ars = A->row_stride, Brs = B->row_stride, Crs = C->row_stride;
	float restrict c = C->ptr, restrict a = A->ptr, *restrict b = B->ptr;
	for (int i = 0; i < N; ++i)
	for (int j = 0; j < N; ++j)
	c[iCrs+j] = a[iArs+j] - b[i*Brs+j];

	free_all_temp(A, B, NULL);
	return C;
	}

	_nodiscard_ Matrix mat_diff(Matrix A, Matrix *B) {
	return mat_diff_to(mat_create(A->n), A, B);
	}

	Matrix mat_sub_from(Matrix C, Matrix *B) {
	mat_use_check(C);
	mat_use_check(B);
	assert(C->n == B->n);
	float restrict c = C->ptr, restrict b = B->ptr;
	int const N = C->n, Brs = B->row_stride, Crs = C->row_stride;
	for (int i = 0; i < N; ++i)
	for (int j = 0; j < N; ++j)
	c[iCrs+j] -= b[iBrs+j];

	free_temp(B);
	return C;
	}

	//
	// Misc Value Setting
	//

	_nodiscard_ size_t mat_num_bytes(const Matrix *A) {
	mat_use_check(A);
	return A ? sizeof(A) + sizeof(float) A->n * A->row_stride : 0;
	}

	Matrix mat_zero(Matrix M) {
	mat_use_check(M);
	int const N = M->n;
	float *restrict m = M->ptr;
	for (int i = 0; i < N; ++i) {
	memset(m, 0, sizeof(float) * N);
	m += M->row_stride;
	}
	return M;
	}

	_nodiscard_ Matrix *mat_zeroed(int n) { return mat_zero(mat_create(n)); }

	Matrix mat_randomize(Matrix M) {
	mat_use_check(M);
	float *restrict m = M->ptr;
	const int N = M->n, Mrs = M->row_stride;
	for (int i = 0; i < N; ++i) {
	for (int j = 0; j < N; ++j)
	m[iMrs+j] = -1. + 2.((float)rand())/RAND_MAX;
	}
	return M;
	}

	_nodiscard_ Matrix *mat_randomized(int n) { return mat_randomize(mat_create(n)); }

	Matrix mat_row_seq(Matrix M) {
	mat_use_check(M);
	mat_zero(M);
	float *restrict m = M->ptr;
	const int N = M->n;
	for (int i = 0; i < N; ++i)
	m[i] = i;
	return M;
	}

	Matrix mat_col_seq(Matrix M) {
	mat_use_check(M);
	mat_zero(M);
	float *restrict m = M->ptr;
	const int N = M->n, Mrs = M->row_stride;
	for (int i = 0; i < N; ++i)
	m[i*Mrs] = i;
	return M;
	}

	Matrix mat_transpose(Matrix M) {
	mat_use_check(M);
	const int N = M->n, Mrs = M->row_stride;
	float *const restrict m = M->ptr;
	for (int i = 0; i < N; ++i) {
	for (int j = i+1; j < N; ++j) {
	float a = m[i*Mrs+j];
	m[iMrs+j] = m[jMrs+i];
	m[j*Mrs+i] = a;
	}
	}
	return M;
	}

	Matrix mat_copy_to(Matrix M, Matrix *A) {
	mat_use_check(M);
	mat_use_check(A);
	assert(M->n == A->n);
	if (mat_same_layouts(M, A)) {
	memcpy(M->ptr, A->ptr, mat_num_bytes(M));
	} else {
	float restrict m = M->ptr, restrict a = A->ptr;
	int const N = M->n, Ars = A->row_stride, Mrs = M->row_stride;
	for (int i = 0; i < N; ++i)
	for (int j = 0; j < N; ++j)
	m[iMrs+j] = a[iArs+j];
	}
	free_temp(A);
	return M;
	}

	//
	// Matrix Creation/Destruction
	//

	//! A modifier used to pass a temporary matrix as a matrix argument - the
	//! called function will treat this matrix as a temporary one and free it when
	//! it returns.
	Matrix temp(Matrix M) {
	mat_use_check(M);
	assert(M->tmp_count >= 0);
	M->tmp_count ++;
	return M;
	}

	inline size_t mat_alloc_size(const int n) {
	return sizeof(Matrix) + sizeof(float) * n * n;
	}

	__attribute__((noreturn)) static void out_of_memory(void) {
	fprintf(stderr, "Out of memory\n");
	fflush(stderr);
	abort();
	}

	_nodiscard_ Matrix *mat_create(int const n) {
	size_t const bytes = mat_alloc_size(n);
	Matrix *const M = malloc(bytes);
	if (TRACK_ALLOCS) {
	printf("Allocating (%ld) %dx%d %p\n", obj_track(TE_COUNT, NULL, 0), n, n, M);
	fflush(stdout);
	}
	if (!M) out_of_memory();
	bool ok = obj_track(TE_CREATE, M, bytes);
	assert(ok);

	M->shown = NULL;
	M->ptr = (void*)(M+1);
	M->n = n;
	M->row_stride = n;
	M->tmp_count = 0;
	return M;
	}

	void mat_free(Matrix **M) {
	Matrix mp = M;
	if (!mp \|\| mp->shown) return;
	mat_use_check(mp);
	const size_t bytes = mat_alloc_size(mat_shown(mp)->n);
	if (TRACK_ALLOCS) {
	printf("Freeing %s (%ld) %dx%d %p\n",
	mp->shown ? "View" : "Matrix", obj_track(TE_COUNT, NULL, 0),
	mp->n, mp->n, mp);
	fflush(stdout);
	}
	free(mp);
	bool ok = obj_track(TE_DESTROY, mp, bytes);
	assert(ok);
	*M = mp = NULL;
	}

	void free_all(Matrix **M, ...) {
	va_list args;
	va_start(args, M);
	while (M) {
	mat_free(M);
	M = va_arg(args, Matrix **);
	}
	va_end(args);
	}

	void free_temp(Matrix *M) {
	if (!M) return;
	if (!M->tmp_count) return;
	assert(M->tmp_count > 0);
	if (!--M->tmp_count)
	mat_free(&M);
	}

	void free_all_temp(Matrix *M, ...) {
	va_list args;
	va_start(args, M);
	while(M) {
	free_temp(M);
	M = va_arg(args, Matrix *);
	}
	va_end(args);
	}

	//
	// Matrix Query and Output
	//

	static _nodiscard_ bool mat_same_layouts(const Matrix A, const Matrix B) {
	mat_use_check(A);
	mat_use_check(B);
	return A->n == B->n && A->row_stride == B->row_stride;
	}

	void mat_print(const Matrix *M) {
	mat_use_check(M);
	float *m = M->ptr;
	for (int i = 0; i < M->n; ++i) {
	for (int j = 0; j < M->n; ++j) printf("%.0f ", m[j]);
	printf("\n");
	m += M->row_stride;
	}
	fflush(stdout);
	}

	void mat_dump(const Matrix *M) {
	mat_use_check(M);
	if (!TRACK_DUMPS) return;
	if (!M) {
	printf("Null\n");
	} else {
	const char *kind = !M->shown ? "Matrix" : "View";
	printf("%s %dx%d <->%d %p", kind, M->n, M->n, M->row_stride, M->ptr);
	if (M->shown) printf(" ..%p", M->shown->ptr);
	printf("\n");
	}
	fflush(stdout);
	}

	//
	// Views of a Matrix
	//

	static void track_view(const Matrix V, const char kind) {
	if (TRACK_VIEWS) {
	printf("New %s %dx%d <->%d %p\n", kind, V->n, V->n, V->row_stride, V->ptr);
	fflush(stdout);
	}
	}

	//! Returns a sub-block view of a given matrix. The block's index is i,j (0-based),
	//! out of an nxn square of blocks. Thew view is by-value and is meant to be
	//! kept by value. It doesn't allocate.
	_nodiscard_ Matrix mat_block_view(const Matrix *M, int i, int j, int nbl) {
	mat_use_check(M);
	const Matrix *shown = mat_shown(M);

	Matrix view = { .shown = (Matrix*)shown };
	view.n = M->n / nbl;
	view.row_stride = M->row_stride;
	view.ptr = M->ptr + ((size_t)i * view.n * view.row_stride) + ((size_t)j * view.n);
	track_view(&view, "View");
	return view;
	}

	//! Returns a sub-block linearized view of a given matrix, i.e. the sub-blocks
	//! have the smallest possible row_stride. Useful for cache locality when reusing
	//! temporary allocations. The source matrix must be contiguous, i.e. it can't be
	//! a mat_block_view.
	_nodiscard_ Matrix mat_linear_block_view(const Matrix *M, int i, int j, int nbl) {
	mat_use_check(M);
	assert(M->row_stride == M->n);
	const Matrix *shown = mat_shown(M);

	Matrix view = { .shown = (Matrix*)shown };
	view.n = M->n / nbl;
	view.row_stride = view.n;
	view.ptr = M->ptr + ((size_t)i * nbl + (size_t)j) * view.n * view.n;
	track_view(&view, "Linear View");
	return view;
	}

	//
	// Example/Test
	//

	typedef struct timeval timeval;

	_nodiscard_ timeval get_time(void) {
	timeval result;
	gettimeofday(&result, 0);
	return result;
	}

	_nodiscard_ double time_delta(const timeval start, const timeval end) {
	double const t1 = start->tv_sec + start->tv_usec/1e6;
	double const t2 = end->tv_sec + end->tv_usec/1e6;
	return t2-t1;
	}

	int main()
	{
	size_t const N = 2048;
	#if 1
	Matrix *A = mat_randomize(mat_create(N));
	Matrix *B = mat_randomize(mat_create(N));
	#else
	Matrix *A = mat_row_seq(mat_create(N));
	Matrix *B = mat_col_seq(mat_create(N));
	#endif
	Matrix *C = mat_create(N);
	printf("Memory used for A,B,C matrices: %lu\n", obj_track(TE_ALLOC, NULL, 0));

	timeval start = get_time();
	mat_strassen_mul_to(C, A, B);
	timeval end = get_time();

	free_all(&C, &B, &A, NULL);

	printf("Number of entries to Strassen multiplication: %d\n", strassen_entry_count);
	printf("Total wall time: %gs\n", time_delta(&start, &end));
	printf("Maximum allocated matrices/memory: %lu / %lu\n",
	obj_track(TE_MAX_COUNT, NULL, 0), obj_track(TE_MAX_ALLOC, NULL, 0));
	printf("Matrices left: %lu\n", obj_track(TE_COUNT, NULL, 0));
	assert(obj_track(TE_ISEMPTY, NULL, 0));
	}

	//
	// Diagnostic Object Tracking
	//

	#if TRACK_MEMORY_USE \|\| TRACK_LIFETIME

	struct {
	const void *ptr;
	} typedef ObjEntry;

	struct {
	ObjEntry *objects;
	size_t capacity;
	size_t count;
	size_t alloc;
	size_t max_alloc;
	size_t max_count;
	bool is_sorted;
	} typedef ObjState;

	bool obj_count(const void obj, size_t size, ObjState ost);
	bool obj_uncount(const void obj, size_t size, ObjState ost);

	bool obj_push_back(const void obj, size_t size, ObjState ost);
	bool obj_find(const void obj, ObjState ost);
	bool obj_remove(const void obj, size_t size, ObjState ost);

	size_t obj_track(enum TrackEvent event, const void *obj, size_t param) {
	static ObjState ost;
	switch (event) {
	#if TRACK_MEMORY_USE && !TRACK_LIFETIME
	case TE_CREATE:
	return obj_count(obj, param, &ost);
	case TE_DESTROY:
	return obj_uncount(obj, param, &ost);
	case TE_USE:
	return !!obj;
	#else
	case TE_CREATE:
	return !obj_find(obj, &ost) && obj_push_back(obj, param, &ost);
	case TE_USE:
	return obj_find(obj, &ost);
	case TE_DESTROY:
	return obj_remove(obj, param, &ost);
	#endif
	case TE_ISEMPTY:
	return !ost.count;
	case TE_COUNT:
	return ost.count;
	case TE_ALLOC:
	return ost.alloc;
	case TE_MAX_COUNT:
	return ost.max_count;
	case TE_MAX_ALLOC:
	return ost.max_alloc;
	default:
	return false;
	}
	}

	bool obj_count(const void obj, size_t size, ObjState ost) {
	if (!obj \|\| !size) return false;
	++ost->count;
	ost->alloc += size;
	if (ost->count > ost->max_count) ost->max_count = ost->count;
	if (ost->alloc > ost->max_alloc) ost->max_alloc = ost->alloc;
	return true;
	}

	bool obj_uncount(const void obj, size_t size, ObjState ost) {
	if (!obj \|\| !size) return false;
	--ost->count;
	ost->alloc -= size;
	return true;
	}

	bool obj_push_back(const void obj, size_t size, ObjState ost) {
	if (!ost->capacity) {
	ost->capacity = 32;
	ost->objects = malloc(sizeof(ObjEntry) * ost->capacity);
	}
	else if (ost->capacity == ost->count) {
	ost->capacity *= 2;
	ost->objects = realloc(ost->objects, sizeof(ObjEntry) * ost->capacity);
	if (!ost->objects) out_of_memory();
	}
	if (!obj_count(obj, size, ost))
	return false;
	ost->objects[ost->count-1] = (ObjEntry){ .ptr = obj };
	if (ost->count == 1) {
	ost->is_sorted = true;
	} else {
	ObjEntry *second_to_last = &(ost->objects[ost->count - 2]);
	ost->is_sorted = ost->is_sorted && obj > second_to_last->ptr;
	}
	return true;
	}

	static int ptr_comp(const void a, const void b) {
	const ObjEntry *obj1 = a;
	const ObjEntry *obj2 = b;
	ssize_t diff = obj1->ptr - obj2->ptr;
	return diff < 0 ? - 1 : diff > 0 ? 1 : 0;
	}

	int obj_lookup(const void obj, ObjState ost) {
	if (!ost->is_sorted) {
	qsort(ost->objects, ost->count, sizeof(ObjEntry), ptr_comp);
	ost->is_sorted = true;
	}
	if (ost->count > 1) {
	// Sanity check: the first two objects must be sorted, at least.
	assert(ost->objects[0].ptr < ost->objects[1].ptr);
	}
	const ObjEntry *found =
	bsearch(&obj, ost->objects, ost->count, sizeof(ObjEntry), ptr_comp);
	return (!found) ? -1 : (found - ost->objects);
	}

	bool obj_find(const void obj, ObjState ost) { return obj_lookup(obj, ost) >= 0; }

	bool obj_erase(int pos, size_t size, ObjState *ost) {
	assert(pos >= -1 && pos < ost->count);
	if (pos == -1) return false;
	if (!obj_uncount(ost->objects[pos].ptr, size, ost))
	return false;
	if (pos < (ost->count))
	memmove(ost->objects + pos, ost->objects + pos + 1,
	sizeof(ObjEntry) * (ost->count - pos));
	return true;
	}

	bool obj_remove(const void obj, size_t size, ObjState ost) {
	int index = obj_lookup(obj, ost);
	return obj_erase(index, size, ost);
	}

	#endif // TRACK_MEMORY_USE \|\| TRACK_LIFETIME
	// complete compileable example ends