Skip to content

Instantly share code, notes, and snippets.

@brandonwillard
Last active May 14, 2024 22:40
Show Gist options
  • Save brandonwillard/0302f061fc9f89a65b5517170347a6f0 to your computer and use it in GitHub Desktop.
Save brandonwillard/0302f061fc9f89a65b5517170347a6f0 to your computer and use it in GitHub Desktop.
Numba CAReduce Performance MWE
Display the source blob
Display the rendered blob
Raw
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Benchmark matrix dimensions: NROWS x NCOLS doubles (~229 MiB of payload).
const int NROWS = 5000;
const int NCOLS = 6000;


inline double custom_max(double x, double y) {
  if (x < y) {
    return y;
  } else {
    return x;
  }
}

/*
 * Max-reduce a NROWS x NCOLS matrix along axis 1 (columns), calling the
 * `custom_max` helper per element.
 *
 * Returns a heap-allocated array of NROWS maxima; the caller owns it and
 * must free() it.  Exits the process on allocation failure.
 */
double * max_reduce_axis_1(double ** x) {

  double * res = (double *) malloc(NROWS * sizeof(double));

  /* Check the allocation BEFORE touching the buffer: the original code
     initialized `res` first, dereferencing NULL on failure. */
  if (res == NULL)
    exit(1);

  for (int i = 0; i < NROWS; i++)
    res[i] = -INFINITY;

  for (int i = 0; i < NROWS; i++) {
    for (int j = 0; j < NCOLS; j++) {
      res[i] = custom_max(res[i], x[i][j]);
    }
  }

  return res;
}

inline void custom_max_inplace(double *res, int i, double y) {
  if (res[i] < y)
    res[i] = y;
}

/*
 * Same reduction as `max_reduce_axis_1`, but updating the accumulator
 * through the in-place helper `custom_max_inplace`.
 *
 * Returns a heap-allocated array of NROWS maxima owned by the caller.
 * Exits the process on allocation failure.
 */
double * max_reduce_axis_1_inplace(double ** x) {

  double * res = (double *) malloc(NROWS * sizeof(double));

  /* NULL check must precede the initialization loop; the original wrote
     through `res` before testing it. */
  if (res == NULL)
    exit(1);

  for (int i = 0; i < NROWS; i++)
    res[i] = -INFINITY;

  for (int i = 0; i < NROWS; i++) {
    for (int j = 0; j < NCOLS; j++) {
      custom_max_inplace(res, i, x[i][j]);
    }
  }

  return res;
}

/*
 * Same reduction with the comparison written inline (no helper call) —
 * the variant the benchmark expects to vectorize best.
 *
 * Returns a heap-allocated array of NROWS maxima owned by the caller.
 * Exits the process on allocation failure.
 */
double * max_reduce_axis_1_nocall(double ** x) {

  double * res = (double *) malloc(NROWS * sizeof(double));

  /* NULL check must precede the initialization loop; the original wrote
     through `res` before testing it. */
  if (res == NULL)
    exit(1);

  for (int i = 0; i < NROWS; i++)
    res[i] = -INFINITY;

  for (int i = 0; i < NROWS; i++) {
    for (int j = 0; j < NCOLS; j++) {
      double tmp = x[i][j];
      if (res[i] < tmp)
        res[i] = tmp;
    }
  }

  return res;
}

/* Fill the NROWS x NCOLS matrix with pseudo-random integers in [0, 9]. */
void simulate_data(double ** x) {
  for (int r = 0; r < NROWS; r++) {
    for (int c = 0; c < NCOLS; c++) {
      /* Small ints survive the float round-trip exactly, so behavior
         matches the original (float)(rand() % 10) form. */
      float val = (float)(rand() % 10);
      x[r][c] = val;
    }
  }
}


/*
 * Benchmark driver: allocates a NROWS x NCOLS matrix, then for ten rounds
 * times the three reduction variants on fresh random data and prints a
 * TSV table of seconds per call.  Verifies all variants agree.
 */
int main() {

  srand(time(NULL));

  double ** x;

  x = malloc(NROWS * sizeof(double *));

  if (x == NULL)
    exit(1);

  for (int i = 0; i < NROWS; i++) {
    x[i] = malloc(NCOLS * sizeof(double));

    /* Check the row just allocated — the original re-tested `x` itself,
       so a failed row allocation went undetected. */
    if (x[i] == NULL)
      exit(1);
  }

  printf("call\tcall-inplace\tnocall");
  for (int i = 0; i < 10; i++) {
    simulate_data(x);

    clock_t begin = clock();
    double *res_1 = max_reduce_axis_1(x);
    clock_t end = clock();
    double call_time_spent = (double)(end - begin) / CLOCKS_PER_SEC;

    begin = clock();
    double *res_2 = max_reduce_axis_1_inplace(x);
    end = clock();
    double call_inplace_time_spent = (double)(end - begin) / CLOCKS_PER_SEC;

    begin = clock();
    double *res_3 = max_reduce_axis_1_nocall(x);
    end = clock();
    double no_call_time_spent = (double)(end - begin) / CLOCKS_PER_SEC;

    printf("\n%f\t%f\t%f", call_time_spent, call_inplace_time_spent, no_call_time_spent);

    /* Sanity check: all three variants must agree element-wise.
       Use fprintf, not perror — perror would append an unrelated
       errno message.  (Inner index renamed to avoid shadowing `i`.) */
    for (int k = 0; k < NROWS; k++) {
      if (res_1[k] != res_2[k] || res_1[k] != res_3[k])
        fprintf(stderr, "results not equal\n");
    }
    free((void *) res_1);
    free((void *) res_2);
    free((void *) res_3);
  }
  printf("\n");

  // Clean up
  for (int i = 0; i < NROWS; i++) {
    free((void *) x[i]);
  }
  free((void *) x);

  return 0;
}
clang -v -O3 -g max_reduce_axis.c -o max_reduce_axis_test
clang version 10.0.0-4ubuntu1
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/bin
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/10
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/7
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/7.5.0
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/8
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/9
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/10
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7.5.0
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/8
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9
Selected GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/10
Candidate multilib: .;@m64
Selected multilib: .;@m64
 "/usr/lib/llvm-10/bin/clang" -cc1 -triple x86_64-pc-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name max_reduce_axis.c -mrelocation-model static -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -v -resource-dir /usr/lib/llvm-10/lib/clang/10.0.0 -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-10/lib/clang/10.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -fdebug-compilation-dir /tmp -ferror-limit 19 -fmessage-length 231 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -faddrsig -o /tmp/user/1000/max_reduce_axis-4a51bb.o -x c max_reduce_axis.c
clang -cc1 version 10.0.0 based upon LLVM 10.0.0 default target x86_64-pc-linux-gnu
ignoring nonexistent directory "/include"
include "..." search starts here:
include <...> search starts here:
 /usr/local/include
 /usr/lib/llvm-10/lib/clang/10.0.0/include
 /usr/include/x86_64-linux-gnu
 /usr/include
End of search list.
 "/usr/bin/ld" -z relro --hash-style=gnu --build-id --eh-frame-hdr -m elf_x86_64 -dynamic-linker /lib64/ld-linux-x86-64.so.2 -o max_reduce_axis_test /usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../../x86_64-linux-gnu/crt1.o /usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../../x86_64-linux-gnu/crti.o /usr/bin/../lib/gcc/x86_64-linux-gnu/10/crtbegin.o -L/usr/bin/../lib/gcc/x86_64-linux-gnu/10 -L/usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../../x86_64-linux-gnu -L/lib/x86_64-linux-gnu -L/lib/../lib64 -L/usr/lib/x86_64-linux-gnu -L/usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../.. -L/usr/lib/llvm-10/bin/../lib -L/lib -L/usr/lib /tmp/user/1000/max_reduce_axis-4a51bb.o -lgcc --as-needed -lgcc_s --no-as-needed -lc -lgcc --as-needed -lgcc_s --no-as-needed /usr/bin/../lib/gcc/x86_64-linux-gnu/10/crtend.o /usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../../x86_64-linux-gnu/crtn.o
./max_reduce_axis_test
call	call-inplace	nocall
0.033317	0.019050	0.019931
0.033618	0.019099	0.019679
0.032939	0.018802	0.019675
0.032866	0.018517	0.019641
0.032872	0.018449	0.019533
0.032871	0.018405	0.019492
0.032842	0.018149	0.019479
0.032894	0.018311	0.019516
0.033016	0.018266	0.019554
from contextlib import contextmanager

import numba
import numpy as np

# Benchmark input: a large 2D array reduced along its second axis.
X = np.random.normal(size=(5000, 6000))

#
# This is what we're trying to implement in Numba:
#
numpy_res = np.max(X, axis=1)

# This is what we're trying to match/beat:
# %timeit np.max(X, axis=1)
# 15.6 ms ± 129 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
#
# NOTE: We do *not* want to enable `parallel=True`, `fastmath=True`, or any
# other options that are neither generalizable nor used by NumPy.
#
@numba.njit(inline="always")
def custom_max(x, y):
    """Return the larger of ``x`` and ``y`` (``y`` wins ties)."""
    return x if x > y else y
#
# Case 1: Using `vectorize` along the reduced dimension
#
@numba.vectorize(["float64(float64, float64)"])
def vectorized_max(x, y):
    """Element-wise maximum ufunc built on `custom_max`."""
    return custom_max(x, y)


@numba.njit
def vectorized_max_reduce_axis_1(x):
    """Max-reduce axis 1 by applying the ufunc column-by-column into ``res``."""
    res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
    x_transpose = np.transpose(x)
    for m in range(x.shape[1]):
        vectorized_max(res, x_transpose[m], res)
    return res


# Confirm that it works
assert np.array_equal(numpy_res, vectorized_max_reduce_axis_1(X))
# %timeit vectorized_max_reduce_axis_1(X)
# 94.2 ms ± 435 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
#
# Case 2: Manually written Numba reduction loops
#
@numba.njit
def manual_max_reduce_axis_1(x):
    """Max-reduce axis 1 with explicit nested loops calling `custom_max`."""
    res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            res[i] = custom_max(res[i], x[i, j])
    return res


assert np.array_equal(numpy_res, manual_max_reduce_axis_1(X))
# %timeit manual_max_reduce_axis_1(X)
# 38.3 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
#
# Case 3: Calls to NumPy's `ufunc.reduce` via `objmode`
#
@numba.njit
def objmode_max_reduce_axis_1(x):
    """Max-reduce axis 1 by escaping to object mode and using `ufunc.reduce`."""
    with numba.objmode(res="float64[:]"):
        res = vectorized_max.reduce(x, axis=1)
    return res


assert np.array_equal(numpy_res, objmode_max_reduce_axis_1(X))
# %timeit objmode_max_reduce_axis_1(X)
# 73.7 ms ± 262 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
#
# Case 4: Recompile Case 2 with more LLVM optimizations in the "cheap" pass
#
# This comes from the discussions in
# https://numba.discourse.group/t/numba-performance-doesnt-scale-as-well-as-numpy-in-vectorized-max-function/782
#
@contextmanager
def use_optimized_cheap_pass(*args, **kwargs):
    """Temporarily replace the cheap optimization pass with a better one."""
    from numba.core.registry import cpu_target

    context = cpu_target.target_context._internal_codegen
    old_pm = context._mpm_cheap
    new_pm = context._module_pass_manager(
        loop_vectorize=True, slp_vectorize=True, opt=3, cost="cheap"
    )
    context._mpm_cheap = new_pm
    try:
        yield
    finally:
        # Always restore the original pass manager, even on failure.
        context._mpm_cheap = old_pm


with use_optimized_cheap_pass():

    @numba.njit
    def opt_manual_max_reduce_axis_1(x):
        """Case 2's loops, compiled under the upgraded "cheap" LLVM pass."""
        res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
        for i in range(x.shape[0]):
            for j in range(x.shape[1]):
                res[i] = custom_max(res[i], x[i, j])
        return res

    # NOTE(review): the first call triggers lazy compilation, so it is kept
    # inside the context for the modified pass to take effect — confirm this
    # matches the original layout (indentation was lost in extraction).
    assert np.array_equal(numpy_res, opt_manual_max_reduce_axis_1(X))

# %timeit opt_manual_max_reduce_axis_1(X)
# 37.6 ms ± 872 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
#
# Case 5: This is Stuart's original form of the reduction
#
# Apparently, removing the use of `custom_max` and manually "in-lining" the
# `max` operation makes a large difference; however, we can't reasonably do
# this for every binary function that will perform a reduction.
#
with use_optimized_cheap_pass():

    @numba.njit
    def orig_opt_max_reduce_axis_1(x):
        """Max-reduce axis 1 with the comparison written inline (no helper)."""
        res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
        for i in range(x.shape[0]):
            for j in range(x.shape[1]):
                tmp = x[i, j]
                if res[i] < tmp:
                    res[i] = tmp
        return res

    # First call compiles under the optimized pass.
    assert np.array_equal(numpy_res, orig_opt_max_reduce_axis_1(X))

# %timeit orig_opt_max_reduce_axis_1(X)
# 20.3 ms ± 233 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
#
# Case 6: Use an in-place version of `custom_max`
# (the original comment mislabeled this as a second "Case 5")
#
with use_optimized_cheap_pass():

    @numba.njit
    def custom_max(res, i, y):
        """In-place maximum: ``res[i] = max(res[i], y)``.

        Shadows the earlier binary ``custom_max``.
        """
        if res[i] < y:
            res[i] = y

    @numba.njit
    def inplace_opt_max_reduce_axis_1(x):
        """Max-reduce axis 1 through the in-place helper."""
        res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
        for i in range(x.shape[0]):
            for j in range(x.shape[1]):
                custom_max(res, i, x[i, j])
        return res

    assert np.array_equal(numpy_res, inplace_opt_max_reduce_axis_1(X))

# %timeit inplace_opt_max_reduce_axis_1(X)
# 19.5 ms ± 144 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
from contextlib import contextmanager, suppress

import numba
import numpy as np

# Emit debug info so `inspect_cfg` can interleave Python source with IR.
numba.config.DEBUGINFO_DEFAULT = 1

X = np.random.normal(size=(5000, 6000))
numpy_res = np.max(X, axis=1)
@contextmanager
def use_optimized_cheap_pass(*args, **kwargs):
    """Temporarily replace the cheap optimization pass with a better one."""
    from numba.core.registry import cpu_target

    context = cpu_target.target_context._internal_codegen
    old_pm = context._mpm_cheap
    new_pm = context._module_pass_manager(
        loop_vectorize=True, slp_vectorize=True, opt=3, cost="cheap"
    )
    context._mpm_cheap = new_pm
    try:
        yield
    finally:
        # Restore the original pass manager no matter what happened.
        context._mpm_cheap = old_pm
# Compare CFGs of the "slow" (helper-calling) and "fast" (inlined) reductions
# under both an unoptimized and an optimized compilation configuration.
for opt_type in ["unopt", "opt"]:
    # "unopt": no-op context + OPT=0; "opt": upgraded cheap pass + OPT=3.
    cm = suppress if opt_type == "unopt" else use_optimized_cheap_pass
    numba.config.OPT = 0 if opt_type == "unopt" else 3

    @numba.njit(inline="always")
    def custom_max(x, y):
        """Return the larger of ``x`` and ``y``."""
        if x > y:
            return x
        else:
            return y

    with cm():

        @numba.njit
        def slow_max_reduce_axis_1(x):
            res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
            for i in range(x.shape[0]):
                for j in range(x.shape[1]):
                    res[i] = custom_max(res[i], x[i, j])
            return res

        # NOTE(review): first call compiles the function, so it stays inside
        # the context — confirm against the original (indentation was lost).
        assert np.array_equal(numpy_res, slow_max_reduce_axis_1(X))

    res = slow_max_reduce_axis_1.inspect_cfg(
        slow_max_reduce_axis_1.signatures[0],
        strip_ir=False,
        interleave=True,
        fontsize=10,
    )
    _ = res.display(filename=f"{opt_type}-slow-max-reduce-axis-1", format="pdf")

    with cm():

        @numba.njit
        def fast_max_reduce_axis_1(x):
            res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
            for i in range(x.shape[0]):
                for j in range(x.shape[1]):
                    tmp = x[i, j]
                    if res[i] < tmp:
                        res[i] = tmp
            return res

        assert np.array_equal(numpy_res, fast_max_reduce_axis_1(X))

    res = fast_max_reduce_axis_1.inspect_cfg(
        fast_max_reduce_axis_1.signatures[0],
        strip_ir=False,
        interleave=True,
        fontsize=10,
    )
    _ = res.display(filename=f"{opt_type}-fast-max-reduce-axis-1", format="pdf")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment