from contextlib import contextmanager

import numba
import numpy as np

X = np.random.normal(size=(5000, 6000))

#
# This is what we're trying to implement in Numba:
#
numpy_res = np.max(X, axis=1)

# This is what we're trying to match/beat:
# %timeit np.max(X, axis=1)
# 15.6 ms ± 129 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
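
# The `%timeit` results quoted throughout come from IPython. To reproduce them
# in a plain script, here is a minimal sketch using the standard `timeit`
# module (the helper name `bench` is ours, not part of the original):
import timeit


def bench(fn, *args, number=10):
    fn(*args)  # warm-up call, so JIT compilation stays out of the timing
    secs = min(timeit.repeat(lambda: fn(*args), number=number, repeat=3))
    return secs / number * 1e3  # milliseconds per loop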

#
# NOTE: We do *not* want to enable `parallel=True`, `fastmath=True`, or any
# other options that are neither generalizable nor used by NumPy.
#
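# For contrast, the shortcut those options would take looks like the sketch
# below (kept commented out on purpose): `parallel=True` threads the loops and
# `fastmath=True` relaxes IEEE float semantics, neither of which NumPy's
# single-threaded `np.max` does. The name is illustrative only.
#
#     @numba.njit(parallel=True, fastmath=True)
#     def unfair_max_reduce_axis_1(x):
#         ...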


@numba.njit(inline="always")
def custom_max(x, y):
    if x > y:
        return x
    else:
        return y
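

# `inline="always"` asks Numba to inline `custom_max` at the Numba-IR level at
# every jitted call site, so the hot loops below pay no call overhead. Called
# directly, it still compiles as an ordinary jitted function; a quick sanity
# check:
assert custom_max(2.0, 1.0) == 2.0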


#
# Case 1: Using `vectorize` along the reduced dimension
#
@numba.vectorize(["float64(float64, float64)"])
def vectorized_max(x, y):
    return custom_max(x, y)
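

# Because it was compiled with explicit signatures, `vectorized_max` is a true
# NumPy ufunc, which is what makes the `.reduce` call in Case 3 possible:
assert vectorized_max(3.0, 4.0) == 4.0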


@numba.njit
def vectorized_max_reduce_axis_1(x):
    res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
    x_transpose = np.transpose(x)
    for m in range(x.shape[1]):
        # Fold one column of `x` at a time into `res`; the third argument is
        # the ufunc's explicit output array, so the update happens in-place.
        vectorized_max(res, x_transpose[m], res)

    return res


# Confirm that it works
assert np.array_equal(numpy_res, vectorized_max_reduce_axis_1(X))

# %timeit vectorized_max_reduce_axis_1(X)
# 94.2 ms ± 435 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


#
# Case 2: Manually written Numba reduction loops
#
@numba.njit
def manual_max_reduce_axis_1(x):
    res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            res[i] = custom_max(res[i], x[i, j])
    return res


assert np.array_equal(numpy_res, manual_max_reduce_axis_1(X))

# %timeit manual_max_reduce_axis_1(X)
# 38.3 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#
# Case 3: Calls to NumPy's `ufunc.reduce` via `objmode`
#
@numba.njit
def objmode_max_reduce_axis_1(x):
    # `objmode` drops back into the interpreter, so every call pays the
    # boxing/unboxing cost of the round trip on top of the reduction itself.
    with numba.objmode(res="float64[:]"):
        res = vectorized_max.reduce(x, axis=1)
    return res


assert np.array_equal(numpy_res, objmode_max_reduce_axis_1(X))

# %timeit objmode_max_reduce_axis_1(X)
# 73.7 ms ± 262 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


#
# Case 4: Recompile Case 2 with more LLVM optimizations in the "cheap" pass
#
# This comes from the discussions in
# https://numba.discourse.group/t/numba-performance-doesnt-scale-as-well-as-numpy-in-vectorized-max-function/782
#
@contextmanager
def use_optimized_cheap_pass(*args, **kwargs):
    """Temporarily replace the cheap optimization pass with a better one."""
    from numba.core.registry import cpu_target

    context = cpu_target.target_context._internal_codegen
    old_pm = context._mpm_cheap
    new_pm = context._module_pass_manager(
        loop_vectorize=True, slp_vectorize=True, opt=3, cost="cheap"
    )
    context._mpm_cheap = new_pm
    try:
        yield
    finally:
        context._mpm_cheap = old_pm
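

# Everything touched above (`_internal_codegen`, `_mpm_cheap`,
# `_module_pass_manager`) is private Numba API, so this workaround is tied to
# the Numba version it was written against.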


with use_optimized_cheap_pass():

    # `njit` compiles lazily, so the first call (the assert below) has to
    # happen inside the context for the optimized pass to take effect.
    @numba.njit
    def opt_manual_max_reduce_axis_1(x):
        res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
        for i in range(x.shape[0]):
            for j in range(x.shape[1]):
                res[i] = custom_max(res[i], x[i, j])
        return res

    assert np.array_equal(numpy_res, opt_manual_max_reduce_axis_1(X))

# %timeit opt_manual_max_reduce_axis_1(X)
# 37.6 ms ± 872 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


#
# Case 5: This is Stuart's original form of the reduction
#
# Apparently, removing the use of `custom_max` and manually inlining the `max`
# comparison makes a large difference; however, we can't reasonably do this by
# hand for every binary function we might want to reduce with.
#
with use_optimized_cheap_pass():

    @numba.njit
    def orig_opt_max_reduce_axis_1(x):
        res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
        for i in range(x.shape[0]):
            for j in range(x.shape[1]):
                tmp = x[i, j]
                if res[i] < tmp:
                    res[i] = tmp
        return res

    assert np.array_equal(numpy_res, orig_opt_max_reduce_axis_1(X))

# %timeit orig_opt_max_reduce_axis_1(X)
# 20.3 ms ± 233 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

#
# Case 6: Use an in-place version of `custom_max`
#
with use_optimized_cheap_pass():

    # In-place version of the binary `custom_max` above, given its own name so
    # it doesn't shadow that definition.
    @numba.njit
    def custom_max_inplace(res, i, y):
        if res[i] < y:
            res[i] = y

    @numba.njit
    def inplace_opt_max_reduce_axis_1(x):
        res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
        for i in range(x.shape[0]):
            for j in range(x.shape[1]):
                custom_max_inplace(res, i, x[i, j])
        return res

    assert np.array_equal(numpy_res, inplace_opt_max_reduce_axis_1(X))

# %timeit inplace_opt_max_reduce_axis_1(X)
# 19.5 ms ± 144 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
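
#
# Timing summary (from the `%timeit` runs above):
#
#   NumPy `np.max(X, axis=1)`                  15.6 ms
#   Case 1: `vectorize` + output argument      94.2 ms
#   Case 2: manual reduction loops             38.3 ms
#   Case 3: `ufunc.reduce` via `objmode`       73.7 ms
#   Case 4: Case 2 + optimized cheap pass      37.6 ms
#   Case 5: hand-inlined comparison            20.3 ms
#   Case 6: in-place `custom_max` version      19.5 ms
#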