import numpy as np

# Generate the 2x2 block partition of A for a right-looking factorization: at each
# step, yield views of the diagonal block A00, the off-diagonal strips A01 and A10,
# and the trailing submatrix A11, then recurse into the trailing submatrix.
def blocks(A, block_size=(1, 1)):
    i, j = block_size
    while len(A):
        yield A[:i, :j], A[:i, j:], A[i:, :j], A[i:, j:]
        A = A[i:, j:]
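# For example, with block_size=(2, 2) on a 4x4 matrix, the first iteration yields
# the 2x2 corner A[:2, :2], the strips A[:2, 2:] and A[2:, :2], and the trailing
# 2x2 submatrix A[2:, 2:]; the next iteration partitions that trailing submatrix.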
# 2400 ms for a 1000x1000 matrix (~0.4 GFLOPS). Equivalent C code is only about twice as fast (~0.8 GFLOPS).
# The reason the C code isn't much faster is that this is an O(n^3) algorithm and most of the time is spent in
# the O(n^2) kernel routine for the outer product in A11[:] -= A10 @ A10.T. Even if we posit that Python
# is 1000x slower than C for the outer loop, that's still 1000n + n^3 vs n^3, and the extra 1000n term is
# negligible for n = 1000.
# Unblocked right-looking Cholesky: overwrites A in place with its lower
# triangular factor L, where A = L @ L.T.
def basic_cholesky(A):
    for A00, A01, A10, A11 in blocks(A):
        A00[:] = np.sqrt(A00)     # 1x1 pivot: L00 = sqrt(A00)
        A01[:] = 0                # zero out the strict upper triangle
        A10[:] /= A00             # L10 = A10 / L00
        A11[:] -= A10 @ A10.T     # rank-1 update of the trailing submatrix
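# blocked_cholesky below calls lower_triangular_solve, which isn't defined in this
# section. Here is a minimal sketch by forward substitution, assuming the intended
# contract is "overwrite B with inv(L) @ B for lower triangular L"; an optimized
# routine (e.g. scipy.linalg.solve_triangular) would be the practical choice.
def lower_triangular_solve(L, B):
    for k in range(len(L)):
        B[k] -= L[k, :k] @ B[:k]  # subtract contributions from already-solved rows
        B[k] /= L[k, k]           # divide by the diagonal pivot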
# 75 ms for a 1000x1000 matrix (~12.5 GFLOPS) versus 25 ms (~40 GFLOPS) for np.linalg.cholesky, which uses LAPACK.
# Blocked right-looking Cholesky: factor the diagonal block with the unblocked
# routine, then update the rest with a triangular solve and a matrix product.
def blocked_cholesky(A, block_size=(64, 64)):
    for A00, A01, A10, A11 in blocks(A, block_size):
        basic_cholesky(A00)                  # A00 = L00 in place
        A01[:] = 0                           # zero out the strict upper triangle
        lower_triangular_solve(A00, A10.T)   # A10 = L10 = A10 @ inv(L00).T, in place
        A11[:] -= A10 @ A10.T                # rank-k update of the trailing submatrix
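# A quick self-check (hypothetical usage, not part of the original gist): factor a
# random symmetric positive definite matrix in place and compare the result with
# np.linalg.cholesky.
if __name__ == "__main__":
    n = 1000
    M = np.random.rand(n, n)
    A = M @ M.T + n * np.eye(n)   # symmetric positive definite by construction
    L = np.linalg.cholesky(A)
    blocked_cholesky(A)           # overwrites A with its lower Cholesky factor
    assert np.allclose(A, L)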