Last active
January 7, 2021 11:10
-
-
Save pervognsen/2af7246113929b48f5fd898546b5289f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def blocks(A, block_size=(1, 1)): | |
i, j = block_size | |
while len(A): | |
yield A[:i, :j], A[:i, j:], A[i:, :j], A[i:, j:] | |
A = A[i:, j:] | |
# 2400 ms for 1000x1000 matrix (~0.4 GFLOPS). Equivalent C code is only twice as fast (~0.8 GFLOPS). | |
# The reason the C code isn't much faster is that it's an O(n^3) algorithm and most of the time is spent in | |
# the O(n^2) kernel routine for the outer product in A11[:] -= A10 @ A10.T. Even if we posit that Python | |
# is 1000x slower than C for the outer loop, that's still 1000n + n^3 vs n^3, which is negligible for n = 1000. | |
def basic_cholesky(A): | |
for A00, A01, A10, A11 in blocks(A): | |
A00[:] = np.sqrt(A00) | |
A01[:] = 0 | |
A10[:] /= A00 | |
A11[:] -= A10 @ A10.T | |
# 75 ms for 1000x1000 matrix (~12.5 GFLOPS) versus 25 ms (~40 GFLOPS) for np.linalg.cholesky which uses LAPACK. | |
def blocked_cholesky(A, block_size=(64, 64)): | |
for A00, A01, A10, A11 in blocks(A, block_size): | |
basic_cholesky(A00) | |
A01[:] = 0 | |
lower_triangular_solve(A00, A10.T) | |
A11[:] -= A10 @ A10.T |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment