-
-
Save hughdbrown/48719b8a64991719cffe9458011858a2 to your computer and use it in GitHub Desktop.
import os | |
os.environ["NUMBA_LOOP_VECTORIZE"] = "0" | |
os.environ["NUMBA_SLP_VECTORIZE"] = "0" | |
from datetime import datetime | |
from numba import jit, uint32, uint64 | |
import numpy as np | |
@jit("uint32(uint32)") | |
def lcg(seed): | |
temp = uint64(seed) * 2862933555777941757 + 3037000493 | |
return temp >> 32 | |
@jit | |
def generate_random_numbers(n): | |
result = np.empty((n,), dtype=np.uint32) | |
result[0] = uint32(1) | |
for i in range(1, n): | |
# 🙁 result[i] has a data dependency on result[i - 1], preventing | |
# instruction-level parallelism: | |
result[i] = lcg(result[i - 1]) | |
return result | |
def to_image(f): | |
return f(256 * 256 // 4).view(np.uint8).reshape((256, 256)) | |
@jit | |
def generate_not_so_random_numbers(n): | |
result = np.empty((n,), dtype=np.uint32) | |
for i in range(1, n): | |
# 🤔 As a temporary experiment, remove all data dependencies that are | |
# preventing ILP, allowing us to see how much faster we can go: | |
result[i] = lcg(i) | |
return result | |
@jit | |
def generate_random_numbers_3(n): | |
result = np.empty((n,), dtype=np.uint32) | |
# Make sure we have 4 sufficiently different starting points: | |
result[0] = uint32(1) | |
result[1] = lcg(lcg(result[0]) + 1) | |
result[2] = lcg(lcg(result[1]) + 1) | |
result[3] = lcg(lcg(result[2]) + 1) | |
# 😎 Do 4 unrelated calculations that don't share data dependencies with | |
# each other, allowing the CPU to run them in parallel: | |
for i in range(1, n // 4): | |
result[i * 4 ] = lcg(result[(i - 1) * 4 ]) | |
result[i * 4 + 1] = lcg(result[(i - 1) * 4 + 1]) | |
result[i * 4 + 2] = lcg(result[(i - 1) * 4 + 2]) | |
result[i * 4 + 3] = lcg(result[(i - 1) * 4 + 3]) | |
# Calculate the remaining last few values: | |
for i in range(n % 4): | |
result[n - (3 - i)] = lcg(result[n - (3 - i) - 1]) | |
return result | |
@jit | |
def generate_random_numbers_4(n): | |
result = np.empty((n,), dtype=np.uint32) | |
# 😎 Use simple variables to make it faster to access the previous value: | |
random_number1 = uint32(1) | |
random_number2 = lcg(lcg(random_number1) + 1) | |
random_number3 = lcg(lcg(random_number2) + 1) | |
random_number4 = lcg(lcg(random_number3) + 1) | |
for i in range(n // 4): | |
random_number1 = lcg(random_number1) | |
random_number2 = lcg(random_number2) | |
random_number3 = lcg(random_number3) | |
random_number4 = lcg(random_number4) | |
# 😎 Less math, no more array reads: | |
result[i * 4 ] = random_number1 | |
result[i * 4 + 1] = random_number2 | |
result[i * 4 + 2] = random_number3 | |
result[i * 4 + 3] = random_number4 | |
for i in range(n % 4): | |
result[n - (3 - i)] = lcg(result[n - (3 - i) - 1]) | |
return result | |
def main(): | |
fns = [ | |
generate_random_numbers, | |
generate_not_so_random_numbers, | |
generate_random_numbers_3, | |
generate_random_numbers_4, | |
] | |
for fn in fns: | |
print(f"{'-' * 30} {fn.__name__}") | |
for _ in range(10): | |
s = datetime.now() | |
to_image(fn) | |
e = datetime.now() | |
print(f"Elapsed {e - s}") | |
if __name__ == '__main__': | |
main() |
hughdbrown
commented
Oct 9, 2024
The first time you call a Numba function with a specific set of types, it gets compiled, so that adds a lot of overhead. You want to measure the time on the second (or later) time it's called.
With multiple iterations, to warm up the code generation:
~/Downloads ❯ python3 ~/scratch-code/random_number.py 5s scratch-3.12.0 3.12.0 11:20:57
i------------------------------ generate_random_numbers
Elapsed 0:00:00.060160
Elapsed 0:00:00.000025
Elapsed 0:00:00.000024
Elapsed 0:00:00.000021
Elapsed 0:00:00.000024
Elapsed 0:00:00.000024
Elapsed 0:00:00.000019
Elapsed 0:00:00.000019
Elapsed 0:00:00.000025
Elapsed 0:00:00.000024
i------------------------------ generate_not_so_random_numbers
Elapsed 0:00:00.025977
Elapsed 0:00:00.000010
Elapsed 0:00:00.000007
Elapsed 0:00:00.000008
Elapsed 0:00:00.000008
Elapsed 0:00:00.000008
Elapsed 0:00:00.000007
Elapsed 0:00:00.000007
Elapsed 0:00:00.000006
Elapsed 0:00:00.000006
i------------------------------ generate_random_numbers_3
Elapsed 0:00:00.061300
Elapsed 0:00:00.000018
Elapsed 0:00:00.000015
Elapsed 0:00:00.000017
Elapsed 0:00:00.000013
Elapsed 0:00:00.000012
Elapsed 0:00:00.000017
Elapsed 0:00:00.000016
Elapsed 0:00:00.000017
Elapsed 0:00:00.000015
i------------------------------ generate_random_numbers_4
Elapsed 0:00:00.057804
Elapsed 0:00:00.000010
Elapsed 0:00:00.000007
Elapsed 0:00:00.000008
Elapsed 0:00:00.000007
Elapsed 0:00:00.000007
Elapsed 0:00:00.000006
Elapsed 0:00:00.000006
Elapsed 0:00:00.000006
Elapsed 0:00:00.000006
This is one of at least two comments that will result in me moving whole chapters around 😁
So those final results match the text I think? First is slowest, second is fast (but wrong), third is correct (but in the middle speedwise), fourth is fast again.
Yes, much more like what the text says.