Last active
October 10, 2024 17:25
-
-
Save hughdbrown/48719b8a64991719cffe9458011858a2 to your computer and use it in GitHub Desktop.
Test various strategies for creating arrays of random numbers, some with serial dependencies
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
os.environ["NUMBA_LOOP_VECTORIZE"] = "0" | |
os.environ["NUMBA_SLP_VECTORIZE"] = "0" | |
from datetime import datetime | |
from numba import jit, uint32, uint64 | |
import numpy as np | |
@jit("uint32(uint32)") | |
def lcg(seed): | |
temp = uint64(seed) * 2862933555777941757 + 3037000493 | |
return temp >> 32 | |
@jit | |
def generate_random_numbers(n): | |
result = np.empty((n,), dtype=np.uint32) | |
result[0] = uint32(1) | |
for i in range(1, n): | |
# π result[i] has a data dependency on result[i - 1], preventing | |
# instruction-level parallelism: | |
result[i] = lcg(result[i - 1]) | |
return result | |
def to_image(f): | |
return f(256 * 256 // 4).view(np.uint8).reshape((256, 256)) | |
@jit | |
def generate_not_so_random_numbers(n): | |
result = np.empty((n,), dtype=np.uint32) | |
for i in range(1, n): | |
# π€ As a temporary experiment, remove all data dependencies that are | |
# preventing ILP, allowing us to see how much faster we can go: | |
result[i] = lcg(i) | |
return result | |
@jit | |
def generate_random_numbers_3(n): | |
result = np.empty((n,), dtype=np.uint32) | |
# Make sure we have 4 sufficiently different starting points: | |
result[0] = uint32(1) | |
result[1] = lcg(lcg(result[0]) + 1) | |
result[2] = lcg(lcg(result[1]) + 1) | |
result[3] = lcg(lcg(result[2]) + 1) | |
# π Do 4 unrelated calculations that don't share data dependencies with | |
# each other, allowing the CPU to run them in parallel: | |
for i in range(1, n // 4): | |
result[i * 4 ] = lcg(result[(i - 1) * 4 ]) | |
result[i * 4 + 1] = lcg(result[(i - 1) * 4 + 1]) | |
result[i * 4 + 2] = lcg(result[(i - 1) * 4 + 2]) | |
result[i * 4 + 3] = lcg(result[(i - 1) * 4 + 3]) | |
# Calculate the remaining last few values: | |
for i in range(n % 4): | |
result[n - (3 - i)] = lcg(result[n - (3 - i) - 1]) | |
return result | |
@jit | |
def generate_random_numbers_4(n): | |
result = np.empty((n,), dtype=np.uint32) | |
# π Use simple variables to make it faster to access the previous value: | |
random_number1 = uint32(1) | |
random_number2 = lcg(lcg(random_number1) + 1) | |
random_number3 = lcg(lcg(random_number2) + 1) | |
random_number4 = lcg(lcg(random_number3) + 1) | |
for i in range(n // 4): | |
random_number1 = lcg(random_number1) | |
random_number2 = lcg(random_number2) | |
random_number3 = lcg(random_number3) | |
random_number4 = lcg(random_number4) | |
# π Less math, no more array reads: | |
result[i * 4 ] = random_number1 | |
result[i * 4 + 1] = random_number2 | |
result[i * 4 + 2] = random_number3 | |
result[i * 4 + 3] = random_number4 | |
for i in range(n % 4): | |
result[n - (3 - i)] = lcg(result[n - (3 - i) - 1]) | |
return result | |
def main(): | |
fns = [ | |
generate_random_numbers, | |
generate_not_so_random_numbers, | |
generate_random_numbers_3, | |
generate_random_numbers_4, | |
] | |
for fn in fns: | |
print(f"{'-' * 30} {fn.__name__}") | |
for _ in range(10): | |
s = datetime.now() | |
to_image(fn) | |
e = datetime.now() | |
print(f"Elapsed {e - s}") | |
if __name__ == '__main__': | |
main() |
So those final results match the text I think? First is slowest, second is fast (but wrong), third is correct (but in the middle speedwise), fourth is fast again.
Yes, much more like what the text says.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is one of at least two comments that will result in me moving whole chapters around π