Skip to content

Instantly share code, notes, and snippets.

@hughdbrown
Last active October 10, 2024 17:25
Show Gist options
  • Save hughdbrown/48719b8a64991719cffe9458011858a2 to your computer and use it in GitHub Desktop.
Save hughdbrown/48719b8a64991719cffe9458011858a2 to your computer and use it in GitHub Desktop.
Test various strategies for creating arrays of random numbers, some with serial dependencies
import os
os.environ["NUMBA_LOOP_VECTORIZE"] = "0"
os.environ["NUMBA_SLP_VECTORIZE"] = "0"
from datetime import datetime
from numba import jit, uint32, uint64
import numpy as np
@jit("uint32(uint32)")
def lcg(seed):
temp = uint64(seed) * 2862933555777941757 + 3037000493
return temp >> 32
@jit
def generate_random_numbers(n):
result = np.empty((n,), dtype=np.uint32)
result[0] = uint32(1)
for i in range(1, n):
# ๐Ÿ™ result[i] has a data dependency on result[i - 1], preventing
# instruction-level parallelism:
result[i] = lcg(result[i - 1])
return result
def to_image(f):
return f(256 * 256 // 4).view(np.uint8).reshape((256, 256))
@jit
def generate_not_so_random_numbers(n):
result = np.empty((n,), dtype=np.uint32)
for i in range(1, n):
# ๐Ÿค” As a temporary experiment, remove all data dependencies that are
# preventing ILP, allowing us to see how much faster we can go:
result[i] = lcg(i)
return result
@jit
def generate_random_numbers_3(n):
result = np.empty((n,), dtype=np.uint32)
# Make sure we have 4 sufficiently different starting points:
result[0] = uint32(1)
result[1] = lcg(lcg(result[0]) + 1)
result[2] = lcg(lcg(result[1]) + 1)
result[3] = lcg(lcg(result[2]) + 1)
# ๐Ÿ˜Ž Do 4 unrelated calculations that don't share data dependencies with
# each other, allowing the CPU to run them in parallel:
for i in range(1, n // 4):
result[i * 4 ] = lcg(result[(i - 1) * 4 ])
result[i * 4 + 1] = lcg(result[(i - 1) * 4 + 1])
result[i * 4 + 2] = lcg(result[(i - 1) * 4 + 2])
result[i * 4 + 3] = lcg(result[(i - 1) * 4 + 3])
# Calculate the remaining last few values:
for i in range(n % 4):
result[n - (3 - i)] = lcg(result[n - (3 - i) - 1])
return result
@jit
def generate_random_numbers_4(n):
result = np.empty((n,), dtype=np.uint32)
# ๐Ÿ˜Ž Use simple variables to make it faster to access the previous value:
random_number1 = uint32(1)
random_number2 = lcg(lcg(random_number1) + 1)
random_number3 = lcg(lcg(random_number2) + 1)
random_number4 = lcg(lcg(random_number3) + 1)
for i in range(n // 4):
random_number1 = lcg(random_number1)
random_number2 = lcg(random_number2)
random_number3 = lcg(random_number3)
random_number4 = lcg(random_number4)
# ๐Ÿ˜Ž Less math, no more array reads:
result[i * 4 ] = random_number1
result[i * 4 + 1] = random_number2
result[i * 4 + 2] = random_number3
result[i * 4 + 3] = random_number4
for i in range(n % 4):
result[n - (3 - i)] = lcg(result[n - (3 - i) - 1])
return result
def main():
fns = [
generate_random_numbers,
generate_not_so_random_numbers,
generate_random_numbers_3,
generate_random_numbers_4,
]
for fn in fns:
print(f"{'-' * 30} {fn.__name__}")
for _ in range(10):
s = datetime.now()
to_image(fn)
e = datetime.now()
print(f"Elapsed {e - s}")
if __name__ == '__main__':
main()
@hughdbrown
Copy link
Author

Yes, much more like what the text says.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment