Created
October 3, 2023 03:22
-
-
Save josiahcarlson/299c6cfaee33985c2b4aa80a83247cf9 to your computer and use it in GitHub Desktop.
Find all 1-bit differences among provided integers.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
one_bit_difference.py | |
Written September / October 2023 by Josiah Carlson - [email protected] | |
Released into the public domain. Please include this authorship note / reference | |
to this GIST if you use / derive this software. Thank you. | |
Been thinking about this algorithm for about 17 years, haven't seen it around. | |
I might not know the proper literature search to find relevant references, or if | |
this was already done. | |
This algorithm finds all 1-bit differences in provided integers in | |
O(num_items * num_bits) time. We directly examine every bit approximately once | |
during our algorithm, may subtract up to num_items of up to num_bits, and may | |
mask a total of num_items for up to num_bits. Overall, this gives us a | |
worst-case constant for potentially large-number bit examinations at 3. For | |
random numbers, and even chosen 1-bit difference groups, we usually see a total | |
of closer to 2.3-2.5 in practice for non-sequential data, as execution constants | |
vary depending on the data bits themselves. | |
Overall Algorithm: | |
1. Insert all items into a prefix trie | |
2. From leaf to root, perform the "find_difference_merge()" function defined | |
below, with leaves having bpos=0, their result having bpos=1, etc. | |
3. your output should contain all pairs of 1-bit difference values | |
find_difference_merge: | |
1. if only one leaf has data, or there is only one leaf, return that one leaf | |
2. Given that zeroes and ones are already individually sorted, up to bpos bits, | |
we can zipper-compare the lowest bpos bits | |
a. if the lower bpos bits are the same, emit the match, advance to the next | |
item in both sequences | |
b. if the zeroes item is less, advance to the next zeroes item | |
c. else the ones item is less, advance to the next ones item | |
^^^ each step in #2 advances at least 1 item in zeroes, ones, or both lists | |
-- we count "comparisons" as subtractions in our metrics, as most platforms get | |
at least == 0 and <0 as easy checks for integer and infinite-precision | |
integer operations. | |
Cost of creating trie: O(num_items * num_bits) bit comparisons | |
Cost of traversing trie from leaf to root: O(num_items * comparisons_per_item) | |
comparisons_per_item <= num_bits - 1 | |
In random inputs, sequential inputs, and explicitly generated 1-bit-difference | |
graphs, we see "subtractions_per_bit" never go above .75. | |
* for low-match scenarios, we will see masks / subtractions approach 1 from above | |
* for high-match scenarios, we will see masks / subtractions approach 2 + epsilon | |
from below | |
""" | |
from itertools import count | |
import os | |
# metrics counters | |
check_bit = count() | |
masks = count() | |
subtractions = count() | |
control_flow = count() | |
traverse = count() | |
def find_difference_merge(output: list, zeroes: list, ones: list, bpos: int) -> list: | |
""" | |
Args: | |
output - list to produce pairs of 1-bit differences | |
zeroes - list of items with a 0 in the bpos bit position | |
ones - list of items with a 1 in the bpos bit position | |
bpos - which bit position we are checking | |
Given two lists of sorted items with identical prefixes of n-bpos-1 bits, we | |
are to calculate all pairs of items with 1 total bit of difference. | |
Side-effects: | |
Appends (smaller, larger) pairs of values with 1 bit differences to output, | |
and returns combined list of zeroes + ones. | |
""" | |
next(control_flow) | |
if not zeroes: | |
return ones | |
if not ones: | |
return zeroes | |
zp = 0 | |
op = 0 | |
lz = len(zeroes) | |
lo = len(ones) | |
mask = (1 << bpos) - 1 | |
next(masks) | |
next(masks) | |
zer = zeroes[zp] & mask | |
one = ones[op] & mask | |
while zp < lz and op < lo: | |
next(control_flow) | |
# Really want a 3-way comparison, there's an assembly instruction for that, | |
# or even just I86 cmov I suspect. | |
# print("comparing", bin(zer), zer, bin(one),one, bin(mask)) | |
delta = zer - one | |
next(subtractions) | |
if delta == 0: | |
output.append((zeroes[zp], ones[op])) | |
zp += 1 | |
op += 1 | |
if zp < lz: | |
zer = zeroes[zp] & mask | |
next(masks) | |
if op < lo: | |
one = ones[op] & mask | |
next(masks) | |
next(control_flow) # op < lo and zp < lz | |
elif delta < 0: | |
zp += 1 | |
if zp < lz: | |
zer = zeroes[zp] & mask | |
next(masks) | |
next(control_flow) # zp < lz | |
else: | |
op += 1 | |
if op < lo: | |
one = ones[op] & mask | |
next(masks) | |
next(control_flow) # op < lo | |
next(control_flow) # delta == 0 or delta < 0 | |
next(control_flow) # loop check exit | |
# all zeroes come before all ones | |
return zeroes + ones | |
def construct_trie(items: list, bits: int) -> tuple: | |
""" | |
Args: | |
items - list of items to partition into a bitwise trie | |
bits - how many bits to partition this list of items | |
Returns: | |
Recursively-partitioned trie structure. | |
construct_trie([3,1,2], 2) -> (([], [1]), ([2], [3])) | |
""" | |
mask = 1 << bits | |
zo = [], [] | |
for item in items: | |
# we are literally examining the bits | |
next(check_bit) | |
zo[bool(item & mask)].append(item) | |
return tuple( | |
(construct_trie(zo_i, bits-1) if zo_i and bits else zo_i) for zo_i in zo) | |
def _traverse_trie(output: list, this_part: tuple, bits: int) -> list: | |
if isinstance(this_part, list): | |
return this_part | |
next(traverse) | |
this_part = tuple( | |
(_traverse_trie(output, part, bits - 1) if part else part) for part in this_part | |
) | |
return find_difference_merge(output, this_part[0], this_part[1], bits) | |
def find_difference_caller(items: list, bits: int, metrics=True) -> list: | |
""" | |
Args: | |
items - list of items to compare for 1-bit differences | |
bits - number of bits to compare for differences | |
Returns: | |
[(low, high), ...] for all pairs low and high from items differing in 1 bit. | |
""" | |
if metrics: | |
names = 'check_bit masks subtractions control_flow traverse'.split() | |
gl = globals() | |
for name in names: | |
gl[name] = count() | |
trie = construct_trie(items, bits-1) | |
output = [] | |
_traverse_trie(output, trie, bits-1) | |
if metrics: | |
for name in names: | |
print(name, "per bit", (next(gl[name]) - 1) / len(items) / bits) | |
return output | |
def one_off_seq(count: int, bits: int, step: int=1) -> list: | |
""" | |
Args: | |
count - number of items to return | |
bits - number of bits in each item | |
step - how many bits to step (will check every bit, otherwise) | |
Returns: | |
list of items with as many entries having <bits> 1-bit differences as | |
possible. | |
""" | |
out = some_random_vals(1, bits) | |
known = set(out) | |
i = 0 | |
while len(out) < count: | |
it = out[i] | |
for j in range(0, bits, step): | |
iit = it ^ (1 << ((j + i) % bits)) | |
if iit not in known: | |
known.add(iit) | |
out.append(iit) | |
i += 1 | |
return out | |
def some_random_vals(count: int, bits: int) -> list: | |
""" | |
Args: | |
count - number of items wanted | |
bits - number of bits in each item | |
Returns: | |
List of <count> unique items generated from os.urandom() of <bits> length. | |
""" | |
mask = (2**bits)-1 | |
assert count <= (mask>>1) | |
read = (7 + bits) >> 3 | |
out = set() | |
while len(out) < count: | |
it = int.from_bytes(os.urandom(read), 'big') & mask | |
out.add(it) | |
return list(out) | |
if __name__ == "__main__": | |
bits = 24 | |
for cnt in (512, 4096, 65536): | |
# average case | |
items = some_random_vals(cnt, bits) | |
print("random", len(find_difference_caller(items, bits, True))) | |
# best case | |
# adjacent items are super easy to process, and have a lot of 1-bit | |
# differences | |
items = list(range(cnt)) | |
print("sequential", len(find_difference_caller(items, bits, True))) | |
# these actually have fewer 1-off differences than sequential items, which | |
# is due to our latter construction of items that are 1 bit different from | |
# earlier items, but the earliest items have 1-bit differences from <bits> | |
# items | |
items = one_off_seq(cnt, bits) | |
print("explicit", len(find_difference_caller(items, bits, True))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment