Last active
August 23, 2016 03:40
-
-
Save kanzure/2fa531afaf03fddd6568eb0212ac8c4c to your computer and use it in GitHub Desktop.
Find the most recent common block between two different, partially synchronized blockchain data stores. Attempt to do so without calling the _batch RPC call for 500,000 getblockhash requests. See https://botbot.me/freenode/bitcoin-core-dev/2016-04-28/?msg=65077020&page=3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Toy demo for finding the most recent common block between a database and a | |
bitcoind instance. The goal is to correctly handle reorgs and long periods of | |
downtime if any. The previous implementation used getblockhashes and the _batch | |
RPC call to get 300,000 blockhashes at a time. Unfortunately bitcoind is not | |
really made to handle requests of that size. Instead, here is a more optimized | |
implementation that should take less time and a minimal number of RPC requests. | |
The output of this is fed into a planner which generates a plan for processing | |
the blocks between a current blockhash and the last most recent common block. | |
""" | |
import hashlib | |
import unittest | |
import math | |
from copy import deepcopy | |
# ---- utility functions ---- | |
def factors(n): | |
step = 2 if n%2 else 1 | |
return set( | |
reduce( | |
list.__add__, | |
([i, n//i] for i in range(1, int(math.sqrt(n))+1, step) if n % i == 0), | |
) | |
) | |
def sha256(value): | |
return hashlib.sha256(value).hexdigest() | |
def certainlynotsha256(value): | |
return value | |
def generate_many_hashes(hashcount, exclude_first_n=0): | |
""" | |
Make a list of hashes with hashcount-many items. Use exclude_first_n to | |
skip those hashes while still counting them towards the total despite their | |
lack of appearance. | |
""" | |
return [certainlynotsha256(str(counter)) for counter in xrange(exclude_first_n, hashcount)] | |
def generate_hash_dictionary(hashcount, exclude_first_n=0): | |
""" | |
Make a dictionary of shape {counter: hash}. | |
""" | |
some_dict = {} | |
hashes = generate_many_hashes(hashcount, exclude_first_n=exclude_first_n) | |
for (counter, some_hash) in enumerate(hashes): | |
some_dict[counter + exclude_first_n] = some_hash | |
# NOTE: This demo assumes that some_dict.keys() is sorted by the | |
# construction given above. | |
return some_dict | |
# ---- classes ---- | |
class BlockchainStorage(object): | |
def __init__(self, hashdict={}): | |
self.hashdict = hashdict | |
self.hashes = hashdict.values() or [] | |
@property | |
def height(self): | |
return len(self.hashes) | |
class Database(BlockchainStorage): | |
pass | |
class Bitcoind(BlockchainStorage): | |
pass | |
# ---- exceptions ---- | |
class SearchException(Exception): | |
""" | |
General exception for the search algorithm. | |
""" | |
pass | |
class NoDataException(SearchException): | |
""" | |
No data provided. | |
""" | |
pass | |
# ---- search ---- | |
def _initial_checks(lagstore, truthstore): | |
""" | |
Check the lagstore and truthstore for some basic details. These checks | |
should not be performed on each iteration of the search function. | |
""" | |
# TODO: Modify this function to work with a given "current_blockhash". That | |
# is, find the most recent common block given "current_blockhash". Both | |
# data stores might have more hashes than "current_blockhash". | |
# The truthstore must be populated. | |
if len(truthstore.hashes) == 0: | |
raise NoDataException("truthstore has no data") | |
# When there are no hashes in the lagstore, then we know that there are no | |
# matching recent blocks. | |
if len(lagstore.hashes) == 0: | |
return None | |
# When the first hash in the lagstore doesn't match anything in the | |
# truthstore, then we know that the truthstore is not done synchronizing or | |
# the lagstore truly doesn't have anything that matches. | |
if lagstore.hashdict[lagstore.hashdict.keys()[0]] not in truthstore.hashdict.values(): | |
return None | |
# When the latest blocks between the two storages are matching, return that | |
# matching blockhash. | |
if truthstore.hashdict[truthstore.hashdict.keys()[-1]] == lagstore.hashdict[lagstore.hashdict.keys()[-1]]: | |
return (truthstore.hashdict.keys()[-1], truthstore.hashdict[truthstore.hashdict.keys()[-1]]) | |
# Both stores have a "block height" associated with each block. If the two | |
# stores have a different "height" (or rather, if each of their most recent | |
# blocks have a different "height"), then pick the lower height value to be | |
# the number from which to start the search from. | |
reversing_offset = 1 | |
most_recent_lagstore_block_height = lagstore.hashdict.keys()[0 - reversing_offset] | |
most_recent_truthstore_block_height = truthstore.hashdict.keys()[0 - reversing_offset] | |
block_height_search_start = min(most_recent_lagstore_block_height, most_recent_truthstore_block_height) | |
# Do both stores have the same hash at that height? By definition at least | |
# one of the two stores does not have any data beyond this point, therefore | |
# this would be the most recent common block. | |
if lagstore.hashdict[block_height_search_start] == truthstore.hashdict[block_height_search_start]: | |
return (block_height_search_start, truthstore.hashdict[block_height_search_start]) | |
# Next, find the oldest common block between the two data stores. This by | |
# definition should be the oldest block in the lagstore. If the lagstore's | |
# initial block is not in the truthstore, then the function would have | |
# exited above and returned None. | |
oldest_common_block_height = lagstore.hashdict.keys()[0] | |
return ("OK", | |
{ | |
"block_height_search_start": block_height_search_start, | |
"oldest_common_block_height": oldest_common_block_height, | |
}, | |
) | |
def find_most_recent_common_block(lagstore, truthstore, skip_initial_checks=False, block_height_search_start=None, oldest_common_block_height=None, depth=0, verbose_debug=True): | |
""" | |
Find the most recent common block shared by both storage devices. The | |
second storage is assumed to be the source of truth. | |
When the lagstore is empty, this function returns None to indicate that | |
there are no matching blocks between the two data storage devices. | |
""" | |
# TODO: implement something about "block processor minimum height locking" | |
# here? | |
# some trivial parameter validation checking | |
if skip_initial_checks == True: | |
if block_height_search_start == None: | |
raise Exception("invalid value for block_height_search_start") | |
elif oldest_common_block_height == None: | |
raise Exception("invalid value for oldest_common_block_height") | |
# perform initial checks if allowed by flag | |
if not skip_initial_checks: | |
result = _initial_checks(lagstore, truthstore) | |
if type(result) != tuple or (type(result) == tuple and result[0] != "OK"): | |
return result | |
block_height_search_start = result[1]["block_height_search_start"] | |
oldest_common_block_height = result[1]["oldest_common_block_height"] | |
# next iteration, skip those checks | |
skip_initial_checks = True | |
potential_blocks = block_height_search_start - oldest_common_block_height | |
# When there's only 100 blocks (or less), give up on the search algorithm | |
# and switch to using getblockhashes RPC calls. | |
if potential_blocks <= 100: | |
if verbose_debug: | |
print( | |
"-- Manually check {} blockhashes between {oldest_common_block_height} and {block_height_search_start}".format( | |
potential_blocks, | |
oldest_common_block_height=oldest_common_block_height, | |
block_height_search_start=block_height_search_start, | |
) | |
) | |
# Manually fetch all the hashes of interest. | |
target_range = range(oldest_common_block_height, block_height_search_start + 1) | |
lagstorehashes = {index: lagstore.hashdict[index] for index in target_range} | |
truthstorehashes = {index: truthstore.hashdict[index] for index in target_range} | |
# find the most recent common hash | |
for index in reversed(truthstorehashes.keys()): | |
lagstorehash = lagstore.hashdict[index] | |
truthstorehash = truthstore.hashdict[index] | |
if truthstorehash == lagstorehash: | |
break | |
else: | |
raise Exception("didn't find a matching hash; this should be impossible in this spot...") | |
return (index, truthstorehash) | |
# More than 100 blocks to go. Continue using the search algorithm. | |
else: | |
# how many blocks to sample in the range at a time? | |
num_points = 10 | |
# Split the remaining potential blocks into different groups to check. | |
# This is not quite a binary search. We can request multiple blocks at | |
# a time from the truthstore. | |
points = range( | |
oldest_common_block_height, | |
oldest_common_block_height + potential_blocks, | |
potential_blocks // num_points, | |
) | |
points.append(oldest_common_block_height + potential_blocks) | |
# Use sort because the initial element in the "points" array is the | |
# highest value, and it should be placed at the end of the array. | |
# Use reversed because it's better to learn about commonality near the | |
# end of the blockchains rather than at the beginning. | |
points = list(reversed(sorted(points))) | |
# TODO: what should the initial value of last_point be? | |
last_point = None | |
# Request all of these points from the truthstore at the same time | |
# using a single RPC request. | |
truthstore_hash_values = {} | |
for point in points: | |
truthstore_hash_values[point] = truthstore.hashdict[point] | |
if truthstore_hash_values[point] in lagstore.hashdict.values(): | |
break | |
last_point = point | |
else: | |
raise Exception("no matching values found.. this should not happen? (double check)") | |
if verbose_debug: | |
print("points are: {}".format(points)) | |
print("point is: {}".format(point)) | |
print("last_point is: {}".format(last_point)) | |
print("block_height_search_start: {}".format(block_height_search_start)) | |
print("oldest_common_block_height: {}".format(oldest_common_block_height)) | |
print("potential_blocks: {}".format(potential_blocks)) | |
print("depth: {}".format(depth)) | |
# this is here to prevent a max recursion depth error, although | |
# this should not be required if the implementation is working | |
# correctly. | |
if depth > 4: | |
raise Exception("recursion depth exceeded, depth is {}".format(depth)) | |
return find_most_recent_common_block( | |
lagstore, | |
truthstore, | |
skip_initial_checks=skip_initial_checks, | |
block_height_search_start=last_point, | |
oldest_common_block_height=point, | |
depth=depth+1, | |
debug=debug, | |
) | |
# ---- test setup functions ---- | |
def make_shared_stores(hashcount): | |
""" | |
Make two separate data stores with hashcount-many hashes. | |
""" | |
hashdict = generate_hash_dictionary(hashcount=hashcount) | |
allhashes = hashdict.values() | |
database = Database(hashdict=deepcopy(hashdict)) | |
bitcoind = Bitcoind(hashdict=deepcopy(hashdict)) | |
return (allhashes, hashdict, database, bitcoind) | |
# ---- tests ---- | |
class SomeTestCase(unittest.TestCase): | |
def test_find_most_recent_common_block__no_truthstore_data(self): | |
hashcount = 10 | |
hashdict = generate_hash_dictionary(hashcount=hashcount) | |
database = Database(hashdict=hashdict) | |
bitcoind = Bitcoind(hashdict={}) | |
with self.assertRaises(NoDataException): | |
find_most_recent_common_block(lagstore=database, truthstore=bitcoind) | |
def test_find_most_recent_common_block__empty_lagstore(self): | |
hashcount = 10 | |
hashdict = generate_hash_dictionary(hashcount=hashcount) | |
database = Database(hashdict={}) | |
bitcoind = Bitcoind(hashdict=hashdict) | |
mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind) | |
# There should be no "most recent common block" between the two stores. | |
self.assertEqual(mrcb, None) | |
def test_find_most_recent_common_block__shares_last_block(self): | |
hashcount = 10 | |
(allhashes, hashdict, database, bitcoind) = make_shared_stores(hashcount) | |
# execute | |
mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind) | |
self.assertNotEqual(mrcb, None) | |
self.assertEqual(mrcb, (hashcount - 1, allhashes[-1])) | |
def test_find_most_recent_common_block__mismatched_heights_but_same_hash_at_mismatch(self): | |
hashcount = 10 | |
(allhashes, hashdict, database, bitcoind) = make_shared_stores(hashcount) | |
# make a size mismatch | |
del database.hashdict[hashcount - 1] | |
self.assertNotEqual(database.hashdict.values(), bitcoind.hashdict.values()) | |
mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind) | |
# hashcount - 2 because index 8 is the first matching element | |
self.assertEqual(mrcb, (hashcount - 2, allhashes[-2])) | |
def test_find_most_recent_common_block__nothing_in_common_by_lagstore_first_blockhash(self): | |
hashcount = 10 | |
(allhashes, hashdict, database, bitcoind) = make_shared_stores(hashcount) | |
database.hashdict[0] = "xyz" | |
# The remainder of the hashes in the database don't matter. In fact, it | |
# would be wrong for the database to keep its original later hashes | |
# after that 0's hash value, because those hashes would be wrong and | |
# impossible (a blockchain is a hashchain, after all). | |
mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind) | |
self.assertEqual(mrcb, None) | |
def test_find_most_recent_common_block__two_missing(self): | |
hashcount = 20 | |
(allhashes, hashdict, database, bitcoind) = make_shared_stores(hashcount) | |
del database.hashdict[database.hashdict.keys()[-1]] | |
del database.hashdict[database.hashdict.keys()[-1]] | |
mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind) | |
self.assertEqual(mrcb, (database.hashdict.keys()[hashcount - 3], database.hashdict[database.hashdict.keys()[hashcount - 3]])) | |
def test_find_most_recent_common_block__two_different(self): | |
hashcount = 20 | |
(allhashes, hashdict, database, bitcoind) = make_shared_stores(hashcount) | |
del database.hashdict[database.hashdict.keys()[-1]] | |
del database.hashdict[database.hashdict.keys()[-1]] | |
database.hashdict[18] = "xyz" | |
database.hashdict[19] = "abc" | |
mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind) | |
self.assertEqual(mrcb, (database.hashdict.keys()[hashcount - 3], database.hashdict[database.hashdict.keys()[hashcount - 3]])) | |
# This tests at least one level of recursion of the search function. | |
def test_find_most_recent_common_block__lotta_blocks_to_check(self): | |
hashcount = 400 | |
hashdict = generate_hash_dictionary(hashcount=hashcount) | |
bitcoind = Bitcoind(hashdict=deepcopy(hashdict)) | |
other_total_hashcount = 400 | |
other_different_hashcount = 200 | |
other_same_hashcount = 200 | |
other_hashdict = generate_hash_dictionary(hashcount=other_same_hashcount) | |
for x in range(other_same_hashcount, other_total_hashcount + 1): | |
other_hashdict[x] = certainlynotsha256("other {}".format(x)) | |
database = Database(hashdict=other_hashdict) | |
mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind) | |
expected_height = (hashcount / 2) - 1 | |
expected_hash = hashdict[expected_height] | |
self.assertEqual(mrcb, (expected_height, expected_hash)) | |
def test_find_most_recent_common_block__many_many_differet(self): | |
hashcount = 25000 | |
hashdict = generate_hash_dictionary(hashcount=hashcount) | |
bitcoind = Bitcoind(hashdict=deepcopy(hashdict)) | |
other_total_hashcount = hashcount + 1800 | |
other_different_hashcount = other_total_hashcount - hashcount | |
other_same_hashcount = hashcount / 2 | |
other_hashdict = generate_hash_dictionary(hashcount=other_same_hashcount) | |
for x in range(other_same_hashcount, other_total_hashcount + 1): | |
other_hashdict[x] = certainlynotsha256("other {}".format(x)) | |
database = Database(hashdict=other_hashdict) | |
mrcb = find_most_recent_common_block(lagstore=database, truthstore=bitcoind) | |
expected_height = (hashcount / 2) - 1 | |
expected_hash = hashdict[expected_height] | |
self.assertEqual(mrcb, (expected_height, expected_hash)) | |
if __name__ == "__main__": | |
unittest.main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment