Last active
October 23, 2024 18:59
-
-
Save rwat/83a5e8a8a77908e6f7ef9626fbde6108 to your computer and use it in GitHub Desktop.
A geographic bounding box search PoC that uses a disk-backed sparse array.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime | |
import logging | |
import random | |
import lmdb | |
import msgpack | |
WEST = 0 | |
SOUTH = 1 | |
EAST = 2 | |
NORTH = 3 | |
GRID_SIZE = 0.1 # in degrees latitude and longitude | |
DB_NAME = 'sample_db' | |
MAP_SIZE = 1024 * 1024 * 1024 * 10 # 10 GB | |
kv_options = { | |
'buffers': True, | |
'map_async': True, | |
'meminit': False, | |
'metasync': False, | |
'sync': False, | |
'writemap': True, | |
} | |
logger = logging.getLogger('') | |
logger.setLevel(logging.DEBUG) | |
console = logging.StreamHandler() | |
console.setLevel(logging.DEBUG) | |
logging.getLogger('').addHandler(console) | |
def kv_put(obj, k, v): | |
obj.put(msgpack.packb(k), msgpack.packb(v)) | |
def kv_get(obj, k): | |
v = obj.get(msgpack.packb(k)) | |
if v: | |
return msgpack.unpackb(v) | |
else: | |
return None | |
def grid_to_key(x, y): | |
return f'grid:{x}:{y}' | |
def point_bbox_intersection(curs, point): | |
lon, lat = point | |
lon_x = int(lon / GRID_SIZE) | |
lat_y = int(lat / GRID_SIZE) | |
if grid_value := kv_get(curs, grid_to_key(lon_x, lat_y)): | |
results = [] | |
for entry_key in grid_value: | |
entry_value = kv_get(curs, entry_key) | |
if (lon <= entry_value['bbox'][EAST] and lon >= entry_value['bbox'][WEST] and | |
lon <= entry_value['bbox'][NORTH] and lat >= entry_value['bbox'][SOUTH]): | |
results.append(entry_key) | |
return results if results else None | |
else: | |
return None | |
def benchmark(points): | |
logger.info('Benchmark: start') | |
kv = lmdb.open(DB_NAME, map_size=MAP_SIZE) | |
with kv.begin(buffers=True) as txn: | |
with txn.cursor() as curs: | |
start_time = datetime.datetime.now() | |
for i, point in enumerate(points): | |
result = point_bbox_intersection(curs, point) | |
# NOTE: the 'result' contains the source data indices for points that intersect the | |
# bounding box of source items, so you could retrieve the matching source data with | |
# kv_get(curs, result[i]) where 0 < i < len(result) | |
# Things like polygon lookup and polygon intersection could be done from there. | |
logger.info(f'Benchmark: done in {datetime.datetime.now() - start_time} seconds') | |
def create_user_input(num_points): | |
points = [] | |
logger.info('Creating fake user data') | |
for _ in range(num_points): | |
lon = (random.random() * 180) - 90 | |
lat = (random.random() * 360) - 180 | |
points.append((lon, lat)) | |
return points | |
def generate_random_bbox(): | |
'''Create a bounding box up to 0.5 degrees in size''' | |
north = (random.random() * 178) - 89 # latitude range of -89 to 89 degrees | |
east = (random.random() * 358) - 179 # longitude range of -179 to 179 | |
south = north - (random.random() * 0.5) | |
west = east - (random.random() * 0.5) | |
# return the bounding box in CMR search order of ll, ur in (lon, lat) order | |
return (west, south, east, north) | |
def initialize_fake_data(num_entries): | |
logger.info('Placing generated entries in database.') | |
sparse_map_actions = 0 | |
kv = lmdb.open(DB_NAME, map_size=MAP_SIZE) | |
with kv.begin(write=True, buffers=True) as txn: | |
with txn.cursor() as curs: | |
for entry_key in range(num_entries): | |
# keyspace for source data entries are integers | |
bbox = generate_random_bbox() | |
fake_description = 'String data, maybe include a path pointing to original data.' | |
entry = { | |
'bbox': bbox, | |
'description': fake_description | |
# Could optionally have the whole polygon in here too and test intersection | |
# with shapely. | |
} | |
kv_put(curs, entry_key, entry) | |
# keyspace for sparse grid is 'grid:x:y' where 'x' is longitude and 'y' is latitude | |
min_x = int(bbox[WEST] / GRID_SIZE) | |
max_x = int(bbox[EAST] / GRID_SIZE) | |
min_y = int(bbox[SOUTH] / GRID_SIZE) | |
max_y = int(bbox[NORTH] / GRID_SIZE) | |
for x in range(min_x, max_x + 1): | |
for y in range(min_y, max_y + 1): | |
grid_key = grid_to_key(x, y) | |
if v := kv_get(curs, grid_key): | |
kv_put(curs, grid_key, v.append(entry_key)) | |
else: | |
kv_put(curs, grid_key, [entry_key]) | |
sparse_map_actions += 2 | |
# log every one million items | |
if entry_key % int(1e5) == 0: | |
logger.info(f'{entry_key} items entered into the database') | |
logger.info(f'{num_entries} entries and {sparse_map_actions} sparse array items entered into' | |
f'the database') | |
if __name__ == '__main__': | |
num_entries = int(1e6 * 16) # sixteen million entries | |
num_points = 1000 # one thousand points | |
# run once and then disable | |
initialize_fake_data(num_entries) | |
# generate points for fake user input | |
points = create_user_input(num_points) | |
# benchmark bounding box tests across fake input | |
benchmark(points) | |
# one example run (will vary due to randomized test data): | |
# 16000000 entries and 391672460 sparse array items entered into the database | |
# database size: 2.4GB | |
# ~0.2 seconds to search on 10000 points (warm cache) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment