@rwat
Last active October 23, 2024 18:59
A geographic bounding box search PoC that uses a disk-backed sparse array.
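The index works by registering every entry's bounding box in each 0.1-degree grid cell it overlaps, with a cell's occupants stored under an LMDB key of the form 'grid:x:y'; a point lookup then only has to test the handful of entries listed under its own cell key. A minimal sketch of the cell mapping used throughout the script (the coordinates here are made up for illustration):

GRID_SIZE = 0.1
lon, lat = -105.27, 40.01  # hypothetical query point
cell_key = f'grid:{int(lon / GRID_SIZE)}:{int(lat / GRID_SIZE)}'
print(cell_key)  # prints 'grid:-1052:400'
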
import datetime
import logging
import random

import lmdb
import msgpack

# indices into a (west, south, east, north) bounding box tuple
WEST = 0
SOUTH = 1
EAST = 2
NORTH = 3

GRID_SIZE = 0.1  # in degrees latitude and longitude
DB_NAME = 'sample_db'
MAP_SIZE = 1024 * 1024 * 1024 * 10  # 10 GB

# LMDB environment options that trade durability for write speed; passed to
# lmdb.open() when bulk-loading the fake data below
kv_options = {
    'map_async': True,
    'meminit': False,
    'metasync': False,
    'sync': False,
    'writemap': True,
}

logger = logging.getLogger('')
logger.setLevel(logging.DEBUG)
console = logging.StreamHandler()
console.setLevel(logging.DEBUG)
logger.addHandler(console)


def kv_put(obj, k, v):
    '''Store a msgpack-serialized value under a msgpack-serialized key.'''
    obj.put(msgpack.packb(k), msgpack.packb(v))


def kv_get(obj, k):
    '''Return the deserialized value stored under key k, or None if absent.'''
    v = obj.get(msgpack.packb(k))
    if v:
        return msgpack.unpackb(v)
    else:
        return None


def grid_to_key(x, y):
    '''Build the sparse-grid key for grid cell (x, y).'''
    return f'grid:{x}:{y}'


def point_bbox_intersection(curs, point):
    '''Return the entry keys whose bounding boxes contain the point, or None.'''
    lon, lat = point
    lon_x = int(lon / GRID_SIZE)
    lat_y = int(lat / GRID_SIZE)
    if grid_value := kv_get(curs, grid_to_key(lon_x, lat_y)):
        results = []
        for entry_key in grid_value:
            entry_value = kv_get(curs, entry_key)
            if (entry_value['bbox'][WEST] <= lon <= entry_value['bbox'][EAST] and
                    entry_value['bbox'][SOUTH] <= lat <= entry_value['bbox'][NORTH]):
                results.append(entry_key)
        return results if results else None
    else:
        return None


def benchmark(points):
    logger.info('Benchmark: start')
    kv = lmdb.open(DB_NAME, map_size=MAP_SIZE)
    with kv.begin(buffers=True) as txn:
        with txn.cursor() as curs:
            start_time = datetime.datetime.now()
            for point in points:
                result = point_bbox_intersection(curs, point)
                # NOTE: 'result' contains the source data keys for entries whose bounding
                # boxes contain the point, so the matching source data could be retrieved
                # with kv_get(curs, result[i]) where 0 <= i < len(result).
                # Things like polygon lookup and polygon intersection could be done from there.
    elapsed = (datetime.datetime.now() - start_time).total_seconds()
    logger.info(f'Benchmark: done in {elapsed:.3f} seconds')


def create_user_input(num_points):
    points = []
    logger.info('Creating fake user data')
    for _ in range(num_points):
        lon = (random.random() * 360) - 180  # longitude range of -180 to 180 degrees
        lat = (random.random() * 180) - 90   # latitude range of -90 to 90 degrees
        points.append((lon, lat))
    return points


def generate_random_bbox():
    '''Create a bounding box up to 0.5 degrees in size'''
    north = (random.random() * 178) - 89   # latitude range of -89 to 89 degrees
    east = (random.random() * 358) - 179   # longitude range of -179 to 179 degrees
    south = north - (random.random() * 0.5)
    west = east - (random.random() * 0.5)
    # return the bounding box in CMR search order of ll, ur in (lon, lat) order
    return (west, south, east, north)


def initialize_fake_data(num_entries):
    logger.info('Placing generated entries in database.')
    sparse_map_actions = 0
    # kv_options relax durability guarantees to speed up this bulk load
    kv = lmdb.open(DB_NAME, map_size=MAP_SIZE, **kv_options)
    with kv.begin(write=True, buffers=True) as txn:
        with txn.cursor() as curs:
            for entry_key in range(num_entries):
                # keys for source data entries are integers
                bbox = generate_random_bbox()
                fake_description = 'String data, maybe include a path pointing to original data.'
                entry = {
                    'bbox': bbox,
                    'description': fake_description,
                    # Could optionally have the whole polygon in here too and test intersection
                    # with shapely.
                }
                kv_put(curs, entry_key, entry)
                # keyspace for the sparse grid is 'grid:x:y' where 'x' is longitude and 'y' is
                # latitude; register this entry in every grid cell its bounding box overlaps
                min_x = int(bbox[WEST] / GRID_SIZE)
                max_x = int(bbox[EAST] / GRID_SIZE)
                min_y = int(bbox[SOUTH] / GRID_SIZE)
                max_y = int(bbox[NORTH] / GRID_SIZE)
                for x in range(min_x, max_x + 1):
                    for y in range(min_y, max_y + 1):
                        grid_key = grid_to_key(x, y)
                        if v := kv_get(curs, grid_key):
                            v.append(entry_key)
                            kv_put(curs, grid_key, v)
                        else:
                            kv_put(curs, grid_key, [entry_key])
                        sparse_map_actions += 2  # one get and one put per grid cell
                # log progress every 100,000 entries
                if entry_key % int(1e5) == 0:
                    logger.info(f'{entry_key} items entered into the database')
    logger.info(f'{num_entries} entries and {sparse_map_actions} sparse array items entered into '
                f'the database')


if __name__ == '__main__':
    num_entries = int(1e6 * 16)  # sixteen million entries
    num_points = 1000  # one thousand points

    # run once and then disable
    initialize_fake_data(num_entries)

    # generate points for fake user input
    points = create_user_input(num_points)

    # benchmark bounding box tests across fake input
    benchmark(points)

    # one example run (will vary due to randomized test data):
    # 16000000 entries and 391672460 sparse array items entered into the database
    # database size: 2.4GB
    # ~0.2 seconds to search on 10000 points (warm cache)
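
As the in-code comments note, the keys returned by point_bbox_intersection can be used to fetch the matching source entries and run a finer geometry test, such as the shapely polygon intersection mentioned above. A rough sketch of that follow-up, assuming shapely is installed; fetch_matches is a hypothetical helper (not part of the PoC) that reuses kv_get and point_bbox_intersection from the script, and it re-tests against the stored bbox since this PoC does not store full polygons:

from shapely.geometry import Point, box

def fetch_matches(curs, point):
    '''Fetch every stored entry whose bounding box contains the point.'''
    lon, lat = point
    keys = point_bbox_intersection(curs, point) or []
    matches = []
    for key in keys:
        entry = kv_get(curs, key)
        west, south, east, north = entry['bbox']
        # box(minx, miny, maxx, maxy) builds the bbox polygon; a stored full polygon
        # could be substituted here for a true polygon-intersection test
        if box(west, south, east, north).intersects(Point(lon, lat)):
            matches.append(entry)
    return matches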