Skip to content

Instantly share code, notes, and snippets.

@lelandbatey
Created September 29, 2025 21:00
Show Gist options
  • Save lelandbatey/d09557fed38c48a797bf1b15cb844b75 to your computer and use it in GitHub Desktop.
Save lelandbatey/d09557fed38c48a797bf1b15cb844b75 to your computer and use it in GitHub Desktop.
A demonstration of the concepts ruling SQLite indexs (and many DB indeces implementations)
# in response to this conversation: https://news.ycombinator.com/item?id=45417191
from pprint import pprint
# group_by here is ultimately used to create a tree structure (nested dictionaries) of the form
# keys[0] -> keys[1] -> ... -> keys[n] -> getid(object). We use it here to construct a tree data
# structure that's conceptually equal to a database index (not exactly the same, but shows the same
# properties we care about).
def group_by(objects, keys, getid=None):
if getid is None:
getid = lambda x: x['id']
result = {}
for obj in objects:
current = result
for i, key in enumerate(keys):
value = obj.get(key)
if i == len(keys) - 1:
if value not in current:
current[value] = []
current[value].append(getid(obj))
else:
if value not in current:
current[value] = {}
current = current[value]
return result
things = [
{"date": 10, "color": "red", "id": 101},
{"date": 11, "color": "blue", "id": 111},
{"date": 12, "color": "green", "id": 121},
{"date": 13, "color": "red", "id": 131},
{"date": 14, "color": "blue", "id": 141},
{"date": 14, "color": "red", "id": 142},
{"date": 15, "color": "yellow", "id": 151},
{"date": 16, "color": "blue" , "id": 161},
{"date": 17, "color": "green", "id": 171},
{"date": 18, "color": "blue", "id": 181},
{"date": 19, "color": "red", "id": 191},
{"date": 20, "color": "green", "id": 201},
]
indexed_by_id = dict()
for thing in things:
t_id = thing['id']
indexed_by_id[t_id] = thing
# compound index going date -> color
by_date_then_color = group_by(things, ['date', 'color'])
pprint(by_date_then_color, sort_dicts=True)
# This is what the resulting date->color index looks like
#
# {10: {'red': [101]},
# 11: {'blue': [111]},
# 12: {'green': [121]},
# 13: {'red': [131]},
# 14: {'blue': [141], 'red': [142]},
# 15: {'yellow':[151]},
# 16: {'blue': [161]},
# 17: {'green': [171]},
# 18: {'blue': [181]},
# 19: {'red': [191]},
# 20: {'green': [201]}}
#
# Notice that it's not a very efficient index because it doesn't group things together a lot;
# there's not a lot of common ground among the 'date' fields of things in our data set, making
# 'date' a high cardinality column
# Rules for following the index:
# 1. Left to right
# 2. No skipping
# 3. Stops at the first range
# select thing_date, thing_color where thing_color = 'blue' and thing_date < 17
# Since the index is date -> color -> thing, we have to check 'thing_date < 17' first, then we have
# to stop and walk through that set and individually check for 'color = blue', with this sequential
# checking being a small SCAN operation, but it's a scan of an index, not a SCAN of the DB. This is
# faster than scanning the entire DB, but it is still a small scan.
dates_lt_17 = [] #
for date in by_date_then_color.keys(): # WHERE thing_date < 17
if date < 17: #
dates_lt_17.append(date) #
# now we have to scan this result and find the thing matching the data
results = list() #
for knowndate in dates_lt_17: #
by_colors = by_date_then_color[knowndate] # AND thing_color = 'blue'
if 'blue' in by_colors: #
for thing_id in by_colors['blue']: #
results.append(thing_id) #
pprint(results)
# Now that we've seen we needed to do a double-scan to find the answer, could we change our index to
# improve our performance? YES we can! If we flip the order of the columns in our index so they go
# color->date, we can scan fewer rows!
by_color_then_date = group_by(things, ['color', 'date'])
pprint(by_color_then_date, sort_dicts=True)
# This new index looks like the following:
#
# {'blue': {11: [111],
# 14: [141],
# 16: [161],
# 18: [181]},
# 'green': {12: [121],
# 17: [171],
# 20: [201]},
# 'red': {10: [101],
# 13: [131],
# 14: [142],
# 19: [191]},
# 'yellow': {15: [151]}}
# Now we check the 'color = blue' first, and then we check the 'date < 17'
results = list()
blues = by_color_then_date['blue'] # This isn't a SCAN, it's one direct access!
# Now we have to SCAN and filter the blue things by 'date < 17'
for date in blues.keys(): #
if date < 17: # We stop and scan because this is a range:
for thing_id in blues[date]: # WHERE thing_date < 17
results.append(thing_id) #
pprint(results)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment