Created
September 29, 2025 21:00
-
-
Save lelandbatey/d09557fed38c48a797bf1b15cb844b75 to your computer and use it in GitHub Desktop.
A demonstration of the concepts ruling SQLite indexs (and many DB indeces implementations)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# in response to this conversation: https://news.ycombinator.com/item?id=45417191 | |
from pprint import pprint | |
# group_by here is ultimately used to create a tree structure (nested dictionaries) of the form | |
# keys[0] -> keys[1] -> ... -> keys[n] -> getid(object). We use it here to construct a tree data | |
# structure that's conceptually equal to a database index (not exactly the same, but shows the same | |
# properties we care about). | |
def group_by(objects, keys, getid=None): | |
if getid is None: | |
getid = lambda x: x['id'] | |
result = {} | |
for obj in objects: | |
current = result | |
for i, key in enumerate(keys): | |
value = obj.get(key) | |
if i == len(keys) - 1: | |
if value not in current: | |
current[value] = [] | |
current[value].append(getid(obj)) | |
else: | |
if value not in current: | |
current[value] = {} | |
current = current[value] | |
return result | |
things = [ | |
{"date": 10, "color": "red", "id": 101}, | |
{"date": 11, "color": "blue", "id": 111}, | |
{"date": 12, "color": "green", "id": 121}, | |
{"date": 13, "color": "red", "id": 131}, | |
{"date": 14, "color": "blue", "id": 141}, | |
{"date": 14, "color": "red", "id": 142}, | |
{"date": 15, "color": "yellow", "id": 151}, | |
{"date": 16, "color": "blue" , "id": 161}, | |
{"date": 17, "color": "green", "id": 171}, | |
{"date": 18, "color": "blue", "id": 181}, | |
{"date": 19, "color": "red", "id": 191}, | |
{"date": 20, "color": "green", "id": 201}, | |
] | |
indexed_by_id = dict() | |
for thing in things: | |
t_id = thing['id'] | |
indexed_by_id[t_id] = thing | |
# compound index going date -> color | |
by_date_then_color = group_by(things, ['date', 'color']) | |
pprint(by_date_then_color, sort_dicts=True) | |
# This is what the resulting date->color index looks like | |
# | |
# {10: {'red': [101]}, | |
# 11: {'blue': [111]}, | |
# 12: {'green': [121]}, | |
# 13: {'red': [131]}, | |
# 14: {'blue': [141], 'red': [142]}, | |
# 15: {'yellow':[151]}, | |
# 16: {'blue': [161]}, | |
# 17: {'green': [171]}, | |
# 18: {'blue': [181]}, | |
# 19: {'red': [191]}, | |
# 20: {'green': [201]}} | |
# | |
# Notice that it's not a very efficient index because it doesn't group things together a lot; | |
# there's not a lot of common ground among the 'date' fields of things in our data set, making | |
# 'date' a high cardinality column | |
# Rules for following the index: | |
# 1. Left to right | |
# 2. No skipping | |
# 3. Stops at the first range | |
# select thing_date, thing_color where thing_color = 'blue' and thing_date < 17 | |
# Since the index is date -> color -> thing, we have to check 'thing_date < 17' first, then we have | |
# to stop and walk through that set and individually check for 'color = blue', with this sequential | |
# checking being a small SCAN operation, but it's a scan of an index, not a SCAN of the DB. This is | |
# faster than scanning the entire DB, but it is still a small scan. | |
dates_lt_17 = [] # | |
for date in by_date_then_color.keys(): # WHERE thing_date < 17 | |
if date < 17: # | |
dates_lt_17.append(date) # | |
# now we have to scan this result and find the thing matching the data | |
results = list() # | |
for knowndate in dates_lt_17: # | |
by_colors = by_date_then_color[knowndate] # AND thing_color = 'blue' | |
if 'blue' in by_colors: # | |
for thing_id in by_colors['blue']: # | |
results.append(thing_id) # | |
pprint(results) | |
# Now that we've seen we needed to do a double-scan to find the answer, could we change our index to | |
# improve our performance? YES we can! If we flip the order of the columns in our index so they go | |
# color->date, we can scan fewer rows! | |
by_color_then_date = group_by(things, ['color', 'date']) | |
pprint(by_color_then_date, sort_dicts=True) | |
# This new index looks like the following: | |
# | |
# {'blue': {11: [111], | |
# 14: [141], | |
# 16: [161], | |
# 18: [181]}, | |
# 'green': {12: [121], | |
# 17: [171], | |
# 20: [201]}, | |
# 'red': {10: [101], | |
# 13: [131], | |
# 14: [142], | |
# 19: [191]}, | |
# 'yellow': {15: [151]}} | |
# Now we check the 'color = blue' first, and then we check the 'date < 17' | |
results = list() | |
blues = by_color_then_date['blue'] # This isn't a SCAN, it's one direct access! | |
# Now we have to SCAN and filter the blue things by 'date < 17' | |
for date in blues.keys(): # | |
if date < 17: # We stop and scan because this is a range: | |
for thing_id in blues[date]: # WHERE thing_date < 17 | |
results.append(thing_id) # | |
pprint(results) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment