lelandbatey · September 29, 2025 21:00
diff --git a/sqlite_index_rules_demo.py b/sqlite_index_rules_demo.py
 # in response to this conversation: https://news.ycombinator.com/item?id=45417191
 from pprint import pprint
 # group_by here is ultimately used to create a tree structure (nested dictionaries) of the form
 # keys[0] -> keys[1] -> ... -> keys[n] -> getid(object). We use it here to construct a tree data
 # structure that's conceptually equal to a database index (not exactly the same, but shows the same
 # properties we care about).
 def group_by(objects, keys, getid=None):
    if getid is None:
        getid = lambda x: x['id']
    result = {}
    for obj in objects:
        current = result
        for i, key in enumerate(keys):
            value = obj.get(key)

            if i == len(keys) - 1:
                if value not in current:
                    current[value] = []
                current[value].append(getid(obj))
            else:
                if value not in current:
                    current[value] = {}
                current = current[value]
    return result

 things = [
    {"date": 10, "color": "red",    "id": 101},
    {"date": 11, "color": "blue",   "id": 111},
    {"date": 12, "color": "green",  "id": 121},
    {"date": 13, "color": "red",    "id": 131},
    {"date": 14, "color": "blue",   "id": 141},
    {"date": 14, "color": "red",    "id": 142},
    {"date": 15, "color": "yellow", "id": 151},
    {"date": 16, "color": "blue" ,  "id": 161},
    {"date": 17, "color": "green",  "id": 171},
    {"date": 18, "color": "blue",   "id": 181},
    {"date": 19, "color": "red",    "id": 191},
    {"date": 20, "color": "green",  "id": 201},
 ]

 indexed_by_id = dict()
 for thing in things:
    t_id = thing['id']
    indexed_by_id[t_id] = thing

 # compound index going date -> color
 by_date_then_color = group_by(things, ['date', 'color'])

 pprint(by_date_then_color, sort_dicts=True)
 # This is what the resulting date->color index looks like
 #
 #     {10: {'red':   [101]},
 #      11: {'blue':  [111]},
 #      12: {'green': [121]},
 #      13: {'red':   [131]},
 #      14: {'blue':  [141], 'red': [142]},
 #      15: {'yellow':[151]},
 #      16: {'blue':  [161]},
 #      17: {'green': [171]},
 #      18: {'blue':  [181]},
 #      19: {'red':   [191]},
 #      20: {'green': [201]}}
 #
 # Notice that it's not a very efficient index because it doesn't group things together a lot;
 # there's not a lot of common ground among the 'date' fields of things in our data set, making
 # 'date' a high cardinality column

 # Rules for following the index:
 # 1. Left to right
 # 2. No skipping
 # 3. Stops at the first range

 # select thing_date, thing_color where thing_color = 'blue' and thing_date < 17

 #  Since the index is date -> color -> thing, we have to check 'thing_date < 17' first, then we have
 #  to stop and walk through that set and individually check for 'color = blue', with this sequential
 #  checking being a small SCAN operation, but it's a scan of an index, not a SCAN of the DB. This is
 #  faster than scanning the entire DB, but it is still a small scan.
 dates_lt_17 = []                       #
 for date in by_date_then_color.keys(): #  WHERE thing_date < 17
    if date < 17:                      #
        dates_lt_17.append(date)       #
 # now we have to scan this result and find the thing matching the data
 results = list()                               #
 for knowndate in dates_lt_17:                  #
    by_colors = by_date_then_color[knowndate]  #  AND thing_color = 'blue'
    if 'blue' in by_colors:                    #
        for thing_id in by_colors['blue']:     #
            results.append(thing_id)           #
 pprint(results)


 # Now that we've seen we needed to do a double-scan to find the answer, could we change our index to
 # improve our performance? YES we can! If we flip the order of the columns in our index so they go
 # color->date, we can scan fewer rows!
 by_color_then_date = group_by(things, ['color', 'date'])
 pprint(by_color_then_date, sort_dicts=True)
 # This new index looks like the following:
 #
 #     {'blue': {11: [111],
 #               14: [141],
 #               16: [161],
 #               18: [181]},
 #     'green': {12: [121],
 #               17: [171],
 #               20: [201]},
 #     'red': {10: [101],
 #             13: [131],
 #             14: [142],
 #             19: [191]},
 #     'yellow': {15: [151]}}

 # Now we check the 'color = blue' first, and then we check the 'date < 17'
 results = list()
 blues = by_color_then_date['blue'] # This isn't a SCAN, it's one direct access!
 # Now we have to SCAN and filter the blue things by 'date < 17'
 for date in blues.keys():             #
    if date < 17:                     # We stop and scan because this is a range:
        for thing_id in blues[date]:  #  WHERE thing_date < 17
            results.append(thing_id)  #
 pprint(results)
	# in response to this conversation: https://news.ycombinator.com/item?id=45417191
	from pprint import pprint
	# group_by here is ultimately used to create a tree structure (nested dictionaries) of the form
	# keys[0] -> keys[1] -> ... -> keys[n] -> getid(object). We use it here to construct a tree data
	# structure that's conceptually equal to a database index (not exactly the same, but shows the same
	# properties we care about).
	def group_by(objects, keys, getid=None):
	if getid is None:
	getid = lambda x: x['id']
	result = {}
	for obj in objects:
	current = result
	for i, key in enumerate(keys):
	value = obj.get(key)

	if i == len(keys) - 1:
	if value not in current:
	current[value] = []
	current[value].append(getid(obj))
	else:
	if value not in current:
	current[value] = {}
	current = current[value]
	return result

	things = [
	{"date": 10, "color": "red", "id": 101},
	{"date": 11, "color": "blue", "id": 111},
	{"date": 12, "color": "green", "id": 121},
	{"date": 13, "color": "red", "id": 131},
	{"date": 14, "color": "blue", "id": 141},
	{"date": 14, "color": "red", "id": 142},
	{"date": 15, "color": "yellow", "id": 151},
	{"date": 16, "color": "blue" , "id": 161},
	{"date": 17, "color": "green", "id": 171},
	{"date": 18, "color": "blue", "id": 181},
	{"date": 19, "color": "red", "id": 191},
	{"date": 20, "color": "green", "id": 201},
	]

	indexed_by_id = dict()
	for thing in things:
	t_id = thing['id']
	indexed_by_id[t_id] = thing

	# compound index going date -> color
	by_date_then_color = group_by(things, ['date', 'color'])

	pprint(by_date_then_color, sort_dicts=True)
	# This is what the resulting date->color index looks like
	#
	# {10: {'red': [101]},
	# 11: {'blue': [111]},
	# 12: {'green': [121]},
	# 13: {'red': [131]},
	# 14: {'blue': [141], 'red': [142]},
	# 15: {'yellow':[151]},
	# 16: {'blue': [161]},
	# 17: {'green': [171]},
	# 18: {'blue': [181]},
	# 19: {'red': [191]},
	# 20: {'green': [201]}}
	#
	# Notice that it's not a very efficient index because it doesn't group things together a lot;
	# there's not a lot of common ground among the 'date' fields of things in our data set, making
	# 'date' a high cardinality column

	# Rules for following the index:
	# 1. Left to right
	# 2. No skipping
	# 3. Stops at the first range

	# select thing_date, thing_color where thing_color = 'blue' and thing_date < 17

	# Since the index is date -> color -> thing, we have to check 'thing_date < 17' first, then we have
	# to stop and walk through that set and individually check for 'color = blue', with this sequential
	# checking being a small SCAN operation, but it's a scan of an index, not a SCAN of the DB. This is
	# faster than scanning the entire DB, but it is still a small scan.
	dates_lt_17 = [] #
	for date in by_date_then_color.keys(): # WHERE thing_date < 17
	if date < 17: #
	dates_lt_17.append(date) #
	# now we have to scan this result and find the thing matching the data
	results = list() #
	for knowndate in dates_lt_17: #
	by_colors = by_date_then_color[knowndate] # AND thing_color = 'blue'
	if 'blue' in by_colors: #
	for thing_id in by_colors['blue']: #
	results.append(thing_id) #
	pprint(results)


	# Now that we've seen we needed to do a double-scan to find the answer, could we change our index to
	# improve our performance? YES we can! If we flip the order of the columns in our index so they go
	# color->date, we can scan fewer rows!
	by_color_then_date = group_by(things, ['color', 'date'])
	pprint(by_color_then_date, sort_dicts=True)
	# This new index looks like the following:
	#
	# {'blue': {11: [111],
	# 14: [141],
	# 16: [161],
	# 18: [181]},
	# 'green': {12: [121],
	# 17: [171],
	# 20: [201]},
	# 'red': {10: [101],
	# 13: [131],
	# 14: [142],
	# 19: [191]},
	# 'yellow': {15: [151]}}

	# Now we check the 'color = blue' first, and then we check the 'date < 17'
	results = list()
	blues = by_color_then_date['blue'] # This isn't a SCAN, it's one direct access!
	# Now we have to SCAN and filter the blue things by 'date < 17'
	for date in blues.keys(): #
	if date < 17: # We stop and scan because this is a range:
	for thing_id in blues[date]: # WHERE thing_date < 17
	results.append(thing_id) #
	pprint(results)