Skip to content

Instantly share code, notes, and snippets.

@apage43
Last active September 25, 2024 17:39
Show Gist options
  • Save apage43/436d3198c48d9a1a20a0c34bd98a527d to your computer and use it in GitHub Desktop.
Save apage43/436d3198c48d9a1a20a0c34bd98a527d to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from collections import deque\n",
"from dataclasses import dataclass, field\n",
"from typing import List, Any"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Build b+-trees of items indexed with uuids and ulids"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# rebuild + search only, insert not implemented\n",
"@dataclass\n",
"class BTreeNode:\n",
" is_leaf: bool = False\n",
" keys: List[str] = field(default_factory=lambda: [])\n",
" children: List[Any] = field(default_factory=lambda: [])\n",
"\n",
"def build_btree(kvs, m=100):\n",
" # typically compaction/rebuild leaves inner nodes half-full\n",
" # to leave room for inserts\n",
" inner_size = m // 2 \n",
" ordered = sorted(kvs)\n",
" def mkleaves():\n",
" for i in range(0, len(ordered), m):\n",
" chunk = ordered[i:i+m]\n",
" keys = [k for k, _ in chunk]\n",
" values = [v for _, v in chunk]\n",
" yield BTreeNode(is_leaf=True, keys=keys, children=values)\n",
" nodes = list(mkleaves())\n",
" print(f'tree has {len(nodes)} leaves')\n",
" def next_layer():\n",
" for i in range(0, len(nodes), inner_size):\n",
" chunk = nodes[i:i+inner_size]\n",
" keys = [n.keys[-1] for n in chunk]\n",
" yield BTreeNode(is_leaf=False, keys=keys, children=chunk)\n",
" while len(nodes) > 1:\n",
" nodes = list(next_layer())\n",
" return nodes[0]\n",
"\n",
"@dataclass\n",
"class TraversalStats:\n",
" nodes_visited: int = 0\n",
" leaves_visited: int = 0\n",
"\n",
"def btree_search_recursive(node, keys, stats=None):\n",
" if stats is None:\n",
" stats = TraversalStats()\n",
" if not keys:\n",
" return {}, stats\n",
" stats.nodes_visited += 1\n",
" if node.is_leaf:\n",
" stats.leaves_visited += 1\n",
" keys = deque(sorted(keys))\n",
" outputs = {}\n",
" for i, nkey in enumerate(node.keys):\n",
" if len(keys) == 0:\n",
" break\n",
" dkeys = []\n",
" if not node.is_leaf:\n",
" while len(keys) > 0 and keys[0] <= nkey:\n",
" dkeys.append(keys.popleft())\n",
" if len(dkeys) > 0:\n",
" iout, _ = btree_search_recursive(node.children[i], dkeys, stats)\n",
" outputs.update(iout)\n",
" else:\n",
" if nkey == keys[0]:\n",
" keys.popleft()\n",
" outputs[nkey] = node.children[i]\n",
" return outputs, stats"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import uuid\n",
"import ulid"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"dsize = 5000\n",
"uuid_testdata = [(str(uuid.uuid4()), f\"item {n}\") for n in range(dsize)]\n",
"ulid_testdata = [(ulid.ulid(), f\"item {n}\") for n in range(dsize)]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tree has 50 leaves\n",
"tree has 50 leaves\n"
]
}
],
"source": [
"tree_uu = build_btree(uuid_testdata)\n",
"keys_uu = [k for k, _ in uuid_testdata]\n",
"tree_ul = build_btree(ulid_testdata)\n",
"keys_ul = [k for k, _ in ulid_testdata]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(20, 20)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import random\n",
"testsize = 20\n",
"random_indices = [random.randint(0, dsize-1) for _ in range(testsize)]\n",
"seqlen = 5\n",
"random_sequential = [start + i \n",
" for start in [random.randint(0, dsize-1-seqlen) for _ in range(testsize//seqlen)]\n",
" for i in range(seqlen)]\n",
"len(random_indices), len(random_sequential)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"For totally random indices we always have to visit nearly as many leaves as query keys - but if they are at somewhat correlated we can visit significantly fewer nodes if they have less random IDs"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TraversalStats(nodes_visited=18, leaves_visited=17)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 20 random indices, uuids\n",
"_, stats = btree_search_recursive(tree_uu, [keys_uu[i] for i in random_indices])\n",
"stats"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TraversalStats(nodes_visited=15, leaves_visited=14)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 20 semi-sequentual indices, uuids\n",
"_, stats = btree_search_recursive(tree_uu, [keys_uu[i] for i in random_sequential])\n",
"stats"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TraversalStats(nodes_visited=18, leaves_visited=17)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 20 random indices, ULids\n",
"_, stats = btree_search_recursive(tree_ul, [keys_ul[i] for i in random_indices])\n",
"stats"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TraversalStats(nodes_visited=5, leaves_visited=4)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 20 semi-sequentual indices, ULids\n",
"_, stats = btree_search_recursive(tree_ul, [keys_ul[i] for i in random_sequential])\n",
"stats"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"This only helps when the b-tree is queried in batches, if each key is queried sequentially\n",
"we read every node in the path for every item even if some items may share some or \n",
"all of the path"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TraversalStats(nodes_visited=40, leaves_visited=20)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# random\n",
"stats = TraversalStats()\n",
"for i in random_indices:\n",
" btree_search_recursive(tree_ul, [keys_ul[i]], stats)\n",
"stats\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TraversalStats(nodes_visited=40, leaves_visited=20)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# and semi-seq\n",
"stats = TraversalStats()\n",
"for i in random_sequential:\n",
" btree_search_recursive(tree_ul, [keys_ul[i]], stats)\n",
"stats"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment