Last active
March 23, 2026 11:35
-
-
Save erikhatcher/8ec2def79e6a13fff013ea89066d464e to your computer and use it in GitHub Desktop.
B-Tree vs. search index intersection
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const NUM_DOCS = 20000000; | |
| const NUM_REPS = 50; | |
| use('demo'); // TODO | |
| collection = db.demo; // TODO | |
| function get_pipeline_stats(pipeline) { | |
| start = performance.now(); | |
| cursor = db.demo.aggregate(pipeline); | |
| end = performance.now(); | |
| agg_time = end - start; | |
| start = performance.now(); | |
| docs = cursor.toArray(); | |
| end = performance.now(); | |
| array_time = end - start; | |
| // Explain output is generated outside the scope of the timings | |
| explain = db.demo.explain("executionStats").aggregate(pipeline); | |
| return { | |
| pipeline: pipeline, | |
| docs: docs, | |
| agg_time: agg_time, | |
| array_time: array_time, | |
| total_time: agg_time + array_time, | |
| explain: explain | |
| }; | |
| } | |
| function collect_stats(pipeline) { | |
| stats = [] | |
| for (let j = 0; j < NUM_REPS; j++) { | |
| stats.push( | |
| get_pipeline_stats(pipeline) | |
| ); | |
| } | |
| return stats; | |
| } | |
| function calculate_stats(arr) { | |
| if (arr.length === 0) { | |
| return { minmax: undefined, average: undefined, min: undefined }; | |
| } | |
| const min = Math.min(...arr); | |
| const max = Math.max(...arr); | |
| const sum = arr.reduce((total, currentVal) => total + currentVal, 0); | |
| const average = sum / arr.length; | |
| return { max, average, min }; | |
| } | |
| // print("...from common.js") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| output('Full run: setup...'); | |
| load('common.js'); | |
| let START = performance.now(); | |
| function output(text) { | |
| // output text with timer (since first output) | |
| print("\n[T+" + ((performance.now() - START)/1000) + "s] " + text); | |
| } | |
| function report(description, pipeline) { | |
| stats = collect_stats(pipeline); | |
| call_times = stats.map(item => (item.total_time)); | |
| output( | |
| description + ':\n' + | |
| JSON.stringify(pipeline) + '\n------\n' + | |
| JSON.stringify(calculate_stats(call_times)) | |
| ); | |
| } | |
| output("Dropping search index...") | |
| try { | |
| collection.dropSearchIndex('search_index'); | |
| output("Dropped search_index, sleeping 5 seconds to ensure it gets time to drop in the background"); | |
| sleep(5000); // give the index time to drop before we try to create it again | |
| } catch (error) { | |
| // no worries | |
| } | |
| output('Dropping B-tree indexes...'); | |
| try { | |
| collection.dropIndex({ a: 1 }); | |
| } catch (error) { | |
| // no worries | |
| } | |
| try { | |
| collection.dropIndex({ b: 1 }); | |
| } catch (error) { | |
| // no worries | |
| } | |
| try { | |
| collection.dropIndex({ a_str: 1 }); | |
| } catch (error) { | |
| // no worries | |
| } | |
| try { | |
| collection.dropIndex({ b_str: 1 }); | |
| } catch (error) { | |
| // no worries | |
| } | |
| try { | |
| collection.dropIndex({ a: 1, b: 1, _id: 1 }); | |
| } catch (error) { | |
| // no worries | |
| } | |
| output("Dropped b-tree indexes"); | |
| output('Counting total collection documents...'); | |
| total_docs = collection.countDocuments(); | |
| output('done counting total documents'); | |
| // With no indexes this will be a sluggish start, but we're just counting to report... | |
| baseline_match_pipeline = [{ $match: { a: 42, b: 42 } }]; | |
| // execute an initial query, as a warmup | |
| output('Executing warmup baseline query...'); | |
| get_pipeline_stats(baseline_match_pipeline); | |
| output('...completed warmup baseline query'); | |
| // Now collect the stats | |
| output('Executing baseline query...'); | |
| baseline_results = get_pipeline_stats(baseline_match_pipeline); | |
| output('...completed baseline query'); | |
| output("Baseline stats, no indexes:\n" + | |
| JSON.stringify(baseline_match_pipeline) + `\n` + | |
| baseline_results.total_time + "ms total time\n" + | |
| baseline_results.docs.length + " intersected documents out of " + total_docs | |
| ); | |
| // Now let's create an index on `a` and see how that changes things. | |
| // Note that we don't have an index on `b`, so the query can't fully take advantage of | |
| // an index intersection, but it should be faster than the full | |
| // collection scan since it can at least use the index on `a` to narrow down the | |
| // candidates before filtering on `b`. | |
| output("Creating index { a: 1 }...") | |
| collection.createIndex({ a: 1 }); | |
| output("{ a: 1 } index created"); | |
| report("match with an index on `a`", baseline_match_pipeline); | |
| output("Creating index { b: 1 }...") | |
| collection.createIndex({ b: 1 }); | |
| output("{ b: 1 } index created"); | |
| report("match with an index on `a` and `b` independently", baseline_match_pipeline); | |
| report('match + count, independent indexes', | |
| [ | |
| { $match: { a: 42, b: 42 }}, | |
| { $count: 'num_found' } | |
| ]); | |
| // Clearly our query is perfectly suited for a B-tree compound index. However, our original use case is to support a multitude of fields and combinations of them in queries | |
| // making it infeasible to create indexes for every combination due to write performance. However, we'll revisit a compound index on this scenario later to show how it really is | |
| // the best solution when the index intersection combinations are within write-performance acceptability. | |
| // Let's leverage the inherent index intersection "skip list" efficient multi-field index navigation provided by a search index. | |
| // Setting up the search index now so it'll be built by the time we run the search tests | |
| collection.createSearchIndex( | |
| "search_index", | |
| { | |
| mappings: { | |
| dynamic: { | |
| typeSet: "default_mappings" | |
| } | |
| }, | |
| typeSets: [ | |
| { | |
| name: "default_mappings", | |
| types: [ | |
| { type: "number" }, | |
| { type: "token" } | |
| ] | |
| } | |
| ], | |
| storedSource: { include: ["_id", "a", "b", "ab_str"] } | |
| } | |
| ); | |
| output("Search index creation initiated. Polling for index readiness..."); | |
| // poll for index query-ability | |
| index_ready = false | |
| while (!index_ready) { | |
| sleep(1000); // wait for 1 second before retrying | |
| try { | |
| test_docs = collection.aggregate([ | |
| { | |
| $search: { | |
| index: 'search_index', | |
| compound: { | |
| filter: [ | |
| { equals: { path: 'a', value: 42 } }, | |
| { equals: { path: 'b', value: 42 } } | |
| ] | |
| } | |
| } | |
| } | |
| ]).toArray(); | |
| output("Test query found " + test_docs.length + " docs"); | |
| index_ready = true; // if the query succeeds, we can exit the loop | |
| } catch (error) { | |
| // try again | |
| } | |
| } | |
| output("Search index created and ready"); | |
| // Here's the best search index query for this intersection case, no sort, returning stored source to bypass id lookups, except no limit. | |
| baseline_search_pipeline = | |
| [ | |
| { | |
| $search: { | |
| index: 'search_index', | |
| returnStoredSource: true, // bypass idLookup's | |
| concurrent: true, // improve query performance on a typical churning segmented search index | |
| compound: { | |
| filter: [ // filters are non-scoring, slightly more performant than scoring clauses | |
| { equals: { path: 'a', value: 42 } }, | |
| { equals: { path: 'b', value: 42 } } | |
| ] | |
| } | |
| } | |
| } | |
| ] | |
| report("search index query filtering on a and b independently", baseline_search_pipeline); | |
| // Let's add sorting to both baselines and see the results: | |
| sorted_baseline_match_pipeline = [ | |
| { $match: { a: 42, b: 42 }}, | |
| { $sort: { _id: 1 }} | |
| ] | |
| report("Baseline match with _id sort with indexes already built only on a and b independently", sorted_baseline_match_pipeline); | |
| sorted_baseline_search_pipeline = | |
| [ | |
| { | |
| $search: { | |
| index: 'search_index', | |
| returnStoredSource: true, // bypass idLookup's | |
| concurrent: true, // improve query performance on a typical churning segmented search index | |
| compound: { | |
| filter: [ // filters are non-scoring, slightly more performant than scoring clauses | |
| { equals: { path: 'a', value: 42 } }, | |
| { equals: { path: 'b', value: 42 } } | |
| ] | |
| }, | |
| sort: { | |
| _id: 1 | |
| } | |
| } | |
| } | |
| ] | |
| report("Baseline search with _id sort", sorted_baseline_search_pipeline); | |
| // Now with limits on each... | |
| sorted_match_pipeline_with_limit = [ | |
| { $match: { a: 42, b: 42 }}, | |
| { $sort: { _id: 1 }}, | |
| { $limit: 200 } | |
| ] | |
| report("Match with _id sort and limit. Indexes already built on a and b, independently, at this point", sorted_match_pipeline_with_limit); | |
| sorted_search_pipeline_with_limit = | |
| [ | |
| { | |
| $search: { | |
| index: 'search_index', | |
| returnStoredSource: true, // bypass idLookup's | |
| concurrent: true, // improve query performance on a typical churning segmented search index | |
| compound: { | |
| filter: [ // filters are non-scoring, slightly more performant than scoring clauses | |
| { equals: { path: 'a', value: 42 } }, | |
| { equals: { path: 'b', value: 42 } } | |
| ] | |
| }, | |
| sort: { | |
| _id: 1 | |
| } | |
| } | |
| }, | |
| { $limit: 200 } | |
| ] | |
| report("Search with _id sort and limit", sorted_search_pipeline_with_limit); | |
| // Now let's build a compound B-tree index and see how it performs | |
| output("Creating compound { a: 1, b: 1, _id: 1 } index...") | |
| collection.createIndex({ a: 1, b: 1, _id: 1 }); | |
| output("{ a: 1, b: 1, _id: 1 } compound index created") | |
| report("match with _id sort and limit with compound index", sorted_match_pipeline_with_limit); | |
| report('search on compound field including sort and limit', | |
| [ | |
| { | |
| $search: { | |
| index: 'search_index', | |
| returnStoredSource: true, | |
| concurrent: true, | |
| equals: { | |
| path: 'ab_str', | |
| value: '42_42' | |
| }, | |
| sort: { | |
| _id: 1 | |
| } | |
| } | |
| }, | |
| { $limit: 200 } | |
| ]); | |
| report('search on compound field with sort and no limit, using stored source for returned results', | |
| [ | |
| { | |
| $search: { | |
| index: 'search_index', | |
| returnStoredSource: true, | |
| concurrent: true, | |
| equals: { | |
| path: 'ab_str', | |
| value: '42_42' | |
| }, | |
| sort: { | |
| _id: 1 | |
| } | |
| } | |
| } | |
| ]); | |
| report('search on compound field sort and no limit, using idLookups for returned results', | |
| [ | |
| { | |
| $search: { | |
| index: 'search_index', | |
| returnStoredSource: false, | |
| concurrent: true, | |
| equals: { | |
| path: 'ab_str', | |
| value: '42_42' | |
| }, | |
| sort: { | |
| _id: 1 | |
| } | |
| } | |
| } | |
| ]); | |
| report('match + count, numeric compound index', | |
| [ | |
| { $match: { a: 42, b: 42 }}, | |
| { $count: 'num_found' } | |
| ]); | |
| report('searchMeta total count by filter equals numeric', | |
| [ | |
| { | |
| $searchMeta: { | |
| index: 'search_index', | |
| concurrent: true, | |
| compound: { | |
| filter: [ | |
| { equals: { path: 'a', value: 42 } }, | |
| { equals: { path: 'b', value: 42 } } | |
| ] | |
| }, | |
| count: { | |
| type: "total" | |
| } | |
| } | |
| } | |
| ]); | |
| report('searchMeta total count on compound field', | |
| [ | |
| { | |
| $searchMeta: { | |
| index: 'search_index', | |
| concurrent: true, | |
| equals: { path: 'ab_str', value: '42_42' }, | |
| count: { | |
| type: "total" | |
| } | |
| } | |
| } | |
| ]); | |
| report('search + count using filter equals int', | |
| [ | |
| { | |
| $search: { | |
| index: 'search_index', | |
| returnStoredSource: true, | |
| concurrent: true, | |
| compound: { | |
| filter: [ | |
| { equals: { path: 'a', value: 42 } }, | |
| { equals: { path: 'b', value: 42 } } | |
| ] | |
| } | |
| } | |
| }, | |
| { $count: 'num_found' } | |
| ]); | |
| report('search + count using compound field', | |
| [ | |
| { | |
| $search: { | |
| index: 'search_index', | |
| returnStoredSource: true, | |
| concurrent: true, | |
| equals: { path: 'ab_str', value: '42_42' } | |
| } | |
| }, | |
| { $count: 'num_found' } | |
| ]); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| load('common.js'); | |
| print("Rebuilding collection of " + NUM_DOCS); | |
| try { | |
| collection.dropSearchIndex('search_index'); | |
| print(" * dropped search_index"); | |
| } catch (error) { | |
| // no worries | |
| } | |
| try { | |
| collection.dropIndex({ a: 1 }); | |
| } catch (error) { | |
| // no worries | |
| } | |
| try { | |
| collection.dropIndex({ b: 1 }); | |
| } catch (error) { | |
| // no worries | |
| } | |
| try { | |
| collection.dropIndex({ a_str: 1 }); | |
| } catch (error) { | |
| // no worries | |
| } | |
| try { | |
| collection.dropIndex({ b_str: 1 }); | |
| } catch (error) { | |
| // no worries | |
| } | |
| try { | |
| collection.dropIndex({ a: 1, b: 1 }); | |
| } catch (error) { | |
| // no worries | |
| } | |
| print(" * dropped b-tree indexes"); | |
| collection.deleteMany({}); | |
| print(" * deleted all docs"); | |
| print(" * generating " + NUM_DOCS + " docs into an in-memory array..."); | |
| let batch = []; | |
| for (let i = 0; i < NUM_DOCS; i++) { | |
| let a = Math.floor(Math.random()*100); | |
| let b = Math.floor(Math.random()*100); | |
| batch.push({ | |
| _id: i + 1, | |
| a: a, | |
| b: b, | |
| ab_str: a.toString() + "_" + b.toString() | |
| }); | |
| if (batch.length === 100000) { | |
| print(" * inserting batch of " + batch.length + " documents..."); | |
| collection.insertMany(batch); | |
| print(" * inserted " + batch.length + " documents"); | |
| batch = []; // free the memory, does this help mongosh have more memory? | |
| } | |
| } | |
| if (batch.length > 0) { | |
| collection.insertMany(batch); | |
| } | |
| print(" * inserted " + NUM_DOCS + " documents"); | |
| batch = []; // free the memory, does this help mongosh have more memory? | |
| // Search index setup moved to run.js | |
| // Setting up the search index now so it'll be built by the time we run the search tests | |
| // collection.createSearchIndex( | |
| // "search_index", | |
| // { | |
| // mappings: { | |
| // dynamic: { | |
| // typeSet: "default_mappings" | |
| // } | |
| // }, | |
| // typeSets: [ | |
| // { | |
| // name: "default_mappings", | |
| // types: [ | |
| // { type: "number" }, | |
| // { type: "token" } | |
| // ] | |
| // } | |
| // ], | |
| // storedSource: { include: ["_id", "a", "b", "ab_str"] } | |
| // } | |
| // ) | |
| // print(" * created search index"); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment