Skip to content

Instantly share code, notes, and snippets.

@erikhatcher
Last active March 23, 2026 11:35
Show Gist options
  • Select an option

  • Save erikhatcher/8ec2def79e6a13fff013ea89066d464e to your computer and use it in GitHub Desktop.

Select an option

Save erikhatcher/8ec2def79e6a13fff013ea89066d464e to your computer and use it in GitHub Desktop.
B-Tree vs. search index intersection
const NUM_DOCS = 20000000;
const NUM_REPS = 50;
use('demo'); // TODO
collection = db.demo; // TODO
function get_pipeline_stats(pipeline) {
start = performance.now();
cursor = db.demo.aggregate(pipeline);
end = performance.now();
agg_time = end - start;
start = performance.now();
docs = cursor.toArray();
end = performance.now();
array_time = end - start;
// Explain output is generated outside the scope of the timings
explain = db.demo.explain("executionStats").aggregate(pipeline);
return {
pipeline: pipeline,
docs: docs,
agg_time: agg_time,
array_time: array_time,
total_time: agg_time + array_time,
explain: explain
};
}
function collect_stats(pipeline) {
stats = []
for (let j = 0; j < NUM_REPS; j++) {
stats.push(
get_pipeline_stats(pipeline)
);
}
return stats;
}
function calculate_stats(arr) {
if (arr.length === 0) {
return { minmax: undefined, average: undefined, min: undefined };
}
const min = Math.min(...arr);
const max = Math.max(...arr);
const sum = arr.reduce((total, currentVal) => total + currentVal, 0);
const average = sum / arr.length;
return { max, average, min };
}
// print("...from common.js")
output('Full run: setup...');
load('common.js');
let START = performance.now();
function output(text) {
// output text with timer (since first output)
print("\n[T+" + ((performance.now() - START)/1000) + "s] " + text);
}
function report(description, pipeline) {
stats = collect_stats(pipeline);
call_times = stats.map(item => (item.total_time));
output(
description + ':\n' +
JSON.stringify(pipeline) + '\n------\n' +
JSON.stringify(calculate_stats(call_times))
);
}
output("Dropping search index...")
try {
collection.dropSearchIndex('search_index');
output("Dropped search_index, sleeping 5 seconds to ensure it gets time to drop in the background");
sleep(5000); // give the index time to drop before we try to create it again
} catch (error) {
// no worries
}
output('Dropping B-tree indexes...');
try {
collection.dropIndex({ a: 1 });
} catch (error) {
// no worries
}
try {
collection.dropIndex({ b: 1 });
} catch (error) {
// no worries
}
try {
collection.dropIndex({ a_str: 1 });
} catch (error) {
// no worries
}
try {
collection.dropIndex({ b_str: 1 });
} catch (error) {
// no worries
}
try {
collection.dropIndex({ a: 1, b: 1, _id: 1 });
} catch (error) {
// no worries
}
output("Dropped b-tree indexes");
output('Counting total collection documents...');
total_docs = collection.countDocuments();
output('done counting total documents');
// With no indexes this will be a sluggish start, but we're just counting to report...
baseline_match_pipeline = [{ $match: { a: 42, b: 42 } }];
// execute an initial query, as a warmup
output('Executing warmup baseline query...');
get_pipeline_stats(baseline_match_pipeline);
output('...completed warmup baseline query');
// Now collect the stats
output('Executing baseline query...');
baseline_results = get_pipeline_stats(baseline_match_pipeline);
output('...completed baseline query');
output("Baseline stats, no indexes:\n" +
JSON.stringify(baseline_match_pipeline) + `\n` +
baseline_results.total_time + "ms total time\n" +
baseline_results.docs.length + " intersected documents out of " + total_docs
);
// Now let's create an index on `a` and see how that changes things.
// Note that we don't have an index on `b`, so the query can't fully take advantage of
// an index intersection, but it should be faster than the full
// collection scan since it can at least use the index on `a` to narrow down the
// candidates before filtering on `b`.
output("Creating index { a: 1 }...")
collection.createIndex({ a: 1 });
output("{ a: 1 } index created");
report("match with an index on `a`", baseline_match_pipeline);
output("Creating index { b: 1 }...")
collection.createIndex({ b: 1 });
output("{ b: 1 } index created");
report("match with an index on `a` and `b` independently", baseline_match_pipeline);
report('match + count, independent indexes',
[
{ $match: { a: 42, b: 42 }},
{ $count: 'num_found' }
]);
// Clearly our query is perfectly suited for a B-tree compound index. However, our original use case is to support a multitude of fields and combinations of them in queries
// making it infeasible to create indexes for every combination due to write performance. However, we'll revisit a compound index on this scenario later to show how it really is
// the best solution when the index intersection combinations are within write-performance acceptability.
// Let's leverage the inherent index intersection "skip list" efficient multi-field index navigation provided by a search index.
// Setting up the search index now so it'll be built by the time we run the search tests
collection.createSearchIndex(
"search_index",
{
mappings: {
dynamic: {
typeSet: "default_mappings"
}
},
typeSets: [
{
name: "default_mappings",
types: [
{ type: "number" },
{ type: "token" }
]
}
],
storedSource: { include: ["_id", "a", "b", "ab_str"] }
}
);
output("Search index creation initiated. Polling for index readiness...");
// poll for index query-ability
index_ready = false
while (!index_ready) {
sleep(1000); // wait for 1 second before retrying
try {
test_docs = collection.aggregate([
{
$search: {
index: 'search_index',
compound: {
filter: [
{ equals: { path: 'a', value: 42 } },
{ equals: { path: 'b', value: 42 } }
]
}
}
}
]).toArray();
output("Test query found " + test_docs.length + " docs");
index_ready = true; // if the query succeeds, we can exit the loop
} catch (error) {
// try again
}
}
output("Search index created and ready");
// Here's the best search index query for this intersection case, no sort, returning stored source to bypass id lookups, except no limit.
baseline_search_pipeline =
[
{
$search: {
index: 'search_index',
returnStoredSource: true, // bypass idLookup's
concurrent: true, // improve query performance on a typical churning segmented search index
compound: {
filter: [ // filters are non-scoring, slightly more performant than scoring clauses
{ equals: { path: 'a', value: 42 } },
{ equals: { path: 'b', value: 42 } }
]
}
}
}
]
report("search index query filtering on a and b independently", baseline_search_pipeline);
// Let's add sorting to both baselines and see the results:
sorted_baseline_match_pipeline = [
{ $match: { a: 42, b: 42 }},
{ $sort: { _id: 1 }}
]
report("Baseline match with _id sort with indexes already built only on a and b independently", sorted_baseline_match_pipeline);
sorted_baseline_search_pipeline =
[
{
$search: {
index: 'search_index',
returnStoredSource: true, // bypass idLookup's
concurrent: true, // improve query performance on a typical churning segmented search index
compound: {
filter: [ // filters are non-scoring, slightly more performant than scoring clauses
{ equals: { path: 'a', value: 42 } },
{ equals: { path: 'b', value: 42 } }
]
},
sort: {
_id: 1
}
}
}
]
report("Baseline search with _id sort", sorted_baseline_search_pipeline);
// Now with limits on each...
sorted_match_pipeline_with_limit = [
{ $match: { a: 42, b: 42 }},
{ $sort: { _id: 1 }},
{ $limit: 200 }
]
report("Match with _id sort and limit. Indexes already built on a and b, independently, at this point", sorted_match_pipeline_with_limit);
sorted_search_pipeline_with_limit =
[
{
$search: {
index: 'search_index',
returnStoredSource: true, // bypass idLookup's
concurrent: true, // improve query performance on a typical churning segmented search index
compound: {
filter: [ // filters are non-scoring, slightly more performant than scoring clauses
{ equals: { path: 'a', value: 42 } },
{ equals: { path: 'b', value: 42 } }
]
},
sort: {
_id: 1
}
}
},
{ $limit: 200 }
]
report("Search with _id sort and limit", sorted_search_pipeline_with_limit);
// Now let's build a compound B-tree index and see how it performs
output("Creating compound { a: 1, b: 1, _id: 1 } index...")
collection.createIndex({ a: 1, b: 1, _id: 1 });
output("{ a: 1, b: 1, _id: 1 } compound index created")
report("match with _id sort and limit with compound index", sorted_match_pipeline_with_limit);
report('search on compound field including sort and limit',
[
{
$search: {
index: 'search_index',
returnStoredSource: true,
concurrent: true,
equals: {
path: 'ab_str',
value: '42_42'
},
sort: {
_id: 1
}
}
},
{ $limit: 200 }
]);
report('search on compound field with sort and no limit, using stored source for returned results',
[
{
$search: {
index: 'search_index',
returnStoredSource: true,
concurrent: true,
equals: {
path: 'ab_str',
value: '42_42'
},
sort: {
_id: 1
}
}
}
]);
report('search on compound field sort and no limit, using idLookups for returned results',
[
{
$search: {
index: 'search_index',
returnStoredSource: false,
concurrent: true,
equals: {
path: 'ab_str',
value: '42_42'
},
sort: {
_id: 1
}
}
}
]);
report('match + count, numeric compound index',
[
{ $match: { a: 42, b: 42 }},
{ $count: 'num_found' }
]);
report('searchMeta total count by filter equals numeric',
[
{
$searchMeta: {
index: 'search_index',
concurrent: true,
compound: {
filter: [
{ equals: { path: 'a', value: 42 } },
{ equals: { path: 'b', value: 42 } }
]
},
count: {
type: "total"
}
}
}
]);
report('searchMeta total count on compound field',
[
{
$searchMeta: {
index: 'search_index',
concurrent: true,
equals: { path: 'ab_str', value: '42_42' },
count: {
type: "total"
}
}
}
]);
report('search + count using filter equals int',
[
{
$search: {
index: 'search_index',
returnStoredSource: true,
concurrent: true,
compound: {
filter: [
{ equals: { path: 'a', value: 42 } },
{ equals: { path: 'b', value: 42 } }
]
}
}
},
{ $count: 'num_found' }
]);
report('search + count using compound field',
[
{
$search: {
index: 'search_index',
returnStoredSource: true,
concurrent: true,
equals: { path: 'ab_str', value: '42_42' }
}
},
{ $count: 'num_found' }
]);
load('common.js');
print("Rebuilding collection of " + NUM_DOCS);
try {
collection.dropSearchIndex('search_index');
print(" * dropped search_index");
} catch (error) {
// no worries
}
try {
collection.dropIndex({ a: 1 });
} catch (error) {
// no worries
}
try {
collection.dropIndex({ b: 1 });
} catch (error) {
// no worries
}
try {
collection.dropIndex({ a_str: 1 });
} catch (error) {
// no worries
}
try {
collection.dropIndex({ b_str: 1 });
} catch (error) {
// no worries
}
try {
collection.dropIndex({ a: 1, b: 1 });
} catch (error) {
// no worries
}
print(" * dropped b-tree indexes");
collection.deleteMany({});
print(" * deleted all docs");
print(" * generating " + NUM_DOCS + " docs into an in-memory array...");
let batch = [];
for (let i = 0; i < NUM_DOCS; i++) {
let a = Math.floor(Math.random()*100);
let b = Math.floor(Math.random()*100);
batch.push({
_id: i + 1,
a: a,
b: b,
ab_str: a.toString() + "_" + b.toString()
});
if (batch.length === 100000) {
print(" * inserting batch of " + batch.length + " documents...");
collection.insertMany(batch);
print(" * inserted " + batch.length + " documents");
batch = []; // free the memory, does this help mongosh have more memory?
}
}
if (batch.length > 0) {
collection.insertMany(batch);
}
print(" * inserted " + NUM_DOCS + " documents");
batch = []; // free the memory, does this help mongosh have more memory?
// Search index setup moved to run.js
// Setting up the search index now so it'll be built by the time we run the search tests
// collection.createSearchIndex(
// "search_index",
// {
// mappings: {
// dynamic: {
// typeSet: "default_mappings"
// }
// },
// typeSets: [
// {
// name: "default_mappings",
// types: [
// { type: "number" },
// { type: "token" }
// ]
// }
// ],
// storedSource: { include: ["_id", "a", "b", "ab_str"] }
// }
// )
// print(" * created search index");
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment