@JasonStoltz
Last active April 27, 2022 13:25
App Search CLI
#!/usr/bin/env node
/**
 * Install:
 *
 * Download this file and give it executable permissions.
 *
 * Usage:
 *
 * Uploading documents
 * -------------------
 *
 * app-search-cli upload ~/Documents/pokemon.json http://localhost:3002 test-engine private-kayik65qbd516q1brk724aaa
 *
 * Concurrency and batch size can be adjusted manually in the code below.
 */
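/*
 * The input file is expected to be a single JSON array of document objects,
 * since the script parses it with JSON.parse and slices it into batches.
 * For example (field names are illustrative only):
 *
 *   [
 *     { "id": "1", "name": "Bulbasaur" },
 *     { "id": "2", "name": "Charmander" }
 *   ]
 */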
const fs = require("fs");
const [command, pathToJson, apiPath, engineName, apiKey] = process.argv.slice(2);
console.log("command: ", command);
console.log("pathToJson: ", pathToJson);
console.log("apiPath: ", apiPath);
console.log("engineName: ", engineName);
console.log("apiKey: ", apiKey);
const apiBase = `${apiPath}/api/as/v1`;
const documentsUrl = `${apiBase}/engines/${engineName}/documents`;
// Adapted from: https://www.tomas-dvorak.cz/posts/nodejs-request-without-dependencies/
const request = function(url, options = {}, requestBody = "") {
  // return a new pending promise
  return new Promise((resolve, reject) => {
    // select the http or https module, depending on the requested url
    const lib = url.startsWith("https") ? require("https") : require("http");
    const request = lib.request(url, options, response => {
      // temporary data holder
      const body = [];
      // on every content chunk, push it to the data array
      response.on("data", chunk => body.push(chunk));
      // we are done, resolve the promise with the joined chunks
      response.on("end", () => {
        // handle http errors
        if (response.statusCode < 200 || response.statusCode > 299) {
          return reject(
            new Error(`API Error: ${response.statusCode} ${body.join(" ")}`)
          );
        }
        resolve(body.join(""));
      });
    });
    // handle connection errors of the request
    request.on("error", err => reject(err));
    request.write(requestBody);
    request.end();
  });
};
/*
  Documents can only be indexed 100 at a time, so we index our data set in
  batches. The following is a very simple batching function with a
  configurable `concurrency` variable, which allows for faster indexing.
*/
function indexDocumentsInBatches(documents) {
  const concurrency = 20;
  const size = 100;
  let start = 0;
  let end = start + size;
  let recordsIndexed = 0;
  function nextBatch() {
    if (start > documents.length) return [];
    console.log(`processing batch ${start} to ${end}`);
    const batch = documents.slice(start, end);
    start += size;
    end += size;
    return batch;
  }
  async function batchChain() {
    let batch = nextBatch();
    if (batch.length === 0) return;
    const batchString = JSON.stringify(batch);
    try {
      await request(
        documentsUrl,
        {
          method: "POST",
          headers: {
            Authorization: `Bearer ${apiKey}`,
            "Content-Type": "application/json",
            "Content-Length": Buffer.byteLength(batchString)
          }
        },
        batchString
      );
      recordsIndexed += batch.length;
      if (start < documents.length) {
        return batchChain();
      }
    } catch (e) {
      console.log(e);
      process.exit();
    }
  }
  // kick off `concurrency` parallel chains of batch requests
  for (let i = 0; i < concurrency; i++) {
    batchChain();
  }
  // poll until every record has been indexed, then exit on the next tick
  let exitOnNextTick = false;
  setInterval(() => {
    if (exitOnNextTick === true) {
      console.log(`\nFinished indexing ${recordsIndexed} records.`);
      process.exit();
    }
    if (recordsIndexed >= documents.length) exitOnNextTick = true;
  }, 500);
}
let documents;
try {
  documents = JSON.parse(fs.readFileSync(pathToJson, "utf8"));
} catch (e) {
  console.error(e);
  process.exit(1);
}
console.log(`About to process ${documents.length} documents`);
indexDocumentsInBatches(documents);
haynesgt commented Sep 2, 2021

Note that this breaks when the file is too large, so I've created this adaptation for multi-gigabyte ndjson
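For illustration, here is a minimal sketch of the streaming approach, assuming one JSON document per ndjson line; `postBatch` is a stand-in for the POST request the script above makes to the documents endpoint:

const fs = require("fs");
const readline = require("readline");

// Stream the ndjson file line by line instead of reading it all into memory.
async function indexNdjson(pathToNdjson, postBatch, batchSize = 100) {
  const rl = readline.createInterface({
    input: fs.createReadStream(pathToNdjson),
    crlfDelay: Infinity
  });
  let batch = [];
  let indexed = 0;
  for await (const line of rl) {
    if (!line.trim()) continue; // skip blank lines
    batch.push(JSON.parse(line));
    if (batch.length === batchSize) {
      await postBatch(batch); // e.g. the `request(documentsUrl, ...)` call above
      indexed += batch.length;
      batch = [];
    }
  }
  if (batch.length > 0) {
    await postBatch(batch); // flush the final partial batch
    indexed += batch.length;
  }
  console.log(`Finished indexing ${indexed} records.`);
}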
