App Search CLI
#!/usr/bin/env node
/**
 * Install:
 *
 * Download this file and give it executable permissions
 *
 * Usage:
 *
 * Uploading documents
 * ---------
 *
 * app-search-cli upload ~/Documents/pokemon.json http://localhost:3002 test-engine private-kayik65qbd516q1brk724aaa
 *
 * Concurrency and batch size can be adjusted manually in the code below
 */
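/*
 * The input file is expected to be a single JSON array of document objects,
 * for example (hypothetical contents of pokemon.json):
 *
 *   [{ "id": "1", "name": "Bulbasaur" }, { "id": "2", "name": "Ivysaur" }]
 */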
const fs = require("fs");
const [command, pathToJson, apiPath, engineName, apiKey] = process.argv.slice(
  2
);
console.log("command: ", command);
console.log("pathToJson: ", pathToJson);
console.log("apiPath: ", apiPath);
console.log("engineName: ", engineName);
console.log("apiKey: ", apiKey);
if (command !== "upload") {
  console.error(`Unknown command: ${command}`);
  process.exit(1);
}
const apiBase = `${apiPath}/api/as/v1`;
const documentsUrl = `${apiBase}/engines/${engineName}/documents`;
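// e.g. http://localhost:3002/api/as/v1/engines/test-engine/documents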
// Adapted from: https://www.tomas-dvorak.cz/posts/nodejs-request-without-dependencies/
const request = function(url, options = {}, requestBody = "") {
  // return a new pending promise
  return new Promise((resolve, reject) => {
    // select the http or https module, depending on the requested url
    const lib = url.startsWith("https") ? require("https") : require("http");
    const request = lib.request(url, options, response => {
      // temporary data holder
      const body = [];
      // on every content chunk, push it to the data array
      response.on("data", chunk => body.push(chunk));
      // we are done, resolve the promise with the joined chunks
      response.on("end", () => {
        // handle http errors, and return so we don't also resolve
        if (response.statusCode < 200 || response.statusCode > 299) {
          return reject(
            new Error(`API Error: ${response.statusCode} ${body.join("")}`)
          );
        }
        resolve(body.join(""));
      });
    });
    // handle connection errors of the request
    request.on("error", err => reject(err));
    request.write(requestBody);
    request.end();
  });
};
/*
  Documents can only be indexed 100 at a time, so we index
  our data set in batches. The following is a very simple batching function
  with a configurable `concurrency` variable, which allows for faster
  indexing.
*/
function indexDocumentsInBatches(documents) {
  const concurrency = 20;
  const size = 100;
  let start = 0;
  let end = start + size;
  let recordsIndexed = 0;
  // Pull the next `size` documents off the list, advancing the cursor
  function nextBatch() {
    if (start > documents.length) return [];
    console.log(`processing batch ${start} to ${end}`);
    const batch = documents.slice(start, end);
    start += size;
    end += size;
    return batch;
  }
  // Each chain posts one batch, then recurses until the list is exhausted.
  // Running `concurrency` chains in parallel keeps that many requests in flight.
  async function batchChain() {
    const batch = nextBatch();
    if (batch.length === 0) return;
    const batchString = JSON.stringify(batch);
    try {
      await request(
        documentsUrl,
        {
          method: "POST",
          headers: {
            Authorization: `Bearer ${apiKey}`,
            "Content-Type": "application/json",
            "Content-Length": Buffer.byteLength(batchString)
          }
        },
        batchString
      );
      recordsIndexed += batch.length;
      if (start < documents.length) {
        return batchChain();
      }
    } catch (e) {
      console.log(e);
      process.exit();
    }
  }
  for (let i = 0; i < concurrency; i++) {
    batchChain();
  }
  // Poll until every record has been counted, then exit on the following tick
  let exitOnNextTick = false;
  setInterval(() => {
    if (exitOnNextTick === true) {
      console.log(`\nFinished indexing ${recordsIndexed} records.`);
      process.exit();
    }
    if (recordsIndexed >= documents.length) exitOnNextTick = true;
  }, 500);
}
let documents;
try {
  documents = JSON.parse(fs.readFileSync(pathToJson, "utf8"));
} catch (e) {
  console.error(e);
  process.exit(1);
}
console.log(`About to process ${documents.length} documents`);
indexDocumentsInBatches(documents);
Note that this breaks when the file is too large, since `readFileSync` loads the whole file into memory before parsing, so I've created this adaptation for multi-gigabyte ndjson.
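For reference, here is a minimal sketch of what a streaming approach could look like, assuming one JSON document per line (ndjson) and reusing the `request` helper, `documentsUrl`, and `apiKey` from the script above; the `pathToNdjson` parameter and `postBatch` helper are illustrative, not part of the original gist:

const fs = require("fs");
const readline = require("readline");

// Read the ndjson file line by line instead of loading it all into memory,
// flushing a batch of 100 parsed documents at a time
async function indexNdjson(pathToNdjson) {
  const rl = readline.createInterface({
    input: fs.createReadStream(pathToNdjson),
    crlfDelay: Infinity
  });
  let batch = [];
  for await (const line of rl) {
    if (!line.trim()) continue;
    batch.push(JSON.parse(line));
    if (batch.length === 100) {
      await postBatch(batch);
      batch = [];
    }
  }
  // flush any remaining documents
  if (batch.length > 0) await postBatch(batch);
}

// POST one batch using the same request helper and URL as above
function postBatch(batch) {
  const batchString = JSON.stringify(batch);
  return request(
    documentsUrl,
    {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
        "Content-Length": Buffer.byteLength(batchString)
      }
    },
    batchString
  );
}

This processes batches sequentially rather than with the concurrent chains used above; the same chaining trick could be layered on top, at the cost of buffering a few batches at once.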