Skip to content

Instantly share code, notes, and snippets.

@Kerollmops
Last active December 15, 2023 16:29
Show Gist options
  • Save Kerollmops/d0e434016b1a698ec9bdbd86d268885e to your computer and use it in GitHub Desktop.
Save Kerollmops/d0e434016b1a698ec9bdbd86d268885e to your computer and use it in GitHub Desktop.
Using Spotify/Annoy to index some vectors in parallel
/*
* par_build_tree.cpp
*
* Move this file into the examples folder of Annoy and compile it like the other C++ example.
*
* Created on: Dec 6, 2023
* Author: Clément Renault
*/
#include <iostream>
#include "../src/kissrandom.h"
#include "../src/annoylib.h"
#include <chrono>
#include <iostream>
#include <vector>
#include <string>
#include <sstream>
#include <charconv>
using namespace Annoy;
// The input file looks like this:
// === BEGIN vectors ===
// 0, [0.010056925, -0.0045358953, 0.009904552, 0.0046241777, ..., -0.050245073]
// === END vectors ===
/**
 * Parses one input line of the form `<id>, [<v0>, <v1>, ..., <vn>]`.
 *
 * @param input A single line of text.
 * @return The parsed id together with the parsed vector components.
 *         The vector is empty when the line is invalid, i.e. when:
 *           - there is no '[' followed by a matching ']',
 *           - the text before '[' does not start with an integer id
 *             (leading spaces/tabs are tolerated),
 *           - a component inside the brackets is not a number.
 *         In those cases the returned id is 0.
 */
std::pair<int, std::vector<double>> parseString(const std::string& input) {
    std::pair<int, std::vector<double>> result;

    // Locate the opening bracket first and only then look for the closing one
    // *after* it. Searching for ']' from the start of the line would accept a
    // stray ']' before '[' and make the length computation below underflow.
    const std::size_t openBracketPos = input.find('[');
    if (openBracketPos == std::string::npos) {
        return result;
    }
    const std::size_t closeBracketPos = input.find(']', openBracketPos + 1);
    if (closeBracketPos == std::string::npos) {
        return result;
    }

    // Parse the id directly from the input buffer (no temporary string).
    // std::from_chars does not skip whitespace, so leading blanks are skipped
    // by hand. '[' is not a blank, hence idStart <= openBracketPos always holds.
    // Parsing stops at the first non-digit, so the trailing ", " is ignored.
    const std::size_t idStart = input.find_first_not_of(" \t");
    int id = 0;
    const auto idParse = std::from_chars(input.data() + idStart,
                                         input.data() + openBracketPos, id);
    if (idParse.ec != std::errc()) {
        // Parsing the id failed, return an empty result
        return result;
    }
    result.first = id;

    // Extract the numbers within the brackets.
    std::istringstream iss(
        input.substr(openBracketPos + 1, closeBracketPos - openBracketPos - 1));
    double number;
    while (iss >> number) {
        result.second.push_back(number);
        iss.ignore(1);  // Ignore the comma or space that follows the number
    }

    // The loop must have stopped because the text was exhausted. If it stopped
    // earlier, a token was not a number: report the whole line as invalid
    // rather than handing back a silently truncated vector.
    if (!iss.eof()) {
        return {};
    }

    return result;
}
int main(int argc, char **argv) {
std::chrono::high_resolution_clock::time_point t_start, t_end;
int f = 768;
// Building the tree
AnnoyIndex<int, double, DotProduct, Kiss32Random, AnnoyIndexMultiThreadedBuildPolicy> t = AnnoyIndex<int, double, DotProduct, Kiss32Random, AnnoyIndexMultiThreadedBuildPolicy>(f);
std::cout << "Building index ... be patient !!" << std::endl;
std::string line;
while (std::getline(std::cin, line)) {
auto [id, numbers] = parseString(line);
if (!numbers.empty()) {
std::vector<double> vec(numbers.begin(), numbers.end());
t.add_item(id, vec.data());
} else {
std::cout << "Invalid input" << std::endl;
}
}
std::cout << std::endl;
std::cout << "Building index num_trees = 2 * num_features ...";
t_start = std::chrono::high_resolution_clock::now();
t.build(200); // automatically uses all threads
t_end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::seconds>( t_end - t_start ).count();
std::cout << " Done in "<< duration << " secs." << std::endl;
std::cout << "NItems " << t.get_n_items() << std::endl;
std::cout << "NTrees " << t.get_n_trees() << std::endl;
std::cout << "Saving index ...";
t.save("precision.tree");
std::cout << " Done" << std::endl;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment