Last active
December 15, 2023 16:29
-
-
Save Kerollmops/d0e434016b1a698ec9bdbd86d268885e to your computer and use it in GitHub Desktop.
Using Spotify/Annoy to index some vectors in parallel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * par_build_tree.cpp
 *
 * Move this into the examples folder of Annoy and compile it like the other C++ example.
 *
 * Created on: Dec 6, 2023
 * Author: Clément Renault
 */
#include <cctype>
#include <charconv>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include "../src/kissrandom.h"
#include "../src/annoylib.h"

using namespace Annoy;
// The input file looks like this:
// === BEGIN vectors ===
// 0, [0.010056925, -0.0045358953, 0.009904552, 0.0046241777, ..., -0.050245073]
// === END vectors ===
/**
 * Parses one input line of the form `<id>, [<v0>, <v1>, ..., <vn>]`.
 *
 * The id is the decimal integer at the start of the line (leading whitespace
 * is skipped). The values are the numbers between the first '[' and the first
 * ']' that follows it; they may be separated by a comma and/or whitespace.
 *
 * @param input One line of text.
 * @return The parsed id and its values. The vector is empty when the line
 *         cannot be parsed: a bracket is missing, the id is not an integer,
 *         the list is empty, or any element is not a number. A line is never
 *         partially accepted, so callers can treat an empty vector as
 *         "invalid line".
 */
std::pair<int, std::vector<double>> parseString(const std::string& input) {
    std::pair<int, std::vector<double>> result;

    const std::string::size_type openBracketPos = input.find('[');
    if (openBracketPos == std::string::npos) {
        return result;
    }
    // Look for the closing bracket only after the opening one: a stray ']'
    // earlier in the line must not be mistaken for the end of the list.
    const std::string::size_type closeBracketPos = input.find(']', openBracketPos + 1);
    if (closeBracketPos == std::string::npos) {
        return result;
    }

    // std::isspace has undefined behaviour for negative char values.
    const auto isSpace = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };

    const char* const text = input.c_str();

    // The id lives before the opening bracket. std::from_chars does not skip
    // whitespace, so do it by hand; it stops at the first non-digit (the comma).
    const char* idBegin = text;
    const char* const idEnd = text + openBracketPos;
    while (idBegin != idEnd && isSpace(*idBegin)) {
        ++idBegin;
    }
    if (std::from_chars(idBegin, idEnd, result.first).ec != std::errc()) {
        return result;
    }

    // Walk the bracketed list in place. std::strtod cannot run past listEnd
    // because the ']' stored there is never part of a number.
    const char* cursor = text + openBracketPos + 1;
    const char* const listEnd = text + closeBracketPos;
    const auto skipSpaces = [&cursor, listEnd, &isSpace] {
        while (cursor != listEnd && isSpace(*cursor)) {
            ++cursor;
        }
    };

    for (;;) {
        skipSpaces();
        if (cursor == listEnd) {
            break;
        }

        char* numberEnd = nullptr;
        const double number = std::strtod(cursor, &numberEnd);
        if (numberEnd == cursor) {
            // Not a number: reject the whole line instead of returning a
            // silently truncated vector.
            result.second.clear();
            return result;
        }
        result.second.push_back(number);
        cursor = numberEnd;

        // Consume at most one comma between two elements.
        skipSpaces();
        if (cursor != listEnd && *cursor == ',') {
            ++cursor;
        }
    }

    return result;
}
int main(int argc, char **argv) { | |
std::chrono::high_resolution_clock::time_point t_start, t_end; | |
int f = 768; | |
// Building the tree | |
AnnoyIndex<int, double, DotProduct, Kiss32Random, AnnoyIndexMultiThreadedBuildPolicy> t = AnnoyIndex<int, double, DotProduct, Kiss32Random, AnnoyIndexMultiThreadedBuildPolicy>(f); | |
std::cout << "Building index ... be patient !!" << std::endl; | |
std::string line; | |
while (std::getline(std::cin, line)) { | |
auto [id, numbers] = parseString(line); | |
if (!numbers.empty()) { | |
std::vector<double> vec(numbers.begin(), numbers.end()); | |
t.add_item(id, vec.data()); | |
} else { | |
std::cout << "Invalid input" << std::endl; | |
} | |
} | |
std::cout << std::endl; | |
std::cout << "Building index num_trees = 2 * num_features ..."; | |
t_start = std::chrono::high_resolution_clock::now(); | |
t.build(200); // automatically uses all threads | |
t_end = std::chrono::high_resolution_clock::now(); | |
auto duration = std::chrono::duration_cast<std::chrono::seconds>( t_end - t_start ).count(); | |
std::cout << " Done in "<< duration << " secs." << std::endl; | |
std::cout << "NItems " << t.get_n_items() << std::endl; | |
std::cout << "NTrees " << t.get_n_trees() << std::endl; | |
std::cout << "Saving index ..."; | |
t.save("precision.tree"); | |
std::cout << " Done" << std::endl; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment