Last active
April 8, 2020 06:38
-
-
Save longlongh4/4d8c53fbbc4f47a99d060ee39c34bb99 to your computer and use it in GitHub Desktop.
A demo to use FAISS to build indexes for Hamming Distance based fingerprints.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdint>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include <faiss/AutoTune.h>
#include <faiss/IndexBinaryFlat.h>
#include <faiss/IndexBinaryIVF.h>
// The rest of this file refers to standard-library names unqualified.
using namespace std;

// Location of the fingerprint dataset read by main(): one video per line.
const std::string hashPath = "/home/hailong/labs/ads/hash.txt";
/****
Dataset example (one video per line, formatted as "<uuid>:<comma-separated unsigned 64-bit hashes>"):
37526101-d42e-416f-956d-31744915e0e4:2702284295253482914,1961827062776793808,15338216169055532216,1277173557604361546,323801233997848268,6859946266805363904,5389844540866461070,5463698079089396960,12707568597411666992,322984150776704616,5156804278819165924,7653749133710948762,35570857859768012,12088489394757528714,8796776315060649254,18059698677919318550
60277e94-b9a1-4539-ba42-7a8431caef18:18393304965971490470,6748912653283961598,14160794321219562102,12355755556287909842,12810266938956409008
*******/
/**
 * Fingerprints of a single video, parsed from one dataset line of the form
 * "<id>:<hash>,<hash>,...", where every hash is an unsigned 64-bit decimal.
 */
class VideoHashes
{
public:
    /**
     * Parses one dataset line.
     *
     * @param line dataset line; everything up to and including the first ':'
     *             is treated as the textual identifier and skipped. A line
     *             without ':' yields no hashes.
     * @param id   numeric identifier stored in videoID.
     */
    VideoHashes(const std::string &line, int64_t id);

    // Don't use the UUID here, because it is too big as a payload for indexing.
    int64_t videoID;
    // Each uint64 hash is stored as 8 consecutive bytes, most significant
    // byte first, so the buffer can be handed to FAISS as uint8_t data.
    std::vector<uint8_t> hashes;
};

VideoHashes::VideoHashes(const std::string &line, int64_t id)
    : videoID(id)
{
    // Locate the separator explicitly instead of assuming a fixed-width ID.
    const std::string::size_type separator = line.find(':');
    if (separator == std::string::npos)
    {
        return;
    }

    std::istringstream iss(line.substr(separator + 1));
    for (uint64_t hash; iss >> hash;)
    {
        // Hashes equal to zero are skipped and contribute no bytes.
        if (hash != 0)
        {
            // Emit the 8 bytes from the most significant to the least significant.
            for (int shift = 56; shift >= 0; shift -= 8)
            {
                hashes.push_back(static_cast<uint8_t>((hash >> shift) & 0xFF));
            }
        }
        if (iss.peek() == ',')
        {
            iss.ignore();
        }
    }
}
int main() | |
{ | |
vector<VideoHashes> videoHashesArray; | |
ifstream input(hashPath); | |
int64_t videoIndex = 1; | |
for (string line; getline(input, line);) | |
{ | |
videoHashesArray.push_back(VideoHashes(line, videoIndex++)); | |
} | |
printf("parsed %lu videos\n", videoHashesArray.size()); | |
vector<uint8_t> trainData; | |
for (VideoHashes video : videoHashesArray) | |
{ | |
trainData.insert(end(trainData), begin(video.hashes), end(video.hashes)); | |
} | |
// Dimension of the vectors | |
int d = 64; | |
// Initializing the quantizer. | |
faiss::IndexBinaryFlat quantizer(d); | |
// Number of clusters. | |
int nlist = 32; | |
// Initializing index. | |
faiss::IndexBinaryIVF index(&quantizer, d, nlist); | |
index.nprobe = 4; // Number of nearest clusters to be searched per query. | |
index.train(trainData.size() / 8, trainData.data()); | |
for (VideoHashes video : videoHashesArray) | |
{ | |
vector<faiss::Index::idx_t> labels(video.hashes.size() / 8, video.videoID); | |
index.add_with_ids(video.hashes.size() / 8, video.hashes.data(), labels.data()); | |
} | |
cout << "total fingerprints:" << index.ntotal << endl; | |
// How many neighbours to return for each fingerprint | |
int k = 10; | |
// how many fingerprints in the query(we can query with more than one fingerprint) | |
faiss::Index::idx_t n = videoHashesArray[0].hashes.size() / 8; | |
// use this vector to get the distance result | |
vector<int32_t> distance(k * n); | |
// use this vector to get the labels result | |
vector<faiss::Index::idx_t> labels(k * n); | |
index.search(n, videoHashesArray[0].hashes.data(), k, distance.data(), labels.data()); | |
for (int i = 0; i < n; i++) | |
{ | |
cout << "frame index:" << i << endl; | |
for (int j = 0; j < k; j++) | |
{ | |
cout << "distance:" << distance[i * 8 + j] << ',' << "labels:" << labels[i * 8 + j] << endl; | |
} | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment