Created February 16, 2024 16:09
Stupid binary encoding tests
import numpy as np
import pyarrow.parquet as pq
from sklearn.neighbors import NearestNeighbors

# load the data/%d-en.parquet files into a single numpy matrix
# vector dimensions are 1024
tbls = []
for i in range(10):
    tbls.append(pq.read_table('data/%d-en.parquet' % i, columns=['emb']))
# each 'emb' cell holds one embedding; stack them into an (N, 1024) matrix
np_total = np.concatenate([tbl['emb'].to_numpy() for tbl in tbls])
np_flat_ds = np.stack(np_total)
print('loaded vectors:', np_flat_ds.shape)
doc_vectors = np_flat_ds[:-1000]
query_vectors = np_flat_ds[-1000:]

# get the true 100 nearest neighbors for each query vector
# (sklearn returns cosine *distances*, i.e. 1 - cosine similarity)
knn = NearestNeighbors(n_neighbors=100, metric='cosine')
knn.fit(doc_vectors)
true_scores, true_neighbors = knn.kneighbors(query_vectors, return_distance=True)
# now binary quantize every vector: if a dimension is > 0, set it to 1, else 0
# this is a very simple and fast way to reduce the size of the vectors
# and the number of operations needed to compute the distance
binary_doc_vectors = (doc_vectors > 0).astype('uint8')
binary_query_vectors = (query_vectors > 0).astype('uint8')
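
# Hedged aside (not in the original gist): the size/speed win claimed above
# comes from packing the 0/1 dimensions into bits (1024 dims -> 128 bytes)
# and computing hamming distance with XOR + popcount. A minimal numpy sketch:
packed_docs = np.packbits(binary_doc_vectors, axis=1)
packed_queries = np.packbits(binary_query_vectors, axis=1)

def packed_hamming(query_bits, doc_bits):
    # XOR the packed bytes, then count the set bits per row
    return np.unpackbits(np.bitwise_xor(doc_bits, query_bits), axis=1).sum(axis=1)

# e.g. distances from the first query to every doc:
#   packed_hamming(packed_queries[0], packed_docs)
# sklearn's 'hamming' metric below works on the unpacked 0/1 arrays instead.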
knn = NearestNeighbors(n_neighbors=100, metric='hamming')
knn.fit(binary_doc_vectors)
hamming_neighbors = knn.kneighbors(binary_query_vectors, return_distance=False)
# calculate the overlap between the true 100 nearest neighbors and the
# 100 nearest hamming neighbors; this is the recall of the method
overlap = np.array([len(np.intersect1d(true, hamming))
                    for true, hamming in zip(true_neighbors, hamming_neighbors)])
# divide by 100 to get the recall
overlap = overlap / 100
print('average overlap', overlap.mean())
# now calculate recall@10: how many of the true top-10 neighbors appear
# among the 100 nearest hamming neighbors
# this should be much higher than the average overlap above
overlap = np.array([len(np.intersect1d(true[:10], hamming))
                    for true, hamming in zip(true_neighbors, hamming_neighbors)])
# divide by 10 to get the recall
overlap = overlap / 10
print('overlap', overlap.mean())
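
The high top-10 recall is the property that makes the usual two-phase trick work: gather ~100 candidates cheaply with hamming distance, then rerank just those candidates with exact cosine on the original float vectors. A minimal sketch of that reranking step, reusing the arrays above (the rerank helper is mine, not from the gist):

def rerank(query, candidate_ids, k=10):
    # exact cosine similarity between the query and the hamming candidates
    cands = doc_vectors[candidate_ids]
    sims = cands @ query / (np.linalg.norm(cands, axis=1) * np.linalg.norm(query))
    # keep the k candidates with the highest similarity
    return candidate_ids[np.argsort(-sims)[:k]]

reranked = [rerank(q, h) for q, h in zip(query_vectors, hamming_neighbors)]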
#!/bin/bash
# brace expansion ({0..9}) needs bash, not plain sh
base_url="https://huggingface.co/api/datasets/Cohere/wikipedia-2023-11-embed-multilingual-v3/parquet/en/train/"
# the python script reads data/0-en.parquet .. data/9-en.parquet
mkdir -p data
for i in {0..9}
do
  url="${base_url}${i}.parquet"
  output_file="data/${i}-en.parquet"
  echo "Downloading: $url"
  curl -L "$url" -o "$output_file" &
done
# downloads run in parallel; wait for all of them to finish
wait