Code used to create vector datasets with different magnitude distributions and to benchmark Lucene's KnnGraphTester against them.
#!/usr/bin/env python
# Benchmark driver: runs Lucene's KnnGraphTester once per combination of the
# parameters below, for each (training file, testing file, dims, metric) tuple.
import subprocess

import benchUtil
import constants

LUCENE_CHECKOUT = 'lucene_candidate'

# Test parameters. This script will run KnnGraphTester on every combination of these parameters.
VALUES = {
    'ndoc': (100000,),  # 200000
    'maxConn': (48,),
    'beamWidthIndex': (200,),
    'fanout': (0, 10, 50, 90, 190, 490, 590, 990),  # 250
    'topK': (10,),
}


def advance(ix, values):
    # Odometer-style increment of the parameter indexes; returns False once
    # every combination has been visited.
    for i in reversed(range(len(ix))):
        param = list(values.keys())[i]
        if ix[i] == len(values[param]) - 1:
            ix[i] = 0
        else:
            ix[i] += 1
            return True
    return False


def run_knn_benchmark(checkout, values, training_file, testing_file, dims, metric):
    indexes = [0] * len(values.keys())
    indexes[-1] = -1
    print(f"\n\n\nNow running {training_file}\n\n\n")
    dim = dims  # 768
    doc_vectors = training_file  # '%s/util/wiki768ja.random.train' % constants.BASE_DIR  # constants.GLOVE_VECTOR_DOCS_FILE
    query_vectors = testing_file  # '%s/util/wiki768ja.test' % constants.BASE_DIR  # '%s/util/tasks/vector-task-100d.vec' % constants.BASE_DIR
    cp = benchUtil.classPathToString(benchUtil.getClassPath(checkout))
    JAVA_EXE = '/Users/benjamintrent/Library/Java/JavaVirtualMachines/jdk-20.0.1.jdk/Contents/Home/bin/java'
    cmd = [JAVA_EXE,
           '-cp', cp,
           '--add-modules', 'jdk.incubator.vector',
           '-Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false',
           'KnnGraphTester']
    print("recall\tlatency\tnDoc\tfanout\tmaxConn\tbeamWidth\tvisited\tindex ms")
    while advance(indexes, values):
        pv = {}
        args = []
        for (i, p) in enumerate(list(values.keys())):
            if values[p]:
                pv[p] = values[p][indexes[i]]
            else:
                args += ['-' + p]
        args += [a for (k, v) in pv.items() for a in ('-' + k, str(v))]
        this_cmd = cmd + args + [
            '-dim', str(dim),
            '-docs', doc_vectors,
            # '-stats',
            # '-reindex',
            '-metric', metric,
            '-search', query_vectors,
            # '-forceMerge',
            # '-niter', str(3000),
            '-quiet',
        ]
        subprocess.run(this_cmd)


test_names = ["normal_1_1", "normal_1_2", "pareto", "uniform", "bimodal_5", "bimodal_9", "gamma_1_1", "gamma_2_2"]
tests = []
for name in test_names:
    # 384-dim originals are searched with the angular metric; the 385-dim
    # "-transform" variants carry the extra padding dimension added by the
    # transform script below and are searched with euclidean distance.
    tests.append((f"{constants.BASE_DIR}/util/{name}.ordered.train", f"{constants.BASE_DIR}/util/{name}.test", 384, "angular"))
    tests.append((f"{constants.BASE_DIR}/util/{name}.random.train", f"{constants.BASE_DIR}/util/{name}.test", 384, "angular"))
    tests.append((f"{constants.BASE_DIR}/util/{name}.reversed.train", f"{constants.BASE_DIR}/util/{name}.test", 384, "angular"))
    tests.append((f"{constants.BASE_DIR}/util/{name}.ordered-transform.train", f"{constants.BASE_DIR}/util/{name}-transform.test", 385, "euclidean"))
    tests.append((f"{constants.BASE_DIR}/util/{name}.random-transform.train", f"{constants.BASE_DIR}/util/{name}-transform.test", 385, "euclidean"))
    tests.append((f"{constants.BASE_DIR}/util/{name}.reversed-transform.train", f"{constants.BASE_DIR}/util/{name}-transform.test", 385, "euclidean"))

for (training_file, testing_file, dims, metric) in tests:
    run_knn_benchmark(LUCENE_CHECKOUT, VALUES, training_file, testing_file, dims, metric)
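For reference, the first parameter combination expands to roughly this KnnGraphTester invocation (classpath and base directory are placeholders here; every flag comes straight from the loop above):

java -cp <classpath> --add-modules jdk.incubator.vector \
    -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false \
    KnnGraphTester -ndoc 100000 -maxConn 48 -beamWidthIndex 200 -fanout 0 -topK 10 \
    -dim 384 -docs <BASE_DIR>/util/normal_1_1.ordered.train -metric angular \
    -search <BASE_DIR>/util/normal_1_1.test -quiet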
# Generate variants of the base embeddings whose vector magnitudes follow
# different distributions (uniform, normal, gamma, Pareto, bimodal).
import numpy as np

# Load the base embeddings.
table = np.load('data/embeddings.npy')
# Check that all vectors are unit vectors.
assert np.allclose(np.linalg.norm(table, axis=1), 1)

# Scale all vector magnitudes via a uniform distribution.
uniform = (np.random.uniform(0, 1, table.shape) * 10) * table
# Linearly scale all vector magnitudes via a normal distribution.
normal_1 = (np.random.normal(loc=1.0, scale=0.1, size=table.shape) * 10) * table
normal_2 = (np.random.normal(loc=1.0, scale=0.2, size=table.shape) * 10) * table
# Scale all vector magnitudes via a gamma distribution.
gamma_1 = (np.random.gamma(1, 1, table.shape) * 10) * table
gamma_2 = (np.random.gamma(2, 2, table.shape) * 10) * table
# Scale all vector magnitudes via a Pareto distribution.
pareto_5 = (np.random.pareto(5, table.shape) * 10) * table
# Mix two normals (90/10 and 50/50 weights) for bimodal distributions.
bimodal_9 = (((np.random.normal(loc=1, scale=0.2, size=table.shape) * 0.9) + (np.random.normal(loc=3, scale=0.2, size=table.shape) * 0.1)) * 10) * table
bimodal_5 = (((np.random.normal(loc=1, scale=0.2, size=table.shape) * 0.5) + (np.random.normal(loc=3, scale=0.2, size=table.shape) * 0.5)) * 10) * table


def save_to_file(filename, dataset):
    np.save(filename, dataset.astype(np.float32), allow_pickle=False)


for (dataset, name) in [(uniform, "uniform"), (normal_1, "normal_1_1"), (normal_2, "normal_1_2"), (gamma_1, "gamma_1_1"), (gamma_2, "gamma_2_2"), (pareto_5, "pareto"), (bimodal_9, "bimodal_9"), (bimodal_5, "bimodal_5")]:
    save_to_file(f"data/{name}.npy", dataset)
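A quick sanity check on the saved files (a minimal sketch, assuming the script above has just written them under data/): load each dataset back and summarize the per-vector magnitudes.

import numpy as np

for name in ["uniform", "normal_1_1", "normal_1_2", "gamma_1_1", "gamma_2_2", "pareto", "bimodal_9", "bimodal_5"]:
    vecs = np.load(f"data/{name}.npy")
    norms = np.linalg.norm(vecs, axis=1)  # magnitude of each scaled vector
    print(f"{name}: mean={norms.mean():.3f} median={np.median(norms):.3f} min={norms.min():.3f} max={norms.max():.3f}")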
# Split each generated dataset into train/test files, in three document orders
# (random, sorted by magnitude, reverse-sorted), and also write 385-dim
# "-transform" variants that reduce maximum-inner-product search to euclidean
# search via an extra padding dimension.
import numpy as np

DATA_SETS = [
    {"name": "uniform", "files": ["uniform.npy"]},
    {"name": "normal_1_1", "files": ["normal_1_1.npy"]},
    {"name": "normal_1_2", "files": ["normal_1_2.npy"]},
    {"name": "gamma_1_1", "files": ["gamma_1_1.npy"]},
    {"name": "gamma_2_2", "files": ["gamma_2_2.npy"]},
    {"name": "pareto", "files": ["pareto.npy"]},
    {"name": "bimodal_9", "files": ["bimodal_9.npy"]},
    {"name": "bimodal_5", "files": ["bimodal_5.npy"]},
]


def transform_queries(Q):
    # Pad each query with a zero 385th component.
    n, _ = Q.shape
    return np.concatenate([Q, np.zeros((n, 1))], axis=-1, dtype=np.float32)


def transform_docs(D, norms):
    # Pad each document with sqrt(max_norm^2 - ||d||^2) so that euclidean
    # distance on the padded vectors ranks like inner product on the originals.
    n, d = D.shape
    max_norm = norms.max()
    flipped_norms = np.copy(norms).reshape(n, 1)
    return np.concatenate([D, np.sqrt(max_norm**2 - flipped_norms**2)], axis=-1, dtype=np.float32)


def validate_array_match_upto_dim(arr1, arr2, dim_eq_upto):
    assert np.allclose(arr1[:dim_eq_upto], arr2[:dim_eq_upto]), "data sets are different"


def validate_dataset_match_upto_dim(arr1, arr2, dim_eq_upto):
    n1, d1 = arr1.shape
    n2, d2 = arr2.shape
    assert n1 == n2, f"Shapes do not match [{arr1.shape}] vs [{arr2.shape}]"
    for i in range(n1):
        validate_array_match_upto_dim(arr1[i], arr2[i], dim_eq_upto)


for data_set in DATA_SETS:
    name = data_set["name"]
    np_total = np.load(data_set["files"][0])
    assert np_total.shape == (522931, 384)
    assert np_total.dtype == np.float32
    assert np.isnan(np_total).sum() == 0
    # Have to convert to a list here to get the numpy ndarray's shape
    # correct later. There's probably a better way...
    flat_ds = list()
    for vec in np_total:
        flat_ds.append(vec)
    np_flat_ds = np.array(flat_ds)
    assert np_flat_ds.shape == (522931, 384)
    assert np_flat_ds.dtype == np.float32
    assert np.isnan(np_flat_ds).sum() == 0
    row_count = np_flat_ds.shape[0]
    query_count = 10_000
    training_rows = row_count - query_count
    print(f"{name} num rows: {training_rows}")

    # The trailing rows are held out as queries.
    transformed_queries = transform_queries(np_flat_ds[training_rows:])
    validate_dataset_match_upto_dim(transformed_queries, np_flat_ds[training_rows:], 384)
    with open(f"{name}-transform.test", "wb") as out_f:
        transformed_queries.tofile(out_f)
    with open(f"{name}.test", "wb") as out_f:
        np_flat_ds[training_rows:].tofile(out_f)

    magnitudes = np.linalg.norm(np_flat_ds[0:training_rows], axis=1)
    print("mean median var max min")
    print(f"{np.mean(magnitudes)} {np.median(magnitudes)} {np.var(magnitudes)} {np.max(magnitudes)} {np.min(magnitudes)}")
    indices = np.argsort(magnitudes)
    transformed_np_flat_ds = transform_docs(np_flat_ds[0:training_rows], magnitudes)
    validate_dataset_match_upto_dim(transformed_np_flat_ds, np_flat_ds[0:training_rows], 384)
    transformed_np_flat_ds_sorted = transformed_np_flat_ds[indices]
    np_flat_ds_sorted = np_flat_ds[indices]
    with open(f"{name}.random-transform.train", "wb") as out_f:
        transformed_np_flat_ds.tofile(out_f)
    with open(f"{name}.ordered-transform.train", "wb") as out_f:
        transformed_np_flat_ds_sorted.tofile(out_f)
    with open(f"{name}.reversed-transform.train", "wb") as out_f:
        np.flip(transformed_np_flat_ds_sorted, axis=0).tofile(out_f)
    with open(f"{name}.random.train", "wb") as out_f:
        np_flat_ds[0:training_rows].tofile(out_f)
    with open(f"{name}.reversed.train", "wb") as out_f:
        np.flip(np_flat_ds_sorted, axis=0).tofile(out_f)
    with open(f"{name}.ordered.train", "wb") as out_f:
        np_flat_ds_sorted.tofile(out_f)
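The 385th dimension is the standard reduction from maximum-inner-product search to euclidean nearest-neighbor search: padding each document d with sqrt(max_norm^2 - ||d||^2) and each query q with 0 gives ||q' - d'||^2 = ||q||^2 + max_norm^2 - 2(q . d), and since ||q|| and max_norm are constant for a given query, ascending euclidean distance on the padded vectors ranks exactly like descending inner product on the originals. A minimal self-contained check of that identity (toy random data, not the embeddings used above):

import numpy as np

rng = np.random.default_rng(0)
docs = rng.normal(size=(100, 4)) * rng.uniform(1.0, 3.0, size=(100, 1))
query = rng.normal(size=4)
norms = np.linalg.norm(docs, axis=1)
max_norm = norms.max()
# Pad documents with sqrt(max_norm^2 - ||d||^2) and the query with 0.
padded_docs = np.concatenate([docs, np.sqrt(max_norm**2 - norms**2).reshape(-1, 1)], axis=1)
padded_query = np.concatenate([query, [0.0]])
# Ascending euclidean distance on padded vectors == descending inner product on originals.
assert (np.argsort(np.linalg.norm(padded_docs - padded_query, axis=1)) == np.argsort(-docs @ query)).all()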