Code used to create vector datasets with varying magnitude distributions, and to benchmark them with Lucene's KnnGraphTester. Three scripts follow: the benchmark driver, a generator that rescales unit-norm embeddings with sampled random factors, and a script that writes the raw train/test files (including "-transform" variants that reduce inner-product search to Euclidean search in d+1 dimensions).
#!/usr/bin/env python
import os
import subprocess
import benchUtil
import constants
LUCENE_CHECKOUT = 'lucene_candidate'
# Test parameters. This script runs KnnGraphTester on every combination of these parameters.
VALUES = {
    'ndoc': (100000,),  # 200000),
    'maxConn': (48,),
    'beamWidthIndex': (200,),
    'fanout': (0, 10, 50, 90, 190, 490, 590, 990),  # , 250),
    'topK': (10,),
}
def advance(ix, values):
    # Odometer-style increment over the parameter grid: bump the right-most
    # index that still has room, resetting exhausted positions back to 0.
    # Returns False once every combination has been visited.
    for i in reversed(range(len(ix))):
        param = list(values.keys())[i]
        if ix[i] == len(values[param]) - 1:
            ix[i] = 0
        else:
            ix[i] += 1
            return True
    return False
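# For example, with VALUES above the loop visits the Cartesian product in
# odometer order, with 'fanout' (the last multi-valued parameter) varying fastest:
#   {'ndoc': 100000, 'maxConn': 48, 'beamWidthIndex': 200, 'fanout': 0,  'topK': 10}
#   {'ndoc': 100000, 'maxConn': 48, 'beamWidthIndex': 200, 'fanout': 10, 'topK': 10}
#   ... and so on through 'fanout': 990.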
def run_knn_benchmark(checkout, values, training_file, testing_file, dims, metric):
    indexes = [0] * len(values.keys())
    # Start one step before the first combination so the first advance() lands on it.
    indexes[-1] = -1
    args = []
    print(f"\n\n\nNow running {training_file}\n\n\n")
    dim = dims  # 768
    doc_vectors = training_file  # '%s/util/wiki768ja.random.train' % constants.BASE_DIR  # constants.GLOVE_VECTOR_DOCS_FILE
    query_vectors = testing_file  # '%s/util/wiki768ja.test' % constants.BASE_DIR  # '%s/util/tasks/vector-task-100d.vec' % constants.BASE_DIR
    cp = benchUtil.classPathToString(benchUtil.getClassPath(checkout))
    JAVA_EXE = '/Users/benjamintrent/Library/Java/JavaVirtualMachines/jdk-20.0.1.jdk/Contents/Home/bin/java'
    cmd = [JAVA_EXE,
           '-cp', cp,
           '--add-modules', 'jdk.incubator.vector',
           '-Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false',
           'KnnGraphTester']
    print("recall\tlatency\tnDoc\tfanout\tmaxConn\tbeamWidth\tvisited\tindex ms")
    while advance(indexes, values):
        pv = {}
        args = []
        for (i, p) in enumerate(list(values.keys())):
            if p in values:
                if values[p]:
                    value = values[p][indexes[i]]
                    pv[p] = value
                else:
                    args += ['-' + p]
        # Expand the current parameter combination into CLI flags,
        # e.g. {'ndoc': 100000} -> ['-ndoc', '100000'].
        args += [a for (k, v) in pv.items() for a in ('-' + k, str(v)) if a]
        this_cmd = cmd + args + [
            '-dim', str(dim),
            '-docs', doc_vectors,
            # '-stats',
            # '-reindex',
            '-metric', metric,
            '-search', query_vectors,
            # '-forceMerge',
            # '-niter', str(3000),
            '-quiet',
        ]
        subprocess.run(this_cmd)
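# Each iteration therefore launches something along these lines (illustrative,
# classpath and file paths abbreviated):
#   java -cp ... --add-modules jdk.incubator.vector \
#       -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false \
#       KnnGraphTester -ndoc 100000 -maxConn 48 -beamWidthIndex 200 -fanout 0 \
#       -topK 10 -dim 384 -docs <train file> -metric angular -search <test file> -quiet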
test_names = ["normal_1_1", "normal_1_2", "pareto", "uniform", "bimodal_5", "bimodal_9", "gamma_1_1", "gamma_2_2"]
tests = []
for name in test_names:
tests.append((f"{constants.BASE_DIR}/util/{name}.ordered.train", f"{constants.BASE_DIR}/util/{name}.test", 384, "angular"))
tests.append((f"{constants.BASE_DIR}/util/{name}.random.train", f"{constants.BASE_DIR}/util/{name}.test", 384, "angular"))
tests.append((f"{constants.BASE_DIR}/util/{name}.reversed.train", f"{constants.BASE_DIR}/util/{name}.test", 384, "angular"))
tests.append((f"{constants.BASE_DIR}/util/{name}.ordered-transform.train", f"{constants.BASE_DIR}/util/{name}-transform.test", 385, "euclidean"))
tests.append((f"{constants.BASE_DIR}/util/{name}.random-transform.train", f"{constants.BASE_DIR}/util/{name}-transform.test", 385, "euclidean"))
tests.append((f"{constants.BASE_DIR}/util/{name}.reversed-transform.train", f"{constants.BASE_DIR}/util/{name}-transform.test", 385, "euclidean"))
for (training_file, testing_file, dims, metric) in tests:
run_knn_benchmark(LUCENE_CHECKOUT, VALUES, training_file, testing_file, dims, metric)
import numpy as np
# Load the data
table = np.load('data/embeddings.npy')
# check that all vectors are unit vectors
assert np.allclose(np.linalg.norm(table, axis=1), 1)
# scale all vector magnitudes via a uniform distribution
uniform = (np.random.uniform(0, 1, table.shape) * 10) * table
# linearly scale all vector magnitudes via a normal distribution
normal_1 = (np.random.normal(loc=1.0, scale=0.1, size=table.shape) * 10) * table
normal_2 = (np.random.normal(loc=1.0, scale=0.2, size=table.shape) * 10) * table
# scale all vector magnitudes via a gamma distribution
gamma_1 = (np.random.gamma(1, 1, table.shape) * 10) * table
gamma_2 = (np.random.gamma(2, 2, table.shape) * 10) * table
# scale all vector magnitudes via a Pareto distribution
pareto_5 = (np.random.pareto(5, table.shape) * 10) * table
# weighted combinations of two normal samples (90/10 and 50/50)
bimodal_9 = (((np.random.normal(loc=1, scale=0.2, size=table.shape) * 0.9) + (np.random.normal(loc=3, scale=0.2, size=table.shape) * 0.1)) * 10) * table
bimodal_5 = (((np.random.normal(loc=1, scale=0.2, size=table.shape) * 0.5) + (np.random.normal(loc=3, scale=0.2, size=table.shape) * 0.5)) * 10) * table
def save_to_file(filename, dataset):
    np.save(filename, dataset.astype(np.float32), allow_pickle=False)

for ds in [(uniform, "uniform"), (normal_1, "normal_1_1"), (normal_2, "normal_1_2"), (gamma_1, "gamma_1_1"), (gamma_2, "gamma_2_2"), (pareto_5, "pareto"), (bimodal_9, "bimodal_9"), (bimodal_5, "bimodal_5")]:
    save_to_file("data/{}.npy".format(ds[1]), ds[0])
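# Optional sanity check (not in the original gist): print the spread of vector
# magnitudes each scaled dataset ends up with.
for arr, label in [(uniform, "uniform"), (normal_1, "normal_1_1"), (pareto_5, "pareto"), (bimodal_5, "bimodal_5")]:
    norms = np.linalg.norm(arr, axis=1)
    print(f"{label}: mean={norms.mean():.3f} median={np.median(norms):.3f} max={norms.max():.3f}")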
import numpy as np
DATA_SETS = [
    {"name": "uniform", "files": ["uniform.npy"]},
    {"name": "normal_1_1", "files": ["normal_1_1.npy"]},
    {"name": "normal_1_2", "files": ["normal_1_2.npy"]},
    {"name": "gamma_1_1", "files": ["gamma_1_1.npy"]},
    {"name": "gamma_2_2", "files": ["gamma_2_2.npy"]},
    {"name": "pareto", "files": ["pareto.npy"]},
    {"name": "bimodal_9", "files": ["bimodal_9.npy"]},
    {"name": "bimodal_5", "files": ["bimodal_5.npy"]},
]
def transform_queries(Q):
    # Queries get a zero appended so they match the (d+1)-dim transformed docs.
    n, _ = Q.shape
    return np.concatenate([Q, np.zeros((n, 1))], axis=-1, dtype=np.float32)

def transform_docs(D, norms):
    # Docs get sqrt(max_norm^2 - ||d||^2) appended as an extra dimension.
    n, d = D.shape
    max_norm = norms.max()
    flipped_norms = np.copy(norms).reshape(n, 1)
    transformed_data = np.concatenate([D, np.sqrt(max_norm**2 - flipped_norms**2)], axis=-1, dtype=np.float32)
    return transformed_data
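# Why this transform works (the standard MIPS-to-Euclidean reduction): with
# q' = [q, 0] and d' = [d, sqrt(M^2 - ||d||^2)], where M is the max doc norm,
#   ||q' - d'||^2 = ||q||^2 + M^2 - 2 * <q, d>
# The first two terms are constant for a given query, so nearest neighbors
# under Euclidean distance on the transformed vectors come back in the same
# order as maximum-inner-product neighbors on the originals.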
def validate_array_match_upto_dim(arr1, arr2, dim_eq_upto):
    assert np.allclose(arr1[:dim_eq_upto], arr2[:dim_eq_upto]), "data sets are different"

def validate_dataset_match_upto_dim(arr1, arr2, dim_eq_upto):
    n1, d1 = arr1.shape
    n2, d2 = arr2.shape
    assert n1 == n2, f"Shape does not match [{arr1.shape}] vs [{arr2.shape}]"
    for i in range(n1):
        validate_array_match_upto_dim(arr1[i], arr2[i], dim_eq_upto)
for data_set in DATA_SETS:
    name = data_set["name"]
    np_total = np.load(data_set["files"][0])
    assert np_total.shape == (522931, 384)
    assert np_total.dtype == np.float32
    assert np.isnan(np_total).sum() == 0
    # Have to convert to a list here to get the numpy ndarray's shape correct
    # later. There's probably a better way...
    flat_ds = list()
    for vec in np_total:
        flat_ds.append(vec)
    np_flat_ds = np.array(flat_ds)
    assert np_flat_ds.shape == (522931, 384)
    assert np_flat_ds.dtype == np.float32
    assert np.isnan(np_flat_ds).sum() == 0
    row_count = np_flat_ds.shape[0]
    query_count = 10_000
    training_rows = row_count - query_count
    print(f"{name} num rows: {training_rows}")
    # The last query_count rows become the test queries.
    transformed_queries = transform_queries(np_flat_ds[training_rows:])
    validate_dataset_match_upto_dim(transformed_queries, np_flat_ds[training_rows:], 384)
    with open(f"{name}-transform.test", "wb") as out_f:
        transformed_queries.tofile(out_f)
    with open(f"{name}.test", "wb") as out_f:
        np_flat_ds[training_rows:].tofile(out_f)
    magnitudes = np.linalg.norm(np_flat_ds[0:training_rows], axis=1)
    print("mean median var max min")
    print(f"{np.mean(magnitudes)} {np.median(magnitudes)} {np.var(magnitudes)} {np.max(magnitudes)} {np.min(magnitudes)}")
    indices = np.argsort(magnitudes)
    transformed_np_flat_ds = transform_docs(np_flat_ds[0:training_rows], magnitudes)
    validate_dataset_match_upto_dim(transformed_np_flat_ds, np_flat_ds[0:training_rows], 384)
    transformed_np_flat_ds_sorted = transformed_np_flat_ds[indices]
    np_flat_ds_sorted = np_flat_ds[indices]
    # Write each training set three ways: original order ("random"), sorted by
    # ascending magnitude ("ordered"), and sorted descending ("reversed").
    with open(f"{name}.random-transform.train", "wb") as out_f:
        transformed_np_flat_ds.tofile(out_f)
    with open(f"{name}.ordered-transform.train", "wb") as out_f:
        transformed_np_flat_ds_sorted.tofile(out_f)
    with open(f"{name}.reversed-transform.train", "wb") as out_f:
        np.flip(transformed_np_flat_ds_sorted, axis=0).tofile(out_f)
    with open(f"{name}.random.train", "wb") as out_f:
        np_flat_ds[0:training_rows].tofile(out_f)
    with open(f"{name}.reversed.train", "wb") as out_f:
        np.flip(np_flat_ds_sorted, axis=0).tofile(out_f)
    with open(f"{name}.ordered.train", "wb") as out_f:
        np_flat_ds_sorted.tofile(out_f)
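For reference, a minimal sketch (not part of the original gist; load_vectors is a hypothetical helper) for reading the written files back into numpy. The plain sets hold 384 floats per vector, the "-transform" sets 385:

import numpy as np

def load_vectors(path, dims):
    # The .train/.test files above are headerless raw float32, one vector
    # after another, so fromfile + reshape recovers the matrix.
    return np.fromfile(path, dtype=np.float32).reshape(-1, dims)

# e.g.:
# queries = load_vectors("uniform.test", 384)
# docs = load_vectors("uniform.ordered-transform.train", 385)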