Skip to content

Instantly share code, notes, and snippets.

View benwtrent's full-sized avatar
🏠
Working from home

Benjamin Trent benwtrent

🏠
Working from home
View GitHub Profile
package org.apache.pylucene.codecs;
import org.apache.lucene.codecs.lucene100.Lucene100Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
public class PyLucene100Codec extends Lucene100Codec {
private long pythonObject;
public void pythonExtension(long pythonObject){
this.pythonObject = pythonObject;
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/
package org.elasticsearch.benchmark.vector;
@benwtrent
benwtrent / msmarco-dataset
Created October 14, 2024 11:19
msmarco numbers
------------------------------------------------------
_______ __ _____
/ ____(_)___ ____ _/ / / ___/_________ ________
/ /_ / / __ \/ __ `/ / \__ \/ ___/ __ \/ ___/ _ \
/ __/ / / / / / /_/ / / ___/ / /__/ /_/ / / / __/
/_/ /_/_/ /_/\__,_/_/ /____/\___/\____/_/ \___/
------------------------------------------------------
2024-10-14 08:28:58,549 ActorAddr-(T|:35387)/PID:7662 esrally.reporter INFO | Metric | Task | Value | Unit |
|---------------------------------------------------------------:|---------------------------------------------------:|----------------:|-------:|
@benwtrent
benwtrent / dense_vector_hnsw.txt
Created October 10, 2024 16:57
bbq local runs
------------------------------------------------------
_______ __ _____
/ ____(_)___ ____ _/ / / ___/_________ ________
/ /_ / / __ \/ __ `/ / \__ \/ ___/ __ \/ ___/ _ \
/ __/ / / / / / /_/ / / ___/ / /__/ /_/ / / / __/
/_/ /_/_/ /_/\__,_/_/ /____/\___/\____/_/ \___/
------------------------------------------------------
| Metric | Task | Baseline | Contender | Diff | Unit | Diff % |
|--------------------------------------------------------------:|---------------------------------------------:|---------------:|-----------------:|--------------:|-------:|---------:|
@benwtrent
benwtrent / RecallTest.java
Created September 18, 2024 19:06
tools for reading and testing vector files
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
@benwtrent
benwtrent / test-bq-quantization.ipynb
Created July 19, 2024 20:02
simple notebook for vector testing
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@benwtrent
benwtrent / data_load_and_encode.py
Created February 16, 2024 16:09
Stupid binary encoding tests
import numpy as np
import pyarrow.parquet as pq
from sklearn.neighbors import NearestNeighbors
# load data/%d-en.parquet files into a single numpy matrix
# vector dimensions are 1024
# load data
tbls = []
for i in range(10):
@benwtrent
benwtrent / cohere_data.py
Last active January 31, 2024 15:26
download and format cohere data
import pyarrow.parquet as pq
import numpy as np
DATA_SETS =[
{"name": "wiki768", "files": [
"train-00000-of-00004-1a1932c9ca1c7152.parquet",
"train-00001-of-00004-f4a4f5540ade14b4.parquet",
"train-00002-of-00004-ff770df3ab420d14.parquet",
"train-00003-of-00004-85b3dbbc960e92ec.parquet",
@benwtrent
benwtrent / knnPerf.py
Last active August 11, 2023 00:10
Code used to create distributed data
#!/usr/bin/env/python
import os
import subprocess
import benchUtil
import constants
LUCENE_CHECKOUT = 'lucene_candidate'
# test parameters. This script will run KnnGraphTester on every combination of these parameters
@benwtrent
benwtrent / ByteBufferFloatDecodeLatencyBenchmark.java
Last active June 8, 2023 14:51
Decoding ByteBuffers into Floats microbenchmark
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;