Skip to content

Instantly share code, notes, and snippets.

@hweller1
Created October 9, 2024 18:08
Show Gist options
  • Save hweller1/a9623f7872591f59cce93d4c1ee0bd6f to your computer and use it in GitHub Desktop.
Save hweller1/a9623f7872591f59cce93d4c1ee0bd6f to your computer and use it in GitHub Desktop.
User-space scalar quantization (SQ) of OpenAI text-embedding-ada-002 embeddings (indexing + querying)
# deps
# %pip install pymongo openai numpy
import numpy as np
import os
os.environ["OPENAI_API_KEY"] = '<my-openai-key>'
import pymongo
import openai
from bson.binary import BinaryVectorDtype, Binary
from pymongo.operations import SearchIndexModel
# Sample corpus: 20 short statements, repeated 10 times (200 strings in
# total) so there is a reasonable number of texts to embed.
sentences = 10 * [
    "Water boils at 100 degrees Celsius at standard atmospheric pressure.",
    "The Great Wall of China is visible from space.",
    "Photosynthesis converts light energy into chemical energy in plants.",
    "DNA contains the genetic instructions for all living organisms.",
    "Jupiter is the largest planet in our solar system.",
    "The Eiffel Tower was completed in Paris in 1889.",
    "Honey never spoils due to its unique chemical properties.",
    "The human body contains approximately 206 bones in adulthood.",
    "Mount Everest is the highest peak on Earth at 8,848m.",
    "Shakespeare wrote 37 plays and 154 sonnets during his lifetime.",
    "The speed of light in vacuum is 299,792,458 meters/second.",
    "Penguins are flightless birds found primarily in the Southern Hemisphere.",
    "The Mona Lisa was painted by Leonardo da Vinci.",
    "Oxygen makes up about 21% of Earth's atmosphere by volume.",
    "The American Civil War lasted from 1861 to 1865.",
    "Antibiotics are ineffective against viral infections like the common cold.",
    "The human heart beats approximately 100,000 times per day.",
    "Gold is one of the least reactive chemical elements.",
    "The first successful powered aircraft flew in 1903.",
    "There are 118 elements in the periodic table.",
]
# Connection settings; the connection string is a placeholder that has to be
# replaced before the script can reach a deployment.
conn_str = "<my-connection-str>"
client = pymongo.MongoClient(conn_str)
db_name = "bsontestdb"
collection_name = "bsontestcol_openai"
# Collection handle used by the ingest, index-creation and query steps below.
# Without this binding every later `col` reference raises NameError.
col = client[db_name][collection_name]

# Embed every sentence in one request; `.data` is the list of returned
# embedding objects (each exposes the raw floats as `.embedding`).
openai_embeddings = openai.embeddings.create(
    model="text-embedding-ada-002",
    input=sentences,
).data
def quantize_32bit_to_8bit(vector):
    """Quantize a float vector to int8 using per-vector max-abs scaling.

    The input is divided by its largest absolute component so every value
    lies in [-1, 1], multiplied by 127.5, rounded to the nearest integer and
    finally clipped to the int8 range [-128, 127]. Because of the 127.5
    factor, the largest-magnitude component lands on 127 when positive (after
    clipping) and on -128 when negative.

    Args:
        vector: Sequence or array of numbers convertible to float32.

    Returns:
        A ``numpy.ndarray`` of dtype ``int8`` with the same shape as the
        input. An all-zero input maps to all zeros; an empty input yields an
        empty int8 array.
    """
    values = np.asarray(vector, dtype=np.float32)

    # np.max raises ValueError on a zero-size array, so handle that up front.
    if values.size == 0:
        return values.astype(np.int8)

    # Step 1: normalize to [-1, 1]. An all-zero vector is passed through
    # unchanged to avoid dividing by zero.
    max_abs = np.max(np.abs(values))
    normalized = values / max_abs if max_abs != 0 else values

    # Steps 2-3: scale, then round to the nearest integer.
    rounded = np.round(normalized * 127.5)

    # Steps 4-5: clip into the representable int8 range and convert.
    return np.clip(rounded, -128, 127).astype(np.int8)
def generate_bson_vector_with_pymongo(vector, data_type: BinaryVectorDtype):
    """Pack *vector* into a BSON value via ``Binary.from_vector``.

    Args:
        vector: The numeric values to encode.
        data_type: ``BinaryVectorDtype`` member forwarded unchanged as the
            element type of the encoded vector.

    Returns:
        Whatever ``Binary.from_vector(vector, data_type)`` returns.
    """
    bson_vector = Binary.from_vector(vector, data_type)
    return bson_vector
# 1. INGEST + INDEX
import time  # only needed here, for the index-readiness poll below

# Build all documents first so they can be written in a single batch.
# NOTE(review): pairing by position assumes the embeddings come back in the
# same order as the input sentences -- confirm against the OpenAI API docs.
documents = []
for sentence, vector in zip(sentences, openai_embeddings):
    quantized_vector = quantize_32bit_to_8bit(vector.embedding)
    bson_vector = generate_bson_vector_with_pymongo(quantized_vector, BinaryVectorDtype.INT8)
    # Keep the source text under "data": the query's $project stage asks for
    # that field, so without it results carry only an _id and a score.
    documents.append({"data": sentence, "int8_openai_vector": bson_vector})
    print("Original vector:", vector.embedding)
    print("Quantized vector:", quantized_vector)
    print("BSON vector:", bson_vector)

# One insert_many round trip instead of one insert_one call per document.
# insert_many rejects an empty list, hence the guard.
if documents:
    col.insert_many(documents)

search_index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "int8_openai_vector",
                "numDimensions": 1536,
                "similarity": "dotProduct",
            }
        ]
    },
    name="myOpenAIbinDataIndex",
    type="vectorSearch",
)
result = col.create_search_index(model=search_index_model)
print(result)

# create_search_index returns as soon as the build is requested; the index
# cannot serve $vectorSearch until it reports itself queryable. Poll for that,
# but give up after a while so a failed build does not hang the script.
index_ready_deadline = time.monotonic() + 300
while True:
    index_info = list(col.list_search_indexes(result))
    if index_info and index_info[0].get("queryable") is True:
        break
    if time.monotonic() > index_ready_deadline:
        raise TimeoutError(f"search index {result!r} did not become queryable in time")
    time.sleep(5)
# 2. QUERY
def run_vector_search(query_bson_vector, col, index_name, num_candidates=10, limit=10):
    """Run a ``$vectorSearch`` aggregation and return the matches as a list.

    Args:
        query_bson_vector: Query vector, sent as the stage's ``queryVector``.
        col: Object exposing ``aggregate(pipeline)``; the pipeline is run
            against it.
        index_name: Name of the vector search index to use.
        num_candidates: Value sent as ``numCandidates``.
        limit: Value sent as ``limit``.

    Returns:
        A ``list`` of everything the aggregation yields. The pipeline
        projects each match to ``_id``, ``data`` and ``score`` (the
        ``vectorSearchScore`` metadata).
    """
    search_stage = {
        "$vectorSearch": {
            "index": index_name,
            "path": "int8_openai_vector",
            "numCandidates": num_candidates,
            "limit": limit,
            "queryVector": query_bson_vector,
        }
    }
    project_stage = {
        "$project": {
            "_id": 1,
            "data": 1,
            "score": {"$meta": "vectorSearchScore"},
        }
    }
    return list(col.aggregate([search_stage, project_stage]))
# Embed the query text with the same model used for the stored sentences.
query_response = openai.embeddings.create(
    model="text-embedding-ada-002",
    input=["this is a query"],
)
query_vector = query_response.data[0].embedding

# Quantize and pack the query exactly like the ingested vectors, then search.
quantized_vector = quantize_32bit_to_8bit(query_vector)
bindata_query_vector = generate_bson_vector_with_pymongo(
    quantized_vector,
    BinaryVectorDtype.INT8,
)
sentences_query_results = run_vector_search(
    bindata_query_vector,
    col,
    "myOpenAIbinDataIndex",
)
print(sentences_query_results)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment