hweller1 · October 9, 2024 18:08
diff --git a/scalar_quantize_openai.py b/scalar_quantize_openai.py
 # deps

 # %pip install pymongo openai numpy

 import numpy as np
 import os
 os.environ["OPENAI_API_KEY"] = '<my-openai-key>'
 import pymongo
 import openai

 from bson.binary import BinaryVectorDtype, Binary
 from pymongo.operations import SearchIndexModel

 # Sample corpus: 20 short sentences, repeated 10 times (200 inputs in total).
 sentences = [
     "Water boils at 100 degrees Celsius at standard atmospheric pressure.",
     "The Great Wall of China is visible from space.",
     "Photosynthesis converts light energy into chemical energy in plants.",
     "DNA contains the genetic instructions for all living organisms.",
     "Jupiter is the largest planet in our solar system.",
     "The Eiffel Tower was completed in Paris in 1889.",
     "Honey never spoils due to its unique chemical properties.",
     "The human body contains approximately 206 bones in adulthood.",
     "Mount Everest is the highest peak on Earth at 8,848m.",
     "Shakespeare wrote 37 plays and 154 sonnets during his lifetime.",
     "The speed of light in vacuum is 299,792,458 meters/second.",
     "Penguins are flightless birds found primarily in the Southern Hemisphere.",
     "The Mona Lisa was painted by Leonardo da Vinci.",
     "Oxygen makes up about 21% of Earth's atmosphere by volume.",
     "The American Civil War lasted from 1861 to 1865.",
     "Antibiotics are ineffective against viral infections like the common cold.",
     "The human heart beats approximately 100,000 times per day.",
     "Gold is one of the least reactive chemical elements.",
     "The first successful powered aircraft flew in 1903.",
     "There are 118 elements in the periodic table.",
 ] * 10

 # Placeholder: substitute a real MongoDB connection string before running.
 conn_str = "<my-connection-str>"

 client = pymongo.MongoClient(conn_str)
 db_name = "bsontestdb"
 collection_name = "bsontestcol_openai"
 # Collection handle that the ingest, index and query statements later in the
 # file refer to as `col`; without this binding they fail with NameError.
 col = client[db_name][collection_name]

 # Request embeddings for every sentence in a single call and keep `.data`.
 openai_embeddings = openai.embeddings.create(
     model="text-embedding-ada-002",
     input=sentences,
 ).data


 def quantize_32bit_to_8bit(vector):
     """Quantize a float vector to int8, scaled by its largest absolute value.

     The input is converted to a float32 NumPy array, divided by its largest
     magnitude (skipped when that magnitude is 0), multiplied by 127.5,
     rounded, clipped to [-128, 127] and cast to ``np.int8``.

     Note the mapping is not symmetric: the most positive component becomes
     127 (127.5 rounds to 128 and is then clipped), while the most negative
     component becomes -128.

     Args:
         vector: Sequence or array of numbers.

     Returns:
         ``np.ndarray`` of dtype int8 with the same shape as the input. An
         empty input yields an empty int8 array.
     """
     values = np.array(vector, dtype=np.float32)

     # np.max raises ValueError on a zero-size array, so short-circuit here.
     if values.size == 0:
         return values.astype(np.int8)

     # Normalize to [-1, 1]; an all-zero vector is left untouched to avoid 0/0.
     max_abs = np.max(np.abs(values))
     normalized = values / max_abs if max_abs != 0 else values

     # Scale, round to the nearest integer, then clamp into the int8 range.
     scaled = normalized * 127.5
     return np.clip(np.round(scaled), -128, 127).astype(np.int8)

 def generate_bson_vector_with_pymongo(vector, data_type: BinaryVectorDtype):
     """Return ``Binary.from_vector(vector, data_type)``.

     Args:
         vector: Values to pack into the BSON vector.
         data_type: ``BinaryVectorDtype`` member naming the element type.
     """
     packed = Binary.from_vector(vector, data_type)
     return packed

 # 1. INGEST + INDEX
 import time

 documents = []
 for vector in openai_embeddings:
     quantized_vector = quantize_32bit_to_8bit(vector.embedding)
     bson_vector = generate_bson_vector_with_pymongo(quantized_vector, BinaryVectorDtype.INT8)
     # Store the source sentence next to its vector: the query's $project stage
     # returns a "data" field, which was never written before. `vector.index`
     # is the position of this embedding's input in `sentences`.
     documents.append({
         "data": sentences[vector.index],
         "int8_openai_vector": bson_vector,
     })
     print("Original vector:", vector.embedding)
     print("Quantized vector:", quantized_vector)
     print("BSON vector:", bson_vector)

 # One bulk write instead of one round-trip per document; insert_many rejects
 # an empty list, so skip it when there is nothing to insert.
 if documents:
     col.insert_many(documents)

 search_index_model = SearchIndexModel(
     definition={
         "fields": [
             {
                 "type": "vector",
                 "path": "int8_openai_vector",
                 "numDimensions": 1536,
                 "similarity": "dotProduct",
             }
         ]
     },
     name="myOpenAIbinDataIndex",
     type="vectorSearch",
 )

 result = col.create_search_index(model=search_index_model)
 print(result)

 # The index is built asynchronously; poll until it reports queryable so the
 # query step does not run against an index that is still building.
 index_wait_deadline = time.monotonic() + 300
 while True:
     index_status = list(col.list_search_indexes(result))
     if index_status and index_status[0].get("queryable") is True:
         break
     if time.monotonic() >= index_wait_deadline:
         raise TimeoutError(f"Search index {result!r} was not queryable after 300 seconds")
     time.sleep(5)

 # 2. QUERY

 def run_vector_search(query_bson_vector, col, index_name, num_candidates=10, limit=10):
     """Run a ``$vectorSearch`` aggregation on *col* and return the results.

     Args:
         query_bson_vector: Query vector passed as ``queryVector``.
         col: Object exposing ``aggregate(pipeline)``.
         index_name: Name of the vector search index to query.
         num_candidates: Requested ``numCandidates``; raised to *limit* when
             smaller, because the stage does not accept fewer candidates than
             results to return.
         limit: Maximum number of results to return.

     Returns:
         A list of result documents projected to ``_id``, ``data`` and
         ``score`` (the ``vectorSearchScore`` metadata).
     """
     pipeline = [
         {
             "$vectorSearch": {
                 "index": index_name,
                 "path": "int8_openai_vector",
                 "numCandidates": max(num_candidates, limit),
                 "limit": limit,
                 "queryVector": query_bson_vector,
             }
         },
         {
             "$project": {
                 "_id": 1,
                 "data": 1,
                 "score": {"$meta": "vectorSearchScore"},
             }
         },
     ]
     return list(col.aggregate(pipeline))


 # Embed the query text with the same model used for the stored sentences.
 query_vector = (
     openai.embeddings
     .create(model="text-embedding-ada-002", input=["this is a query"])
     .data[0]
     .embedding
 )

 # Quantize and pack the query the same way as the stored vectors.
 quantized_vector = quantize_32bit_to_8bit(query_vector)
 bindata_query_vector = generate_bson_vector_with_pymongo(
     quantized_vector,
     BinaryVectorDtype.INT8,
 )

 sentences_query_results = run_vector_search(
     bindata_query_vector,
     col,
     "myOpenAIbinDataIndex",
 )
 print(sentences_query_results)
	# deps

	# %pip install pymongo openai numpy

	import numpy as np
	import os
	os.environ["OPENAI_API_KEY"] = '<my-openai-key>'
	import pymongo
	import openai

	from bson.binary import BinaryVectorDtype, Binary
	from pymongo.operations import SearchIndexModel

	# Sample corpus: 20 short sentences, repeated 10 times (200 inputs in total).
	sentences = [
	    "Water boils at 100 degrees Celsius at standard atmospheric pressure.",
	    "The Great Wall of China is visible from space.",
	    "Photosynthesis converts light energy into chemical energy in plants.",
	    "DNA contains the genetic instructions for all living organisms.",
	    "Jupiter is the largest planet in our solar system.",
	    "The Eiffel Tower was completed in Paris in 1889.",
	    "Honey never spoils due to its unique chemical properties.",
	    "The human body contains approximately 206 bones in adulthood.",
	    "Mount Everest is the highest peak on Earth at 8,848m.",
	    "Shakespeare wrote 37 plays and 154 sonnets during his lifetime.",
	    "The speed of light in vacuum is 299,792,458 meters/second.",
	    "Penguins are flightless birds found primarily in the Southern Hemisphere.",
	    "The Mona Lisa was painted by Leonardo da Vinci.",
	    "Oxygen makes up about 21% of Earth's atmosphere by volume.",
	    "The American Civil War lasted from 1861 to 1865.",
	    "Antibiotics are ineffective against viral infections like the common cold.",
	    "The human heart beats approximately 100,000 times per day.",
	    "Gold is one of the least reactive chemical elements.",
	    "The first successful powered aircraft flew in 1903.",
	    "There are 118 elements in the periodic table.",
	] * 10

	# Placeholder: substitute a real MongoDB connection string before running.
	conn_str = "<my-connection-str>"

	client = pymongo.MongoClient(conn_str)
	db_name = "bsontestdb"
	collection_name = "bsontestcol_openai"
	# Collection handle that the ingest, index and query statements later in the
	# file refer to as `col`; without this binding they fail with NameError.
	col = client[db_name][collection_name]

	# Request embeddings for every sentence in a single call and keep `.data`.
	openai_embeddings = openai.embeddings.create(
	    model="text-embedding-ada-002",
	    input=sentences,
	).data


	def quantize_32bit_to_8bit(vector):
	    """Quantize a float vector to int8, scaled by its largest absolute value.

	    The input is converted to a float32 NumPy array, divided by its largest
	    magnitude (skipped when that magnitude is 0), multiplied by 127.5,
	    rounded, clipped to [-128, 127] and cast to ``np.int8``.

	    Note the mapping is not symmetric: the most positive component becomes
	    127 (127.5 rounds to 128 and is then clipped), while the most negative
	    component becomes -128.

	    Args:
	        vector: Sequence or array of numbers.

	    Returns:
	        ``np.ndarray`` of dtype int8 with the same shape as the input. An
	        empty input yields an empty int8 array.
	    """
	    values = np.array(vector, dtype=np.float32)

	    # np.max raises ValueError on a zero-size array, so short-circuit here.
	    if values.size == 0:
	        return values.astype(np.int8)

	    # Normalize to [-1, 1]; an all-zero vector is left untouched to avoid 0/0.
	    max_abs = np.max(np.abs(values))
	    normalized = values / max_abs if max_abs != 0 else values

	    # Scale, round to the nearest integer, then clamp into the int8 range.
	    scaled = normalized * 127.5
	    return np.clip(np.round(scaled), -128, 127).astype(np.int8)

	def generate_bson_vector_with_pymongo(vector, data_type: BinaryVectorDtype):
	    """Return ``Binary.from_vector(vector, data_type)``.

	    Args:
	        vector: Values to pack into the BSON vector.
	        data_type: ``BinaryVectorDtype`` member naming the element type.
	    """
	    return Binary.from_vector(vector, data_type)

	# 1. INGEST + INDEX
	import time

	documents = []
	for vector in openai_embeddings:
	    quantized_vector = quantize_32bit_to_8bit(vector.embedding)
	    bson_vector = generate_bson_vector_with_pymongo(quantized_vector, BinaryVectorDtype.INT8)
	    # Store the source sentence next to its vector: the query's $project stage
	    # returns a "data" field, which was never written before. `vector.index`
	    # is the position of this embedding's input in `sentences`.
	    documents.append({
	        "data": sentences[vector.index],
	        "int8_openai_vector": bson_vector,
	    })
	    print("Original vector:", vector.embedding)
	    print("Quantized vector:", quantized_vector)
	    print("BSON vector:", bson_vector)

	# One bulk write instead of one round-trip per document; insert_many rejects
	# an empty list, so skip it when there is nothing to insert.
	if documents:
	    col.insert_many(documents)

	search_index_model = SearchIndexModel(
	    definition={
	        "fields": [
	            {
	                "type": "vector",
	                "path": "int8_openai_vector",
	                "numDimensions": 1536,
	                "similarity": "dotProduct",
	            }
	        ]
	    },
	    name="myOpenAIbinDataIndex",
	    type="vectorSearch",
	)

	result = col.create_search_index(model=search_index_model)
	print(result)

	# The index is built asynchronously; poll until it reports queryable so the
	# query step does not run against an index that is still building.
	index_wait_deadline = time.monotonic() + 300
	while True:
	    index_status = list(col.list_search_indexes(result))
	    if index_status and index_status[0].get("queryable") is True:
	        break
	    if time.monotonic() >= index_wait_deadline:
	        raise TimeoutError(f"Search index {result!r} was not queryable after 300 seconds")
	    time.sleep(5)

	# 2. QUERY

	def run_vector_search(query_bson_vector, col, index_name, num_candidates=10, limit=10):
	    """Run a ``$vectorSearch`` aggregation on *col* and return the results.

	    Args:
	        query_bson_vector: Query vector passed as ``queryVector``.
	        col: Object exposing ``aggregate(pipeline)``.
	        index_name: Name of the vector search index to query.
	        num_candidates: Requested ``numCandidates``; raised to *limit* when
	            smaller, because the stage does not accept fewer candidates than
	            results to return.
	        limit: Maximum number of results to return.

	    Returns:
	        A list of result documents projected to ``_id``, ``data`` and
	        ``score`` (the ``vectorSearchScore`` metadata).
	    """
	    pipeline = [
	        {
	            "$vectorSearch": {
	                "index": index_name,
	                "path": "int8_openai_vector",
	                "numCandidates": max(num_candidates, limit),
	                "limit": limit,
	                "queryVector": query_bson_vector,
	            }
	        },
	        {
	            "$project": {
	                "_id": 1,
	                "data": 1,
	                "score": {"$meta": "vectorSearchScore"},
	            }
	        },
	    ]
	    return list(col.aggregate(pipeline))


	# Embed the query text with the same model used for the stored sentences.
	query_vector = (
	    openai.embeddings
	    .create(model="text-embedding-ada-002", input=["this is a query"])
	    .data[0]
	    .embedding
	)

	# Quantize and pack the query the same way as the stored vectors.
	quantized_vector = quantize_32bit_to_8bit(query_vector)
	bindata_query_vector = generate_bson_vector_with_pymongo(
	    quantized_vector,
	    BinaryVectorDtype.INT8,
	)

	sentences_query_results = run_vector_search(
	    bindata_query_vector,
	    col,
	    "myOpenAIbinDataIndex",
	)
	print(sentences_query_results)