Skip to content

Instantly share code, notes, and snippets.

@up1
Last active January 31, 2025 08:54
Show Gist options
  • Save up1/981a4acc3a50fcb5882c021ee4354df9 to your computer and use it in GitHub Desktop.
Save up1/981a4acc3a50fcb5882c021ee4354df9 to your computer and use it in GitHub Desktop.
MongoDB Atlas local + Docker
$ docker compose up -d
$ docker compose ps
# Compose file for a single MongoDB Atlas Local container.
services:
  mongo:
    image: mongodb/mongodb-atlas-local
    ports:
      # host:container -- publishes the container's port 27017 on the host.
      - "27017:27017"
import pandas as pd
from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel
import os
# MongoDB settings: the connection string is read from the environment
# (None when MONGO_URI is unset); database and collection names are fixed.
MONGO_URI = os.getenv("MONGO_URI")
DATABASE_NAME = "demo_employees"
COLLECTION_NAME = "employees"
# Connect to MongoDB
def get_mongo_client(mongo_uri):
    """Create a MongoDB client and confirm the server actually responds.

    Args:
        mongo_uri: MongoDB connection string passed to ``MongoClient``.

    Returns:
        A ``MongoClient`` created with ``appname="demo-rag"`` that has
        answered a ``ping`` command.

    Raises:
        Exception: Whatever the driver raises when the ``ping`` command
            fails; the client is closed before the error propagates.
    """
    client = MongoClient(mongo_uri, appname="demo-rag")
    # MongoClient connects lazily, so constructing it does not prove the
    # server is reachable. A ping forces a round trip before we report success.
    try:
        client.admin.command("ping")
    except Exception:
        client.close()
        raise
    print("Connection to MongoDB successful")
    return client
# main function
if __name__ == "__main__":
    # Ingestion script: read employees from CSV, build an embedding per row,
    # replace the contents of the MongoDB collection, and make sure a vector
    # search index exists on the "embedding" field.

    # 0. Fail fast on missing configuration, before any slow embedding work.
    if not MONGO_URI:
        print("MONGO_URI not set in environment variables")
        raise SystemExit(1)

    # 1. Read the data from the CSV file into a DataFrame
    df_employees = pd.read_csv('employees.csv')

    # 2. Build the text representation of every employee (one string per row)
    df_employees['employee_string'] = df_employees.apply(create_employee_string, axis=1)

    # 3. Generate embeddings for all employees. If this fails there is no
    #    'embedding' column, so stop instead of ingesting documents that the
    #    vector index could never match.
    try:
        df_employees['embedding'] = df_employees['employee_string'].apply(get_embedding)
    except Exception as e:
        print(f"Error applying embedding function to DataFrame: {e}")
        raise SystemExit(1)
    print("Embeddings generated for employees")

    # 4. Connect to MongoDB
    mongo_client = get_mongo_client(MONGO_URI)

    # 5. Get the target database and collection
    db = mongo_client.get_database(DATABASE_NAME)
    collection = db.get_collection(COLLECTION_NAME)

    # 6. Replace the collection contents with the freshly built documents
    documents = df_employees.to_dict('records')
    collection.delete_many({})
    collection.insert_many(documents)

    # 7. Create the vector index, unless a previous run already did.
    #    delete_many() above removes documents but not search indexes, and
    #    creating an index whose name already exists is rejected by the server.
    index_name = "vector_index"
    if list(collection.list_search_indexes(name=index_name)):
        print(f"Search index '{index_name}' already exists; skipping creation")
    else:
        search_index_model = SearchIndexModel(
            definition={
                "fields": [
                    {
                        "type": "vector",
                        # NOTE(review): must equal the length of the vectors
                        # returned by get_embedding (not visible here) -- confirm.
                        "numDimensions": 1536,
                        "path": "embedding",
                        "similarity": "cosine",
                    }
                ]
            },
            name=index_name,
            type="vectorSearch",
        )
        result = collection.create_search_index(model=search_index_model)
        print(result)

    print("Data ingestion into MongoDB completed")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment