-
-
Save ranfysvalle02/d81289fa9539febce57d2774c68b6ab2 to your computer and use it in GitHub Desktop.
import pymongo | |
import os | |
from openai import AzureOpenAI | |
# Connection settings.
# Each value is taken from the environment variable of the same name when it
# is set, so real credentials never have to be written into this file.
# Otherwise the placeholder below is used and must be replaced with your
# actual value before running.
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "https://DEMO.openai.azure.com")
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "DEMO")
# The name of your model deployment
deployment_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4-32k")
MDB_URI = os.environ.get("MDB_URI", "")

# Authenticate and create client
az_client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version="2023-03-15-preview",
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)

# Connect to your MongoDB instance
mdb_client = pymongo.MongoClient(MDB_URI)
db = mdb_client["sample_mflix"]
movies_collection = db["movies"]
# Fetch a bounded sample to paste into the prompt: movies whose IMDb rating is
# strictly between 5 and 10, highest rated first, capped at 500 documents and
# projected down to rating, genres, year and title.
rating_filter = {"imdb.rating": {"$gt": 5, "$lt": 10}}
projection = {"imdb.rating": 1, "genres": 1, "_id": 0, "year": 1, "title": 1}
cursor = movies_collection.find(rating_filter, projection)
cursor = cursor.sort("imdb.rating", pymongo.DESCENDING).limit(500)
MOVIE_DATA = list(cursor)

# The question sent to the model, including the response format it should use.
QUESTION = """
Which movie genres have the highest average ratings{ imdb.rating }?
[response format]
Genre: { genre }, Average Rating: { avgRating }, Count: { count }
WHAT IS THE GENRE WITH THE HIGHEST AVERAGE RATING?
"""

# Generate text using your deployed model: the raw documents are wrapped in
# start/end markers and followed by the question.
prompt = "".join(
    (
        "[mongodb atlas collection start]\n\n",
        str(MOVIE_DATA),
        "[mongodb atlas collection end]\n\n USING ONLY THE PREVIOUS MongoDB data, answer the following question: ",
        QUESTION,
    )
)
user_message = {"role": "user", "content": prompt}
answer = az_client.chat.completions.create(
    model=deployment_name,
    messages=[user_message],
    stop=None,
    temperature=0,
    stream=False,
)

print("Which movie genres have the highest average ratings{ imdb.rating }?")
print(answer.choices[0].message.content)
print("NOW FOR THE AGGREGATION PIPELINE MAGIC")

# How many of the top-rated genres are handed to the model.
# The recorded output at the end of this section lists five genres, which a
# single-document result cannot produce, so the pipeline keeps the top five.
TOP_GENRE_COUNT = 5

# Let the database do the math instead of the model.
agg_pipeline_magic = [
    # Emit one document per element of each movie's `genres` array.
    {"$unwind": "$genres"},
    # Per genre: average the IMDb rating and count the unwound documents.
    {
        "$group": {
            "_id": "$genres",
            "avgRating": {"$avg": "$imdb.rating"},
            "count": {"$sum": 1},
        }
    },
    # Highest average first, then keep only the leading genres.
    {"$sort": {"avgRating": -1}},
    {"$limit": TOP_GENRE_COUNT},
]
MOVIE_DATA = list(movies_collection.aggregate(agg_pipeline_magic))

# Same prompt layout as before, but now carrying the pre-aggregated result
# instead of raw movie documents.
prompt = (
    "[mongodb atlas collection start]\n\n"
    + str(MOVIE_DATA)
    + "[mongodb atlas collection end]\n\n USING ONLY THE PREVIOUS MongoDB data, answer the following question: "
    + QUESTION
)
answer = az_client.chat.completions.create(
    model=deployment_name,
    messages=[{"role": "user", "content": prompt}],
    stop=None,
    temperature=0,
    stream=False,
)
print(answer.choices[0].message.content)

# CORRECT ANSWER -- USING AGG MAGIC
# Genre: Film-Noir, Average Rating: 7.397402597402598, Count: 77
# Genre: Short, Average Rating: 7.377574370709382, Count: 442
# Genre: Documentary, Average Rating: 7.365679824561403, Count: 1834
# Genre: News, Average Rating: 7.252272727272728, Count: 44
# Genre: History, Average Rating: 7.1696100917431185, Count: 874
ranfysvalle02
commented
Mar 31, 2024
•
MongoDB + GenAI is more than just vectors
While the Mongo Query Language is pretty powerful by itself, given the current state of LLMs it is still not enough for some tasks.
Example: collection: sample_mflix
Which movie genres have the highest average ratings{ imdb.rating }?
Try this and you'll quickly run into context-window limitations. You can get clever and shrink the input just to get some response -- for example:
MOVIE_DATA = list( movies_collection.find({"imdb.rating":{"$gt":5,"$lt":10}}, {"imdb.rating":1,"genres":1,"_id":0,"year":1,"title":1}).sort("imdb.rating",pymongo.DESCENDING).limit(500))
Doing something like the above ensures the data fits into the context window, but answering the question still requires a level of math and critical reasoning that really highlights the limitations of this technology in its current state (we are in the 'dial-up' phase of this technology).
The MongoDB Aggregation Framework provides several operators that can be challenging to implement with SQL, such as $unwind, $group, $bucket, and $graphLookup. These operators allow for complex data manipulation and analysis, which can be a significant differentiator for GenAI by enabling more sophisticated data processing and insights. You can also do all the math using the Agg Framework, reducing the chance for the LLM to get confused! You can also mathematically verify your results to make sure the output is correct!
- $unwind: This operator deconstructs an array field from the input documents to output a document for each element. Each output document replaces the array with an element value. In SQL, this would require a complex combination of JOINs and UNIONs.
- $group: This operator groups input documents by a specified identifier expression and applies the accumulator expression(s) to each group. Implementing this in SQL can be cumbersome, especially when dealing with non-trivial grouping conditions.
- $bucket: This operator categorizes incoming documents into groups, called buckets, based on a specified expression and bucket boundaries. While SQL has grouping functions, they are not as flexible or as easy to use as $bucket.
- $graphLookup: This operator performs a recursive search on a collection, with options for depth and breadth. This is particularly useful for dealing with hierarchical or graph-based data. SQL would require recursive common table expressions (CTEs) to achieve similar functionality, which can be complex and less performant.