Created
July 20, 2023 13:34
-
-
Save janakiramm/5977e79ce11c24b481820b43be4ac57a to your computer and use it in GitHub Desktop.
Implementing RAG with OpenAI
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import functools

import openai
import pandas as pd
import tiktoken
from scipy import spatial
# --- Load and prepare the 95th Academy Awards (2023 ceremony) dataset ---

EMBEDDING_MODEL = "text-embedding-ada-002"


def text_embedding(text: str) -> list[float]:
    """Return the ada-002 embedding vector for *text* via the OpenAI API.

    Fixes a NameError: the original script called ``text_embedding`` below
    without ever defining it. Uses the same model and response shape as
    ``strings_ranked_by_relatedness``.
    """
    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=text)
    return response["data"][0]["embedding"]


df = pd.read_csv('./data/oscars.csv')
print(df.head())

# Keep only rows from the 2023 ceremony and drop entries with no film listed.
df = df.loc[df['year_ceremony'] == 2023]
df = df.dropna(subset=['film'])
df['category'] = df['category'].str.lower()
df.head()

# Build one natural-language sentence per nomination; winners and non-winners
# get different phrasings so the model can answer "who won" questions.
df['text'] = df['name'] + ' got nominated under the category, ' + df['category'] + ', for the film ' + df['film'] + ' to win the award'
df.loc[df['winner'] == False, 'text'] = df['name'] + ' got nominated under the category, ' + df['category'] + ', for the film ' + df['film'] + ' but did not win'
print(df.head()['text'])

# Embed every nomination sentence up front (one API call per row).
df = df.assign(embedding=df["text"].apply(text_embedding))
print(df.head())
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100,
) -> tuple[list[str], list[float]]:
    """Return up to *top_n* texts from *df* ranked by relatedness to *query*.

    Parameters
    ----------
    query : str
        Natural-language query; embedded via the OpenAI API.
    df : pd.DataFrame
        Must have a 'text' column and an 'embedding' column of vectors.
    relatedness_fn : callable
        Similarity between two embedding vectors (default: cosine similarity).
    top_n : int
        Maximum number of results to return.

    Returns
    -------
    (strings, relatednesses) : two parallel lists, most-related first.
    """
    EMBEDDING_MODEL = "text-embedding-ada-002"
    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]
    strings_and_relatednesses = [
        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
        for _, row in df.iterrows()
    ]
    # Guard: zip(*[]) raises ValueError, so an empty DataFrame previously
    # crashed here instead of returning no results.
    if not strings_and_relatednesses:
        return [], []
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    # Convert zip's tuples to lists to match the declared return type.
    return list(strings[:top_n]), list(relatednesses[:top_n])
# Sanity check: show the three nominations most related to "Lady Gaga".
strings, relatednesses = strings_ranked_by_relatedness("Lady Gaga", df, top_n=3)
for string, relatedness in zip(strings, relatednesses):
    print(f"{relatedness=:.3f}")
    # `display` is an IPython/Jupyter-only builtin and raises NameError in a
    # plain script; print() emits the same text everywhere.
    print(string)
@functools.lru_cache(maxsize=None)
def _encoding_for(model: str):
    """Cache the tiktoken encoding per model name.

    ``encoding_for_model`` was previously re-resolved on every call, and
    ``num_tokens`` is invoked once per candidate snippet inside
    query_message's budget loop.
    """
    return tiktoken.encoding_for_model(model)


def num_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
    """Return the number of tokens *text* encodes to for *model*.

    *model* defaults to "gpt-3.5-turbo", preserving the original behaviour;
    the new optional parameter makes the budget check reusable for other
    chat models.
    """
    return len(_encoding_for(model).encode(text))
def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the RAG prompt: instructions, then as many relevant Oscar
    snippets as fit within *token_budget* tokens, then the user's question.

    NOTE(review): *model* is accepted but never used here — num_tokens()
    hardcodes its own model name; confirm whether it should be threaded
    through.
    """
    strings, relatednesses = strings_ranked_by_relatedness(query, df)
    introduction = 'Use the below content related to 95th Oscar awards to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer."'
    question = f"\n\nQuestion: {query}"
    message = introduction
    for snippet in strings:
        section = f'\n\nOscar database section:\n"""\n{snippet}\n"""'
        # Stop once adding the next snippet would exceed the token budget.
        if num_tokens(message + section + question) > token_budget:
            break
        message += section
    return message + question
def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = "gpt-3.5-turbo",
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer *query* about the 95th Oscars via retrieval-augmented chat.

    Assembles a context-stuffed prompt from *df*, sends it to the chat
    model at temperature 0, and returns the assistant's reply text. Set
    *print_message* to echo the full prompt for debugging.
    """
    prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)
    response = openai.ChatCompletion.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": "You answer questions about 95th Oscar awards."},
            {"role": "user", "content": prompt},
        ],
    )
    return response["choices"][0]["message"]["content"]
# Demo queries exercising the full RAG pipeline end to end.
for question in (
    'What was the nomination from Lady Gaga for the 95th Oscars?',
    'What were the nominations for the music awards?',
):
    print(ask(question))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment