import os
import re
import json

import autogen
import autogen.retrieve_utils as retrieve_utils
import chromadb
import feedparser
import requests
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
from autogen.agentchat.contrib.teachable_agent import TeachableAgent

from src.lib.termination_msg import term_eom
# Path for the JSON file that stores structured notes and read status.
STRUCTURED_NOTES_DB_PATH = "structured_notes_db.json"


def load_structured_notes():
    """Load structured notes from the JSON file, returning {} if it does not exist yet."""
    if not os.path.exists(STRUCTURED_NOTES_DB_PATH):
        return {}
    with open(STRUCTURED_NOTES_DB_PATH, "r") as file:
        return json.load(file)


def save_structured_notes(notes_db):
    """Save structured notes to the JSON file."""
    with open(STRUCTURED_NOTES_DB_PATH, "w") as file:
        json.dump(notes_db, file, indent=4)
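
# A minimal round-trip sketch of how the notes DB is used (filename and note
# text are hypothetical):
#
#     notes_db = load_structured_notes()
#     notes_db["example_paper.pdf"] = {"notes": "structured notes here", "read": True}
#     save_structured_notes(notes_db)
#     assert load_structured_notes()["example_paper.pdf"]["read"] is True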
class ArxivAgent(autogen.agentchat.Agent):
    def search_by_date(self, start_date, end_date, query, max_results=10):
        """
        Search arXiv for papers published between start_date and end_date with a specific query.

        :param start_date: The start date for the search in the format YYYY-MM-DD.
        :param end_date: The end date for the search in the format YYYY-MM-DD.
        :param query: The query to search for.
        :param max_results: The maximum number of results to return.
        :return: A list of papers that match the query and were published between the start and end dates.
        """
        base_url = "http://export.arxiv.org/api/query?"
        # Note: arXiv's API generally expects submittedDate bounds as
        # timestamps (YYYYMMDDHHMM), so YYYY-MM-DD values may need converting
        # before being interpolated here.
        search_query = (
            f"search_query={query}+AND+submittedDate:[{start_date}+TO+{end_date}]"
        )
        start = 0
        url = f"{base_url}{search_query}&start={start}&max_results={max_results}"
        response = requests.get(url)
        feed = feedparser.parse(response.content)
        papers = [
            {
                "title": entry.title,
                "link": entry.link,
                "summary": entry.summary,
                "date": entry.published,
                "category": entry.arxiv_primary_category["term"]
                if "arxiv_primary_category" in entry
                else entry.tags[0]["term"],
            }
            for entry in feed.entries
        ]
        return papers
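
    # Example (hypothetical query): search_by_date("2024-01-01", "2024-02-01", "all:agents")
    # issues a GET request shaped like:
    #   http://export.arxiv.org/api/query?search_query=all:agents+AND+submittedDate:[2024-01-01+TO+2024-02-01]&start=0&max_results=10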
    # Seed used by autogen's LLM cache so repeated runs are deterministic.
    seed = "arxiv"
    def __init__(
        self,
        name: str,
        llm_config: dict = None,
        human_input_mode="TERMINATE",
        code_execution_config={"work_dir": "arxiv"},
        is_termination_msg=term_eom,
    ):
        # Avoid the mutable-default pitfall: build a fresh dict per instance.
        llm_config = dict(llm_config or {})
        # Use dict.setdefault so caller-supplied settings win over our defaults.
        llm_config.setdefault("seed", self.seed)
        self.seed = llm_config["seed"]
        llm_config["config_list"] = autogen.config_list_from_json(
            "OAI_CONFIG_LIST",
            filter_dict={
                "model": [
                    "gpt-4",
                    "gpt-4-0613",
                    "gpt-3.5-turbo",
                    "gpt-3.5-turbo-0613",
                    "gpt-3.5-turbo-16k",
                    "gpt-3.5-turbo-16k-0613",
                    "gpt-4-1106-preview",
                ],
            },
        )
        llm_config.setdefault("temperature", 0)
        llm_config.setdefault("model", "gpt-3.5-turbo-0613")
        # Expose the tool schemas defined below so the LLM can emit function calls.
        llm_config.setdefault(
            "functions",
            [
                self.queryFunction,
                self.downloadFunction,
                self.summarizeFunction,
                self.readPdfFunction,
            ],
        )
        system_message = """You are a research librarian tracking scientific papers.
You have several tasks you can complete:
- /chat: [default] chat with the user, answering questions about research you've read.
- /search: query for new papers on a topic with the query_arxiv function.
- /searchResults: summarize the results and print the Date, Title, Category, Arxiv Link, PDF Link, and Summary in markdown format.
- /download: download a pdf from a url with the download_pdf function.
- /read: open the pdf and extract the text using the read_pdf function. After you read the pdf, you must create tangible structured notes on the paper starting with the title, summary, key details, learnings, recommendations, and potential applications. Include critical details that we would need to recall in planning sessions when discussing future product ideas. The goal is to link cutting-edge research to product ideas.
- /summarize: summarize a paper into a short paragraph detailing its function, effects, and significance with the summarize_paper function.
- /notate: generate detailed structured notes on a paper with the write_notes function.
- /report: when provided research data, produce a report detailing the function, effects, and significance of all the research combined.
- /help: print this message.
- /terminate: terminate the conversation.
Once a command is complete, append a `TERMINATE` message to the end of the message to terminate the conversation.
The user cannot execute code directly. They must use the functions provided.
"""
        self.teach_config = {
            "verbosity": 1,  # 0 for basic info, 1 to add memory operations, 2 for analyzer messages, 3 for memo lists.
            "reset_db": False,  # Set to True to start over with an empty database.
            "path_to_db_dir": ".cache/research",  # Path to the directory where the database will be stored.
            "recall_threshold": 1.5,  # Higher numbers allow more (but less relevant) memos to be recalled.
        }
        self.agent = TeachableAgent(
            name="teachableagent",
            llm_config=llm_config,
            teach_config=self.teach_config,
            system_message=system_message,
        )
        self.function_map = {
            "query_arxiv": self.query_arxiv,
            "download_pdf": self.download_pdf,
            "summarize_paper": self.summarize_paper,
            "read_pdf": self.read_pdf,
        }
        self.agent.register_function(self.function_map)
        ragConfig = {
            "task": "text_to_text_generation",
            "docs_path": ["/src/global_context/research/"],
            "chunk_token_size": 1000,
            "model": llm_config["config_list"][0]["model"],
            "client": chromadb.PersistentClient(path="./arxiv/chromadb"),
            "collection_name": "arxiv",
            "get_or_create": True,
        }
        self.ragAgent = RetrieveAssistantAgent(
            name="RagAgent",
            llm_config=llm_config,
            system_message="RagAgent. Retrieve the answer from the knowledge base.",
            # autogen expects "ALWAYS", "NEVER", or "TERMINATE" here.
            human_input_mode="TERMINATE",
            code_execution_config={"work_dir": "arxiv"},
        )
        self.ragUserProxy = RetrieveUserProxyAgent(
            name="RagUserProxy",
            human_input_mode="NEVER",
            retrieve_config=ragConfig,
        )
        self.userProxy = autogen.UserProxyAgent(
            name="User",
            human_input_mode="ALWAYS",
            code_execution_config={"work_dir": "arxiv"},
        )
        self.critic = autogen.ConversableAgent(
            name="Critic",
            llm_config={
                "temperature": 0.2,
                "request_timeout": 600,
                "seed": "arxiv",
                "model": "gpt-3.5-turbo-0613",
                "config_list": autogen.config_list_openai_aoai(exclude="aoai"),
            },
            # autogen expects "ALWAYS", "NEVER", or "TERMINATE" here.
            human_input_mode="TERMINATE",
            system_message="Critic. Critique the plan, the execution, the result, and the conversation. Do not critique the user.",
        )
        self.groupchat = autogen.GroupChat(
            agents=[
                # self.userProxy,
                # self.ragAgent,
                # self.ragUserProxy,
                self.agent,
                self.critic,
            ],
            messages=[],
            max_round=50,
        )
        self.groupchatManager = autogen.GroupChatManager(
            groupchat=self.groupchat, llm_config=llm_config
        )
        # Initialize any additional state or configuration here.
    def get_agent(self):
        return self.agent
    queryFunction = {
        "name": "query_arxiv",
        "description": "query arxiv for a topic",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The query to search for.",
                },
                "max_results": {
                    "type": "integer",
                    "description": "The maximum number of results to return.",
                },
                "start_date": {
                    "type": "string",
                    "description": "Optional start date for the search (YYYY-MM-DD).",
                },
                "end_date": {
                    "type": "string",
                    "description": "Optional end date for the search (YYYY-MM-DD).",
                },
            },
            "required": ["query"],
        },
    }
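
    # A sketch of the function call the model might emit against the schema
    # above (query and count are hypothetical):
    #
    #     {"name": "query_arxiv",
    #      "arguments": "{\"query\": \"mixture of experts\", \"max_results\": 5}"}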
    def query_arxiv(
        self,
        query: str,
        max_results: int = 10,
        start_date: str = None,
        end_date: str = None,
    ):
        base_url = "http://export.arxiv.org/api/query?"
        search_query = f"search_query=all:{query}"
        if start_date and end_date:
            search_query += f"+AND+submittedDate:[{start_date}+TO+{end_date}]"
        start = 0
        url = f"{base_url}{search_query}&start={start}&max_results={max_results}"
        response = requests.get(url)
        feed = feedparser.parse(response.content)
        papers = [
            {
                "title": entry.title,
                "link": entry.link,
                "summary": entry.summary,
                "date": entry.published,
                "category": entry.arxiv_primary_category["term"]
                if "arxiv_primary_category" in entry
                else entry.tags[0]["term"],
            }
            for entry in feed.entries
        ]
        # Prefix with /searchResults so the system prompt's formatting rule applies.
        return "/searchResults " + str(papers)
    downloadFunction = {
        "name": "download_pdf",
        "description": "download a pdf from a url",
        "parameters": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The url to download the pdf from.",
                },
                "filename": {
                    "type": "string",
                    "description": "The filename to save the pdf as. This should match ArXiv's file name.",
                },
            },
            # download_pdf has no default for filename, so both are required.
            "required": ["url", "filename"],
        },
    }
    def download_pdf(self, url: str, filename: str) -> str:
        """
        Download a pdf from a url and save it in the shared research folder.

        :param url: The url to download the pdf from.
        :param filename: The filename to save the pdf as.
        :return: The path to the downloaded pdf.
        """
        # Create the directory path for downloaded research papers.
        topic_dir = os.path.join("src", "global_context", "research")
        os.makedirs(topic_dir, exist_ok=True)
        # Sanitize the filename into a valid, lowercase, underscore-separated
        # name, making sure it ends with a single .pdf extension.
        sanitized_filename = (
            re.sub(r"[^\w\s-]", "", filename.replace(".pdf", ""))
            .strip()
            .lower()
            .replace(" ", "_")
            + ".pdf"
        )
        # Create the full path for the pdf.
        pdf_path = os.path.join(topic_dir, sanitized_filename)
        # Download and save the pdf.
        response = requests.get(url)
        with open(pdf_path, "wb") as f:
            f.write(response.content)
        return pdf_path
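
    # A worked trace of the sanitizer above (hypothetical arXiv filename):
    #   "2310.06825v1.pdf" -> strip ".pdf" -> "2310.06825v1"
    #   -> drop chars outside [\w\s-] (the dot) -> "231006825v1"
    #   -> strip/lower/underscore (no-ops here) -> saved as "231006825v1.pdf"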
    summarizeFunction = {
        "name": "summarize_paper",
        "description": "summarize a paper into a short paragraph",
        "parameters": {
            "type": "object",
            "properties": {
                "filename": {
                    "type": "string",
                    "description": "The path to the pdf to summarize.",
                },
            },
            "required": ["filename"],
        },
    }
    def summarize_paper(self, filename: str) -> str:
        # RetrieveUserProxyAgent takes the question via the `problem` keyword.
        self.ragUserProxy.initiate_chat(
            self.ragAgent,
            problem=f"/summarize the paper {filename}",
        )
        return self.ragAgent.last_message()["content"]
    readPdfFunction = {
        "name": "read_pdf",
        "description": "read a pdf and extract the text",
        "parameters": {
            "type": "object",
            "properties": {
                "filename": {
                    "type": "string",
                    "description": "The filename of the pdf to read.",
                },
            },
            "required": ["filename"],
        },
    }
    def read_pdf(self, filename: str) -> str:
        # Load the structured notes database.
        notes_db = load_structured_notes()
        # If the PDF has been read previously, return the cached notes.
        if filename in notes_db:
            return notes_db[filename]["notes"]
        # Otherwise read the PDF and extract its text.
        file_dir = os.path.join("src", "global_context", "research", filename)
        structured_notes = retrieve_utils.extract_text_from_pdf(file_dir)
        # Save the extracted text and read status to the database.
        notes_db[filename] = {"notes": structured_notes, "read": True}
        save_structured_notes(notes_db)
        return structured_notes
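

# A minimal usage sketch, assuming an OAI_CONFIG_LIST file is available and
# that the group chat manager is the intended entry point for conversations.
if __name__ == "__main__":
    arxiv_agent = ArxivAgent(name="arxiv")
    # Kick off a search; the user proxy relays /search to the teachable agent.
    arxiv_agent.userProxy.initiate_chat(
        arxiv_agent.groupchatManager,
        message="/search papers on retrieval augmented generation",
    )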