import os
import re
import json
import autogen
import autogen.retrieve_utils as retrieve_utils
import chromadb
import feedparser
import requests
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
from src.lib.termination_msg import term_eom
from autogen.agentchat.contrib.teachable_agent import TeachableAgent

# Define a path for the JSON file to store structured notes and read status
STRUCTURED_NOTES_DB_PATH = "structured_notes_db.json"


# Function to load structured notes from the JSON file
def load_structured_notes():
    if not os.path.exists(STRUCTURED_NOTES_DB_PATH):
        return {}
    with open(STRUCTURED_NOTES_DB_PATH, "r") as file:
        return json.load(file)


# Function to save structured notes to the JSON file
def save_structured_notes(notes_db):
    with open(STRUCTURED_NOTES_DB_PATH, "w") as file:
        json.dump(notes_db, file, indent=4)
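
# The notes DB is a flat JSON object keyed by PDF filename, for example:
# {"attention_is_all_you_need.pdf": {"notes": "...", "read": true}}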


class ArxivAgent(autogen.agentchat.Agent):
    def search_by_date(self, start_date, end_date, query, max_results=10):
        """
        Search arXiv for papers submitted between start_date and end_date that match a query.

        :param start_date: The start of the range; arXiv's API expects submittedDate bounds as YYYYMMDDHHMM (GMT).
        :param end_date: The end of the range, in the same YYYYMMDDHHMM format.
        :param query: The query to search for.
        :param max_results: The maximum number of results to return.
        :return: A list of papers that match the query and were submitted between the start and end dates.
        """
        base_url = "http://export.arxiv.org/api/query?"
        search_query = (
            f"search_query={query}+AND+submittedDate:[{start_date}+TO+{end_date}]"
        )
        start = 0
        url = f"{base_url}{search_query}&start={start}&max_results={max_results}"
        response = requests.get(url)
        feed = feedparser.parse(response.content)
        papers = [
            {
                "title": entry.title,
                "link": entry.link,
                "summary": entry.summary,
                "date": entry.published,
                "category": entry.arxiv_primary_category["term"]
                if "arxiv_primary_category" in entry
                else entry.tags[0]["term"],
            }
            for entry in feed.entries
        ]
        return papers
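
    # Example (illustrative values): query "all:transformers" for January 2024 builds
    # http://export.arxiv.org/api/query?search_query=all:transformers+AND+submittedDate:[202401010000+TO+202401312359]&start=0&max_results=10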
seed = "arxiv"
def __init__(
self,
name: str,
llm_config: dict = {},
human_input_mode="COMPLETE",
code_execution_config={"work_dir": "arxiv"},
is_termination_msg=term_eom,
):
# Using dict.setdefault to optimize default settings for llm config
llm_config.setdefault("seed", self.seed)
self.seed = llm_config["seed"]
llm_config["config_list"] = autogen.config_list_from_json(
"OAI_CONFIG_LIST",
filter_dict={
"model": [
"gpt-4",
"gpt-4-0613",
"gpt-3.5-turbo",
"gpt-3.5-turbo-0613",
"gpt-3.5-turbo-16k",
"gpt-3.5-turbo-16k-0613",
"gpt-4-1106-preview",
],
},
)
llm_config.setdefault("temperature", 0)
llm_config.setdefault("model", "gpt-3.5-turbo-0613")
llm_config.setdefault(
"functions",
[
self.queryFunction,
self.downloadFunction,
self.summarizeFunction,
self.readPdfFunction,
],
)
system_message = """You are a research librarian tracking scientific papers.
You have several tasks you can complete:
- /chat: [default] chat with the user, answering questions about research you've read.
- /search: query for new papers on a topic with the query_arxiv function.
- /searchResults: You must summarize the result and print the Date, Title, Category, Arxiv Link, PDF Link, and Summary in markdown format.
- /download: download a pdf from a url with the download_pdf function
- /read: open the pdf and extract the text using the read_pdf function. After you read the pdf, you must create tangiable structured notes on the paper starting with the title, summary, key details, learnings, recomendations, potential applications. Include critical details that we would need to be able to recall in planning sessions when discussing future product ideas. The goal is to be able to link cutting edge research to product ideas.
- /summarize: summarize a paper into a short paragraph with the summarize_paper function, effects, and significance
- /notate: generate detailed structured notes on a paper with the write_notes function
- /report: Provide a report when provided research data detailing the function, effects, and significance of all the research combined.
- /help: print this message
- /terminate: terminate the conversation
Once a command is complete, append a `TERMINATE` message to the end of the message to terminate the conversation.
The user can not execute code directly. They must use the functions provided.
"""
        self.teach_config = {
            "verbosity": 1,  # 0 for basic info, 1 to add memory operations, 2 for analyzer messages, 3 for memo lists.
            "reset_db": False,  # Set to True to start over with an empty database.
            "path_to_db_dir": ".cache/research",  # Path to the directory where the database will be stored.
            "recall_threshold": 1.5,  # Higher numbers allow more (but less relevant) memos to be recalled.
        }
        self.agent = TeachableAgent(
            name="teachableagent",
            llm_config=llm_config,
            teach_config=self.teach_config,
            system_message=system_message,
        )
        self.function_map = {
            "query_arxiv": self.query_arxiv,
            "download_pdf": self.download_pdf,
            "summarize_paper": self.summarize_paper,
            "read_pdf": self.read_pdf,
        }
        self.agent.register_function(self.function_map)
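        # register_function binds each schema name declared in
        # llm_config["functions"] to the callable that executes it, so a
        # model-emitted function_call named "query_arxiv" dispatches to
        # self.query_arxiv.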
        ragConfig = {
            "task": "text_to_text_generation",
            # Relative path, matching where download_pdf saves papers.
            "docs_path": ["src/global_context/research/"],
            "chunk_token_size": 1000,
            "model": llm_config["config_list"][0]["model"],
            "client": chromadb.PersistentClient(path="./arxiv/chromadb"),
            "collection_name": "arxiv",
            "get_or_create": True,
        }
        self.ragAgent = RetrieveAssistantAgent(
            name="RagAgent",
            llm_config=llm_config,
            system_message="RagAgent. Retrieve the answer from the knowledge base.",
            human_input_mode="TERMINATE",
            code_execution_config={"work_dir": "arxiv"},
        )
        self.ragUserProxy = RetrieveUserProxyAgent(
            name="RagUserProxy",
            human_input_mode="NEVER",
            retrieve_config=ragConfig,
        )
        self.userProxy = autogen.UserProxyAgent(
            name="User",
            human_input_mode="ALWAYS",
            code_execution_config={"work_dir": "arxiv"},
        )
        self.critic = autogen.ConversableAgent(
            name="Critic",
            llm_config={
                "temperature": 0.2,
                "request_timeout": 600,
                "seed": "arxiv",
                "model": "gpt-3.5-turbo-0613",
                "config_list": autogen.config_list_openai_aoai(exclude="aoai"),
            },
            # The critic runs unattended, so it never prompts for human input.
            human_input_mode="NEVER",
            system_message="Critic. Critique the plan, the execution, the result, and the conversation. Do not critique the user.",
        )
        self.groupchat = autogen.GroupChat(
            agents=[
                # self.userProxy,
                # self.ragAgent,
                # self.ragUserProxy,
                self.agent,
                self.critic,
            ],
            messages=[],
            max_round=50,
        )
        self.groupchatManager = autogen.GroupChatManager(
            groupchat=self.groupchat, llm_config=llm_config
        )
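        # The group chat pairs the teachable librarian with the critic; the
        # manager picks the next speaker each turn, for up to max_round turns.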
        # Initialize any additional state or configuration here

    def get_agent(self):
        return self.agent
    queryFunction = {
        "name": "query_arxiv",
        "description": "query arxiv for a topic",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The query to search for.",
                },
                "max_results": {
                    "type": "integer",
                    "description": "The maximum number of results to return.",
                },
                "start_date": {
                    "type": "string",
                    "description": "Optional start of a submittedDate range, as YYYYMMDDHHMM.",
                },
                "end_date": {
                    "type": "string",
                    "description": "Optional end of a submittedDate range, as YYYYMMDDHHMM.",
                },
            },
            "required": ["query"],
        },
    }
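    # Example of the arguments the model might emit for this schema
    # (values are illustrative):
    #   {"query": "retrieval augmented generation", "max_results": 5}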
    def query_arxiv(
        self,
        query: str,
        max_results: int = 10,
        start_date: str = None,
        end_date: str = None,
    ):
        base_url = "http://export.arxiv.org/api/query?"
        search_query = f"search_query=all:{query}"
        if start_date and end_date:
            search_query += f"+AND+submittedDate:[{start_date}+TO+{end_date}]"
        start = 0
        url = f"{base_url}{search_query}&start={start}&max_results={max_results}"
        response = requests.get(url)
        feed = feedparser.parse(response.content)
        papers = [
            {
                "title": entry.title,
                "link": entry.link,
                "summary": entry.summary,
                "date": entry.published,
                "category": entry.arxiv_primary_category["term"]
                if "arxiv_primary_category" in entry
                else entry.tags[0]["term"],
            }
            for entry in feed.entries
        ]
        return "/searchResults " + str(papers)
    downloadFunction = {
        "name": "download_pdf",
        "description": "download a pdf from a url",
        "parameters": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The url to download the pdf from.",
                },
                "filename": {
                    "type": "string",
                    "description": "The filename to save the pdf as. This should match ArXiv's file name.",
                },
            },
            # download_pdf has no default for filename, so both are required.
            "required": ["url", "filename"],
        },
    }
    def download_pdf(self, url: str, filename: str) -> str:
        """
        Download a pdf from a url and save it in the shared research folder.

        :param url: The url to download the pdf from.
        :param filename: The filename to save the pdf as.
        :return: The path to the downloaded pdf.
        """
        # Create the directory path for the research folder
        topic_dir = os.path.join("src", "global_context", "research")
        os.makedirs(topic_dir, exist_ok=True)
        # Sanitize the filename into a valid name, keeping the .pdf extension
        sanitized_filename = (
            re.sub(r"[^\w\s-]", "", filename.replace(".pdf", ""))
            .strip()
            .lower()
            .replace(" ", "_")
            + ".pdf"
        )
        # Create the full path for the pdf
        pdf_path = os.path.join(topic_dir, sanitized_filename)
        # Download and save the pdf
        response = requests.get(url)
        with open(pdf_path, "wb") as f:
            f.write(response.content)
        return pdf_path
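    # For example, a filename of "Attention Is All You Need.pdf" is saved as
    # src/global_context/research/attention_is_all_you_need.pdf.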
    summarizeFunction = {
        "name": "summarize_paper",
        "description": "summarize a paper into a short paragraph",
        "parameters": {
            "type": "object",
            "properties": {
                "filename": {
                    "type": "string",
                    "description": "The path to the pdf to summarize.",
                },
            },
            "required": ["filename"],
        },
    }
    def summarize_paper(self, filename: str) -> str:
        # RetrieveUserProxyAgent takes the question as the `problem` argument.
        self.ragUserProxy.initiate_chat(
            self.ragAgent,
            problem=f"/summarize the paper {filename}",
        )
        return self.ragAgent.last_message()["content"]
    readPdfFunction = {
        "name": "read_pdf",
        "description": "read a pdf and extract the text",
        "parameters": {
            "type": "object",
            "properties": {
                "filename": {
                    "type": "string",
                    "description": "The filename of the pdf to read.",
                },
            },
            "required": ["filename"],
        },
    }
    def read_pdf(self, filename: str) -> str:
        # Load the structured notes database
        notes_db = load_structured_notes()
        # If the PDF has been read previously, return the cached notes
        if filename in notes_db:
            return notes_db[filename]["notes"]
        # Read the PDF; the extracted text serves as the notes for now
        file_dir = os.path.join("src", "global_context", "research", filename)
        structured_notes = retrieve_utils.extract_text_from_pdf(file_dir)
        # Cache the extracted text and read status in the database
        notes_db[filename] = {"notes": structured_notes, "read": True}
        save_structured_notes(notes_db)
        return structured_notes
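

# A minimal usage sketch, assuming a valid OAI_CONFIG_LIST file on disk.
# The entry point below is illustrative; the gist does not define one.
if __name__ == "__main__":
    arxiv_agent = ArxivAgent(name="ArxivLibrarian")
    arxiv_agent.userProxy.initiate_chat(
        arxiv_agent.groupchatManager,
        message="/search large language model agents",
    )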