Last active
February 13, 2024 14:39
-
-
Save init27/3c9ff5d275bcaf09936851ab9bd843c8 to your computer and use it in GitHub Desktop.
ArXiv Chat: Chat with the latest Arxiv papers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Credit 🙏: I just used the example from langchain docs and it works quite well: https://python.langchain.com/en/latest/use_cases/question_answering.html | |
# Note 2: The Arxiv -> PDF logic is a bit messy, I'm sure it can be done better | |
# Note 3: Please install the following: | |
# To run: | |
# Save this in a `app.py` | |
# pip install arxiv PyPDF2 langchain chromadb | |
# The chat feature was shipped in H2O nightly this week, we will need to install from nightly link: | |
# pip install https://github.com/h2oai/wave/releases/download/nightly/h2o_wave-nightly-py3-none-manylinux1_x86_64.whl | |
# Before running the app, make sure you export your openai key as an environment variable: | |
# export OPENAI_API_KEY = "YOUR_KEY_GOES_HERE" | |
# Run the following command and navigate to localhost:10101/demo, Note: Don't put app.py, just app 👇 | |
# wave run app | |
# You will need to put the paper number and then ask questions | |
# Shout at me for errors: https://twitter.com/bhutanisanyam1 | |
import arxiv | |
import PyPDF2 | |
import io | |
from langchain.chains import RetrievalQA | |
from langchain.llms import OpenAI | |
from langchain.document_loaders import TextLoader | |
from langchain.indexes import VectorstoreIndexCreator | |
from h2o_wave import main, app, Q, ui, data | |
MAX_MESSAGES = 500 # Maximum number of messages to store in the chat window | |
index = None # Initialize the global index variable | |
# Function to convert PDF to text | |
def pdf_to_txt(file_path): | |
with open(file_path, 'rb') as f: | |
pdf_reader = PyPDF2.PdfFileReader(f) | |
text = '' | |
# Extract text from each page of the PDF | |
for page_num in range(pdf_reader.numPages): | |
text += pdf_reader.getPage(page_num).extractText() | |
return text | |
# Function to prepare the index for querying | |
def prepare_index(txt_file_path): | |
global index | |
loader = TextLoader(txt_file_path) | |
index = VectorstoreIndexCreator().from_loaders([loader]) | |
# Main app function that handles the UI and logic | |
@app('/demo') | |
async def serve(q: Q): | |
global index | |
# Initialize the UI if it hasn't been initialized yet | |
if not q.client.initialized: | |
# Set up the UI theme | |
q.page['meta'] = ui.meta_card( | |
box='', | |
themes=[ | |
ui.theme( | |
name='dracula', | |
primary='#50fa7b', | |
text='#f8f8f2', | |
card='#282a36', | |
page='#282a36', | |
) | |
], | |
theme='dracula' | |
) | |
# Set up the chat title | |
q.page['chat_title'] = ui.form_card( | |
box='3 1 9 1', | |
items=[ | |
ui.text('<h1 style="text-align: center; color: #50fa7b;">ArXiv Chat: Chat with Latest Papers</h1>'), | |
] | |
) | |
# Set up the chat window | |
cyclic_buffer = data(fields='msg fromUser', size=-MAX_MESSAGES) | |
q.page['chat'] = ui.chatbot_card(box='3 2 9 8', data=cyclic_buffer, name='chat') | |
# Set up the input form for Arxiv ID and query | |
q.page['input'] = ui.form_card(box='3 8 9 3', items=[ | |
ui.textbox(name='arxiv_id', label='Arxiv ID', placeholder='Enter Arxiv ID (e.g. 2303.17580)', disabled=q.client.arxiv_id_set), | |
ui.textbox(name='query', label='Query', placeholder='Enter your query'), | |
ui.buttons(items=[ui.button(name='submit', label='Submit', primary=True)]), | |
]) | |
# Initialize client variables | |
q.client.arxiv_id = None | |
q.client.arxiv_id_set = False | |
q.client.initialized = True | |
await q.page.save() | |
# Handle the submit button click | |
if q.args.submit: | |
user_query = q.args.query | |
# If Arxiv ID is not set, set it | |
if not q.client.arxiv_id: | |
q.client.arxiv_id = q.args.arxiv_id | |
# Check if Arxiv ID is valid | |
if not q.client.arxiv_id: | |
q.page['chat'].data[-1] = ["Please enter a valid Arxiv ID.", False] | |
await q.page.save() | |
return | |
# Download the paper and convert it to text | |
search = arxiv.Search(id_list=[q.client.arxiv_id]) | |
paper = next(search.results()) | |
paper.download_pdf(filename="downloaded-paper.pdf") | |
pdf_file_path = 'downloaded-paper.pdf' | |
txt_file_path = 'converted-paper.txt' | |
text = pdf_to_txt(pdf_file_path) | |
# Save the text to a file | |
with io.open(txt_file_path, 'w', encoding='utf-8') as f: | |
f.write(text) | |
# Prepare the index for querying | |
prepare_index(txt_file_path) | |
# Query the index if it exists | |
if index: | |
response = index.query(user_query) | |
# Append user message | |
q.page['chat'].data[-1] = [f"User: {user_query}", True] | |
# Append bot response | |
q.page['chat'].data[-1] = [f"Bot: {response}", False] | |
q.client.arxiv_id_set = True | |
await q.page.save() | |
# Main entry point | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment