import os
os.environ['OPENAI_API_KEY'] = '<Your OpenAI API Key>'

# See here on how to find your Zotero info: https://github.com/urschrei/pyzotero#quickstart
ZOTERO_USER_ID = '<Your Zotero User ID>'
ZOTERO_API_KEY = '<Your Zotero API Key>'
ZOTERO_COLLECTION_ID = '<Your Zotero Collection ID>'

question = 'What predictive models are used in materials discovery?'

# The following prompt instruction is injected to limit the number of keywords per query
question_prompt = 'A "keyword search" is a list of no more than 3 words, which separated by whitespace only and with no boolean operators (e.g. "dog canine puppy"). Avoid adding any new words not in the question unless they are synonyms to the existing words.'

from paperqa import Docs
from pyzotero import zotero
import requests
import shutil, sys, re
from bs4 import BeautifulSoup

docs = Docs()

# Ask the LLM to turn the question into a set of short keyword searches
queries = docs.generate_search_query(question + '\n' + question_prompt)
print(f'Search queries: {", ".join(queries)}')

zot = zotero.Zotero(ZOTERO_USER_ID, 'user', ZOTERO_API_KEY)

# Run each keyword search against the Zotero collection
searches = [zot.collection_items(
    ZOTERO_COLLECTION_ID,
    q=q.strip('"'),
    limit=10,
    itemType='attachment',
    qmode='everything'
) for q in queries]

# Deduplicate PDF attachments across searches, then fetch ACS-style citations for their parent items
attachments = {item['key']: item for search in searches for item in search if item['data']['contentType'] == 'application/pdf'}.values()
parents = set([a['data']['parentItem'] for a in attachments])
citation_dict = {p: zot.item(p, content='bib', style='american-chemical-society')[0] for p in parents}

result_count = len(parents)
if result_count == 0:
    print('No matched results in Zotero')
    sys.exit()
print(f'Results: {result_count}')

os.makedirs('data/zotero', exist_ok=True)  # download/copy target for the PDFs
paths = []
citations = []
for attachment in attachments:
    link_mode = attachment['data']['linkMode']
    file_path = f'data/zotero/{attachment["key"]}.pdf'
    parent = citation_dict[attachment['data']['parentItem']]
    if link_mode == 'imported_file':
        zot.dump(attachment['key'], f'{attachment["key"]}.pdf', 'data/zotero')
    elif link_mode == 'linked_file':
        shutil.copy(attachment['data']['path'], file_path)
    elif link_mode == 'imported_url':
        res = requests.get(attachment['data']['url'])
        with open(file_path, 'wb') as f:
            f.write(res.content)
    else:
        raise ValueError(f'Unsupported link mode: {link_mode} for {attachment["key"]}.')
    paths.append(file_path)
    # Strip the leading "(1) " numbering from the ACS-style citation
    citations.append(re.sub(r"^\(\d+\)\s+", "", BeautifulSoup(parent, 'html.parser').get_text().strip()))

for d, c in zip(paths, citations):
    docs.add(d, c)

answer = docs.query(question)
with open('data/zotero-answer.txt', 'w') as f:
    f.write(answer.formatted_answer)
Thank you. Did you or anyone else play around with the different libraries? Would there be a reason for one outperforming the rest when it comes to academic Q&A tasks?
Yes, I think the retrieval (vector embeddings) is based on a cosine similarity metric for FAISS / GPT etc. The Q&A performance depends largely on whether the model is fine-tuned and on the prompt templates (see the langchain library).
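For illustration, here is a minimal, self-contained sketch of cosine-similarity retrieval with FAISS. The chunks and embeddings are placeholders; random vectors stand in for a real embedding model:

import numpy as np
import faiss

chunks = ['first document chunk ...', 'second document chunk ...', 'third document chunk ...']

# Placeholder embeddings; in practice these come from an embedding model,
# one vector per chunk.
dim = 384
rng = np.random.default_rng(0)
vectors = rng.standard_normal((len(chunks), dim)).astype('float32')
faiss.normalize_L2(vectors)  # after L2-normalization, inner product == cosine similarity

index = faiss.IndexFlatIP(dim)  # exact inner-product (cosine) index
index.add(vectors)

query = rng.standard_normal((1, dim)).astype('float32')  # embedding of the question
faiss.normalize_L2(query)
scores, ids = index.search(query, 2)  # top-2 most similar chunks
print([chunks[i] for i in ids[0]])

L2-normalizing the vectors up front lets a plain inner-product index compute cosine similarity directly.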
I created a summarization and Q&A app, also using GPT (and the NLTK library for chunking / tokenization). It seems to work well for documents under ~15 pages. I'm keeping it open for folks to test for a couple of days - please do check it out and provide feedback:
https://powerful-dusk-64631.herokuapp.com/
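For reference, chunking with NLTK along the lines described above might look like this rough sketch. The 500-word budget and greedy sentence packing are illustrative assumptions, and the counts are NLTK word tokens rather than model tokens:

import nltk
nltk.download('punkt', quiet=True)  # sentence tokenizer models (name may vary by NLTK version)

def chunk_text(text, max_tokens=500):
    """Greedily pack sentences into chunks of roughly max_tokens words."""
    chunks, current, count = [], [], 0
    for sent in nltk.sent_tokenize(text):
        n = len(nltk.word_tokenize(sent))
        if count + n > max_tokens and current:
            chunks.append(' '.join(current))
            current, count = [], 0
        current.append(sent)
        count += n
    if current:
        chunks.append(' '.join(current))
    return chunks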
Thanks,
Jalal
Hey,
I wanted to try out the script, but it seems the paper-qa package has been updated and the generate_search_query method doesn't exist anymore. I tried a workaround, but I am not sure if it is correct, since it no longer uses a method from Docs. Can someone have a look?
Note: I also tried to change ChatGPT to llama2
# See here on how to find your Zotero info: https://github.com/urschrei/pyzotero#quickstart
ZOTERO_USER_ID = ''
ZOTERO_API_KEY = ''
ZOTERO_COLLECTION_ID = ''

question = 'How is deep learning used for clustering mass spectra?'

# The following prompt instruction is injected to limit the number of keywords per query
question_prompt = 'A "keyword search" is a list of no more than 3 words, which separated by whitespace only and with no boolean operators (e.g. "dog canine puppy"). Avoid adding any new words not in the question unless they are synonyms to the existing words.'

import os
import re
import shutil
import sys
import requests
from bs4 import BeautifulSoup
from paperqa import Docs
from pyzotero import zotero

docs = Docs()

# Generate search queries manually (the generate_search_query method was removed from Docs).
# Note: this yields one single-word query per keyword, unlike the multi-word queries the
# original prompt asked the LLM for. Punctuation is stripped so e.g. 'spectra?' still matches.
keywords = [word.strip('?.,!').lower() for word in question.split() if len(word) > 2]
queries = [f'"{keyword}"' for keyword in keywords]
print(queries)
zot = zotero.Zotero(ZOTERO_USER_ID, 'user', ZOTERO_API_KEY)
searches = [zot.collection_items(
    ZOTERO_COLLECTION_ID,
    q=q.strip('"'),
    limit=10,
    itemType='attachment',
    qmode='everything'
) for q in queries]
print(f'searches: {searches}')

attachments = {item['key']: item for search in searches for item in search if item['data']['contentType'] == 'application/pdf'}.values()
parents = set([a['data']['parentItem'] for a in attachments])
citation_dict = {p: zot.item(p, content='bib', style='american-chemical-society')[0] for p in parents}
result_count = len(parents)
print(f'attachments: {attachments}')
print(f'parents: {parents}')

if result_count == 0:
    print('No matched results in Zotero')
    sys.exit()
print(f'Results: {result_count}')

# Define the directory where PDF files will be saved
pdf_directory = 'data/zotero_pdfs/'
if not os.path.exists(pdf_directory):
    os.makedirs(pdf_directory)
paths = []
citations = []
for attachment in attachments:
    link_mode = attachment['data']['linkMode']
    file_path = os.path.join(pdf_directory, f'{attachment["key"]}.pdf')
    parent = citation_dict[attachment['data']['parentItem']]
    if link_mode == 'imported_file':
        zot.dump(attachment['key'], f'{attachment["key"]}.pdf', pdf_directory)
    elif link_mode == 'linked_file':
        shutil.copy(attachment['data']['path'], file_path)
    elif link_mode == 'imported_url':
        res = requests.get(attachment['data']['url'])
        with open(file_path, 'wb') as f:
            f.write(res.content)
    else:
        raise ValueError(f'Unsupported link mode: {link_mode} for {attachment["key"]}.')
    paths.append(file_path)
    # Strip the leading "(1) " numbering from the ACS-style citation; the parentheses
    # must be escaped in the regex, as in the original script
    citations.append(re.sub(r"^\(\d+\)\s+", "", BeautifulSoup(parent, 'html.parser').get_text().strip()))
for d, c in zip(paths, citations):
    docs.add(d, c)

answer = docs.query(question)
print(answer)
with open('data/zotero-answer.txt', 'w') as f:
    f.write(answer.formatted_answer)
@JannikSchneider12 This script was written several months ago. It hasn't been tested with the latest paper-qa release.
Meanwhile, I see the paper-qa package has added an integration with Zotero. Have you checked it out yet?
https://github.com/whitead/paper-qa/blob/main/paperqa/contrib/zotero.py
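For anyone landing here, usage looks roughly like the following sketch, based on paper-qa's README at the time of writing; the ZoteroDB API may differ in your installed version:

from paperqa import Docs
from paperqa.contrib import ZoteroDB

docs = Docs()
zotero = ZoteroDB(library_type='user')  # reads ZOTERO_USER_ID / ZOTERO_API_KEY from the environment

for item in zotero.iterate(limit=20):
    docs.add(item.pdf, docname=item.key)  # item.pdf is the path to the downloaded PDF

answer = docs.query('How is deep learning used for clustering mass spectra?')
print(answer.formatted_answer)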
@lifan0127 Thanks for your reply. I will have a look at it, but I am still at the very beginning with programming.
Btw, is there a way to still run your script if I use the exact package versions you used back then?
Thanks again for your help and time.
@JannikSchneider12 Please check out this Hugging Face space: https://huggingface.co/spaces/lifan0127/zotero-qa, where you can ask questions based on your Zotero library without programming.
Also, I am working on a Zotero plugin to incorporate paper QA, among other features, into Zotero. Please check it out if you are interested: https://github.com/lifan0127/ai-research-assistant
@andreifoldes I think the retrieval mechanisms are the same. This approach uses the FAISS library for vector-similarity search to find relevant document chunks and then feeds them into an LLM for response synthesis.
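As a sketch of that retrieve-then-synthesize pattern: the retrieved chunks are simply pasted into the LLM prompt. This example assumes the openai>=1.0 client and a hypothetical top_chunks list produced by the vector search:

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def synthesize(question, top_chunks):
    # Concatenate the retrieved chunks into a context block for the model
    context = '\n\n'.join(top_chunks)
    response = client.chat.completions.create(
        model='gpt-3.5-turbo',
        messages=[
            {'role': 'system', 'content': 'Answer using only the provided context.'},
            {'role': 'user', 'content': f'Context:\n{context}\n\nQuestion: {question}'},
        ],
    )
    return response.choices[0].message.content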