Created
March 8, 2023 02:53
-
-
Save lifan0127/e34bb0cfbf7f03dc6852fd3e80b8fb19 to your computer and use it in GitHub Desktop.
Streamlining Literature Reviews with Paper QA and Zotero
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

# --- User configuration -----------------------------------------------------
# Paper QA reads the OpenAI key from the process environment, so it is set
# here before any Paper QA object is created.
os.environ['OPENAI_API_KEY'] = '<Your OpenAI API Key>'

# Zotero credentials and the collection to search.
# See here on how to find your Zotero info:
# https://github.com/urschrei/pyzotero#quickstart
ZOTERO_USER_ID = '<Your Zotero User ID>'
ZOTERO_API_KEY = '<Your Zotero API Key>'
ZOTERO_COLLECTION_ID = '<Your Zotero Collection ID>'

# The research question to answer from the papers in the collection.
question = 'What predictive models are used in materials discovery?'

# The following prompt instruction is injected to limit the number of
# keywords per query. It is sent verbatim alongside the question, so the
# wording is kept exactly as-is (adjacent literals are joined by Python).
question_prompt = (
    'A "keyword search" is a list of no more than 3 words, '
    'which separated by whitespace only and with no boolean operators '
    '(e.g. "dog canine puppy"). '
    'Avoid adding any new words not in the question '
    'unless they are synonyms to the existing words.'
)
from paperqa import Docs | |
from pyzotero import zotero | |
import requests | |
import shutil, sys, re | |
from bs4 import BeautifulSoup | |
# --- Search Zotero for PDFs relevant to the question -------------------------
# Ask Paper QA to turn the question (plus the keyword-limiting instruction)
# into a list of search queries.
docs = Docs()
queries = docs.generate_search_query(question + '\n' + question_prompt)
print(f'Search queries: {", ".join(queries)}')

# Run every generated query against the configured collection, looking only
# at attachment items and matching on all fields/full text.
zot = zotero.Zotero(ZOTERO_USER_ID, 'user', ZOTERO_API_KEY)
searches = [
    zot.collection_items(
        ZOTERO_COLLECTION_ID,
        q=q.strip('"'),  # generated queries may come back wrapped in quotes
        limit=10,
        itemType='attachment',
        qmode='everything',
    )
    for q in queries
]

# De-duplicate hits across queries by item key and keep only PDF attachments
# that hang off a parent item. `.get` is used because an attachment without a
# `contentType` or `parentItem` field (e.g. a standalone PDF) would otherwise
# abort the whole run with a KeyError; such items are skipped since the
# citation below is looked up through the parent.
attachments_by_key = {}
for search in searches:
    for item in search:
        data = item['data']
        if data.get('contentType') == 'application/pdf' and data.get('parentItem'):
            attachments_by_key[item['key']] = item
attachments = list(attachments_by_key.values())

# One formatted bibliography entry (ACS style) per distinct parent item; the
# first element of the response is used as the citation markup.
parents = {a['data']['parentItem'] for a in attachments}
citation_dict = {
    p: zot.item(p, content='bib', style='american-chemical-society')[0]
    for p in parents
}

result_count = len(parents)
if result_count == 0:
    print('No matched results in Zotero')
    sys.exit()
print(f'Results: {result_count}')
# --- Fetch the PDFs, index them with Paper QA, and answer the question -------
# All PDFs are staged under data/zotero; create it up front so the copy /
# download branches below do not fail on a fresh checkout.
os.makedirs('data/zotero', exist_ok=True)

# The ACS bibliography output starts with a running index such as "(1) ";
# strip it so only the citation text remains. Raw string: the pattern
# contains backslash escapes that are not valid Python string escapes.
citation_index_re = re.compile(r"^\(\d+\)\s+")

paths = []
citations = []
for attachment in attachments:
    link_mode = attachment['data']['linkMode']
    file_path = f'data/zotero/{attachment["key"]}.pdf'
    parent = citation_dict[attachment['data']['parentItem']]

    if link_mode == 'imported_file':
        # File stored in Zotero: let pyzotero write it into data/zotero.
        zot.dump(attachment['key'], f'{attachment["key"]}.pdf', 'data/zotero')
    elif link_mode == 'linked_file':
        # File linked from local disk: copy it next to the others.
        # NOTE(review): assumes `path` is a plain filesystem path — confirm
        # for libraries that use a linked-attachment base directory.
        shutil.copy(attachment['data']['path'], file_path)
    elif link_mode == 'imported_url':
        # Re-download from the original URL. Fail loudly on HTTP errors so an
        # error page is never saved (and later parsed) as a PDF, and bound the
        # wait so a stalled server cannot hang the script.
        res = requests.get(attachment['data']['url'], timeout=60)
        res.raise_for_status()
        with open(file_path, 'wb') as f:
            f.write(res.content)
    else:
        raise ValueError(f'Unsupported link mode: {link_mode} for {attachment["key"]}.')

    paths.append(file_path)
    citation_text = BeautifulSoup(parent, 'html.parser').get_text().strip()
    citations.append(citation_index_re.sub("", citation_text))

# Register every downloaded PDF with its citation, then ask the question.
for d, c in zip(paths, citations):
    docs.add(d, c)

answer = docs.query(question)

# Explicit UTF-8 so non-ASCII characters in the answer or citations do not
# depend on the platform's default encoding.
with open('data/zotero-answer.txt', 'w', encoding='utf-8') as f:
    f.write(answer.formatted_answer)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is interesting, thank you. I really like the idea of optimizing literature reviews using tools like Paper QA and Zotero. These tools can greatly simplify and speed up the process of searching and analyzing scientific articles, helping you save time and improve the quality of your work. I had a similar project and asked for help with my homework; I found https://edubirdie.com/do-my-homework for this. Now I know a lot about this myself and can give a lot of advice. The main thing is to use these tools effectively.