lifan0127 · March 8, 2023 02:53 · jasicarose75 · Feb 19, 2024
diff --git a/paper-qa-zotero.py b/paper-qa-zotero.py
 import os

 # Exported through the environment before the paperqa import further down.
 os.environ['OPENAI_API_KEY'] = '<Your OpenAI API Key>'

 # Zotero credentials and the collection to search.
 # See here on how to find your Zotero info: https://github.com/urschrei/pyzotero#quickstart
 ZOTERO_USER_ID = '<Your Zotero User ID>'
 ZOTERO_API_KEY = '<Your Zotero API Key>'
 ZOTERO_COLLECTION_ID = '<Your Zotero Collection ID>'

 # The research question answered from the matching PDFs.
 question = 'What predictive models are used in materials discovery?'

 # Extra instruction appended to the question when search queries are
 # generated; it limits the number of keywords per query.
 question_prompt = (
     'A "keyword search" is a list of no more than 3 words, '
     'which separated by whitespace only and with no boolean operators '
     '(e.g. "dog canine puppy"). '
     'Avoid adding any new words not in the question '
     'unless they are synonyms to the existing words.'
 )

 from paperqa import Docs
 from pyzotero import zotero
 import requests
 import shutil, sys, re
 from bs4 import BeautifulSoup

 # Document store that later receives the PDFs and answers the question.
 docs = Docs()

 # Turn the question into keyword queries; the appended prompt keeps each
 # query short enough to be usable as a Zotero search string.
 queries = docs.generate_search_query(f'{question}\n{question_prompt}')
 print('Search queries: ' + ', '.join(queries))

 # Zotero client for the personal ('user') library identified above.
 zot = zotero.Zotero(
     ZOTERO_USER_ID,
     'user',
     ZOTERO_API_KEY,
 )

 # Run one Zotero search per generated query, restricted to attachment items
 # in the configured collection. Surrounding double quotes are stripped from
 # each query before it is sent.
 searches = [
     zot.collection_items(
         ZOTERO_COLLECTION_ID,
         q=q.strip('"'),
         limit=10,
         itemType='attachment',
         qmode='everything',
     )
     for q in queries
 ]

 # Key by item key so an attachment matched by several queries is kept once.
 # Only PDFs are kept. Attachments without a parent item are skipped because
 # the citation lookup below is keyed by the parent item; indexing
 # 'parentItem' directly would raise KeyError for standalone attachments.
 attachments = {
     item['key']: item
     for search in searches
     for item in search
     if item['data'].get('contentType') == 'application/pdf'
     and item['data'].get('parentItem')
 }.values()

 # One formatted bibliography entry per distinct parent item.
 parents = {a['data']['parentItem'] for a in attachments}
 citation_dict = {p: zot.item(p, content='bib', style='american-chemical-society')[0] for p in parents}
 result_count = len(parents)

 # Nothing to index: stop before any download or LLM call is made.
 if result_count == 0:
     print('No matched results in Zotero')
     sys.exit()
 print(f'Results: {result_count}')

 # Parallel lists: local PDF path and the plain-text citation for each file.
 paths = []
 citations = []

 # Every branch below writes into this folder, so make sure it exists
 # instead of failing on a fresh checkout.
 os.makedirs('data/zotero', exist_ok=True)

 # Leading numeric label such as "(1) " at the start of a bibliography
 # entry. Raw string: the pattern is unchanged but no longer relies on
 # invalid string escapes (a SyntaxWarning on recent Python versions).
 citation_label = re.compile(r'^\(\d+\)\s+')

 for attachment in attachments:
     link_mode = attachment['data']['linkMode']
     key = attachment['key']
     file_path = f'data/zotero/{key}.pdf'
     parent = citation_dict[attachment['data']['parentItem']]
     if link_mode == 'imported_file':
         # File stored in the Zotero library: let pyzotero download it.
         zot.dump(key, f'{key}.pdf', 'data/zotero')
     elif link_mode == 'linked_file':
         # File lives on the local disk: copy it next to the others.
         shutil.copy(attachment['data']['path'], file_path)
     elif link_mode == 'imported_url':
         # NOTE(review): these attachments may also be retrievable with
         # zot.dump like 'imported_file' - confirm before switching.
         # Bounded wait, and fail loudly on HTTP errors rather than saving
         # an error page under a .pdf name.
         res = requests.get(attachment['data']['url'], timeout=60)
         res.raise_for_status()
         with open(file_path, 'wb') as f:
             f.write(res.content)
     else:
         raise ValueError(f'Unsupported link mode: {link_mode} for {attachment["key"]}.')
     paths.append(file_path)
     # The bibliography entry is HTML: reduce it to text, then drop the label.
     citations.append(citation_label.sub('', BeautifulSoup(parent, 'html.parser').get_text().strip()))

 # Register every downloaded PDF together with its citation text.
 for d, c in zip(paths, citations):
     docs.add(d, c)

 # Ask the original question (without the keyword-search prompt) and save
 # the formatted answer. UTF-8 is requested explicitly so non-ASCII
 # characters in the answer or citations do not depend on the platform's
 # default encoding.
 answer = docs.query(question)
 with open('data/zotero-answer.txt', 'w', encoding='utf-8') as f:
     f.write(answer.formatted_answer)
	# Answer a research question from the PDFs of a Zotero collection:
	# generate keyword queries, search Zotero for matching PDF attachments,
	# fetch the files locally, feed them to paperqa and save the answer.
	# The nested indentation of this copy had been lost; it is restored here.
	import os
	os.environ['OPENAI_API_KEY'] = '<Your OpenAI API Key>'

	# See here on how to find your Zotero info: https://github.com/urschrei/pyzotero#quickstart
	ZOTERO_USER_ID = '<Your Zotero User ID>'
	ZOTERO_API_KEY = '<Your Zotero API Key>'
	ZOTERO_COLLECTION_ID = '<Your Zotero Collection ID>'

	question = 'What predictive models are used in materials discovery?'
	# The following prompt instruction is injected to limit the number of keywords per query
	question_prompt = 'A "keyword search" is a list of no more than 3 words, which separated by whitespace only and with no boolean operators (e.g. "dog canine puppy"). Avoid adding any new words not in the question unless they are synonyms to the existing words.'

	from paperqa import Docs
	from pyzotero import zotero
	import requests
	import shutil, sys, re
	from bs4 import BeautifulSoup

	docs = Docs()

	queries = docs.generate_search_query(question + '\n' + question_prompt)
	print(f'Search queries: {", ".join(queries)}')

	zot = zotero.Zotero(ZOTERO_USER_ID, 'user', ZOTERO_API_KEY)

	# One search per generated query, restricted to attachment items.
	searches = [
	    zot.collection_items(
	        ZOTERO_COLLECTION_ID,
	        q=q.strip('"'),
	        limit=10,
	        itemType='attachment',
	        qmode='everything',
	    )
	    for q in queries
	]
	# Key by item key so duplicates across queries collapse. Only PDFs with a
	# parent item are kept, because the citation lookup is keyed by the parent
	# and indexing 'parentItem' directly raises KeyError for standalone files.
	attachments = {
	    item['key']: item
	    for search in searches
	    for item in search
	    if item['data'].get('contentType') == 'application/pdf'
	    and item['data'].get('parentItem')
	}.values()

	parents = {a['data']['parentItem'] for a in attachments}
	citation_dict = {p: zot.item(p, content='bib', style='american-chemical-society')[0] for p in parents}
	result_count = len(parents)

	if result_count == 0:
	    print('No matched results in Zotero')
	    sys.exit()
	print(f'Results: {result_count}')

	paths = []
	citations = []

	# Every branch below writes into this folder; create it up front.
	os.makedirs('data/zotero', exist_ok=True)
	# Leading numeric label such as "(1) " on a bibliography entry (raw
	# string avoids invalid escape sequences in the pattern literal).
	citation_label = re.compile(r'^\(\d+\)\s+')

	for attachment in attachments:
	    link_mode = attachment['data']['linkMode']
	    file_path = f'data/zotero/{attachment["key"]}.pdf'
	    parent = citation_dict[attachment['data']['parentItem']]
	    if link_mode == 'imported_file':
	        zot.dump(attachment['key'], f'{attachment["key"]}.pdf', 'data/zotero')
	    elif link_mode == 'linked_file':
	        shutil.copy(attachment['data']['path'], file_path)
	    elif link_mode == 'imported_url':
	        # Bounded wait, and fail on HTTP errors instead of saving an
	        # error page under a .pdf name.
	        res = requests.get(attachment['data']['url'], timeout=60)
	        res.raise_for_status()
	        with open(file_path, 'wb') as f:
	            f.write(res.content)
	    else:
	        raise ValueError(f'Unsupported link mode: {link_mode} for {attachment["key"]}.')
	    paths.append(file_path)
	    # The bibliography entry is HTML: reduce to text, then drop the label.
	    citations.append(citation_label.sub('', BeautifulSoup(parent, 'html.parser').get_text().strip()))

	for d, c in zip(paths, citations):
	    docs.add(d, c)

	answer = docs.query(question)
	# Explicit UTF-8 so non-ASCII text does not depend on the platform default.
	with open('data/zotero-answer.txt', 'w', encoding='utf-8') as f:
	    f.write(answer.formatted_answer)