Skip to content

Instantly share code, notes, and snippets.

@vzhong
Last active October 25, 2024 09:41
Show Gist options
  • Save vzhong/9d6358b8fb1c63e801cd53c01ffa1af8 to your computer and use it in GitHub Desktop.
Save vzhong/9d6358b8fb1c63e801cd53c01ffa1af8 to your computer and use it in GitHub Desktop.
Download paper to Obsidian MD
#!/usr/bin/env python
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import os
import re
import pathlib
import arxiv
import openreview
import urllib.request
# Captures the note ID from an OpenReview forum URL ("...forum?id=<ID>").
# The capture stops at '&' and '#' so that trailing query parameters or a
# fragment (e.g. "&noteId=...") are not mistaken for part of the ID.
OPENREVIEW_RE = re.compile(r'forum\?id=([^\s&#]+)')
# Captures the identifier from an arXiv abstract URL ("...abs/<ID>").
# Only digits and dots are matched, so a version suffix such as "v2" is not
# part of the captured ID.
ARXIV_RE = re.compile(r'abs/([0-9\.]+)')
def parse_open_review(url, dobsidian, tree, download_pdf=False):
    """Create an Obsidian note for the OpenReview paper referenced by ``url``.

    Args:
        url: OpenReview forum URL; must contain ``forum?id=<ID>``.
        dobsidian: Root directory of the Obsidian vault.
        tree: Sub-directory of the vault under which papers are stored.
        download_pdf: Forwarded to ``create_entry``; when true a local copy
            of the PDF is saved.

    Raises:
        ValueError: If no note ID can be extracted from ``url``.
    """
    # Validate the URL before constructing the client so a malformed URL
    # fails fast without touching the network.
    ids = OPENREVIEW_RE.findall(url)
    if not ids:
        raise ValueError('Could not find ID from {}'.format(url))

    # NOTE(review): the client presumably picks up OPENREVIEW_USERNAME /
    # OPENREVIEW_PASSWORD from the environment -- confirm against openreview-py.
    client = openreview.Client('https://api.openreview.net')
    content = client.get_note(ids[0]).content

    # 'preprint' is used as the sub-folder for the note. Use .get() so a note
    # without that field falls back to 'unsorted' instead of raising KeyError.
    preprint = content.get('preprint') or 'unsorted'
    # The 'pdf' field is a site-relative path; make it an absolute URL.
    pdf = 'https://openreview.net/{}'.format(content['pdf'].lstrip('/'))
    create_entry(
        url,
        content['title'],
        content['authors'],
        preprint,
        content['abstract'],
        dobsidian,
        tree,
        pdf,
        download_pdf=download_pdf,
    )
def parse_arxiv(url, dobsidian, tree, download_pdf=False):
    """Create an Obsidian note for the arXiv paper referenced by ``url``.

    Args:
        url: arXiv abstract URL; must contain ``abs/<ID>``.
        dobsidian: Root directory of the Obsidian vault.
        tree: Sub-directory of the vault under which papers are stored.
        download_pdf: Forwarded to ``create_entry``; when true a local copy
            of the PDF is saved.

    Raises:
        ValueError: If no arXiv ID can be extracted from ``url`` or the
            search yields no paper for that ID.
    """
    ids = ARXIV_RE.findall(url)
    if not ids:
        raise ValueError('Could not find ID from {}'.format(url))
    arxiv_id = ids[0]

    search = arxiv.Search(id_list=[arxiv_id])
    # Supply a default so an empty result set surfaces as a clear ValueError
    # rather than a bare StopIteration.
    paper = next(search.results(), None)
    if paper is None:
        raise ValueError('No arXiv paper found for ID {}'.format(arxiv_id))

    create_entry(
        paper.entry_id,  # the URL reported by arXiv replaces the one passed in
        paper.title,
        [a.name for a in paper.authors],
        'unsorted',  # arXiv notes always go into the 'unsorted' folder
        paper.summary,
        dobsidian,
        tree,
        paper.pdf_url + '.pdf',
        download_pdf=download_pdf,
    )
def _safe_filename(name):
    """Return ``name`` with path separators replaced so it is a single path component."""
    for sep in (os.sep, os.altsep):
        if sep:
            name = name.replace(sep, '-')
    return name


def create_entry(url, title, authors, preprint, abstract, dobsidian, tree, pdf, download_pdf=False):
    """Write a Markdown note for a paper, optionally saving its PDF.

    The note is written to ``<dobsidian>/<tree>/<preprint>/<title>.md``. Its
    front matter sets ``annotation-target`` to the remote PDF URL, or to the
    vault-relative path of the local copy when ``download_pdf`` is true.

    Args:
        url: Page URL of the paper, recorded in the note.
        title: Paper title; used for the heading and the file names.
        authors: List of author names.
        preprint: Sub-folder (below ``tree``) in which the note is stored.
        abstract: Abstract text appended to the note.
        dobsidian: Root directory of the Obsidian vault.
        tree: Sub-directory of the vault under which papers are stored.
        pdf: URL of the paper's PDF.
        download_pdf: When true, download ``pdf`` into ``<tree>/pdfs``.
    """
    dout = pathlib.Path(dobsidian, tree)
    dpdfs = dout.joinpath('pdfs')
    # The pdfs folder is created whether or not a PDF is downloaded.
    os.makedirs(dpdfs, exist_ok=True)

    annotation_pdf = pdf
    if download_pdf:
        # Colons are dropped from the PDF file name; path separators in the
        # title or author names are replaced so the name stays one component.
        filename = _safe_filename(
            '{} --- {}.pdf'.format(','.join(authors), title).replace(':', '')
        )
        # Close the HTTP response deterministically once the body is read.
        with urllib.request.urlopen(pdf) as response:
            data = response.read()
        with open(dpdfs.joinpath(filename), 'wb') as f:
            f.write(data)
        annotation_pdf = '{}/pdfs/{}'.format(tree, filename)

    lines = [
        '---',
        'annotation-target: {}'.format(annotation_pdf),
        '---',
        '',
        '# {}'.format(title),
        '',
        'Authors',
    ]
    lines.extend('- {}'.format(a) for a in authors)
    lines.extend([
        '',
        'URL {}'.format(url),
        'PDF {}'.format(pdf),
        '',
        '## Abstract',
        abstract,
    ])

    dnote = dout.joinpath(preprint)
    os.makedirs(dnote, exist_ok=True)
    # A '/' in the title would otherwise point into a non-existent directory.
    # Write UTF-8 explicitly so non-ASCII names and abstracts do not depend on
    # the platform's default encoding.
    note_path = dnote.joinpath('{}.md'.format(_safe_filename(title)))
    with note_path.open('wt', encoding='utf-8') as f:
        f.write('\n'.join(lines) + '\n')
# Command-line entry point: dispatch the given URL to the matching parser.
if __name__ == '__main__':
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('url', help='URL to download from.')
    parser.add_argument('--verbose', action='store_true', help='print debug info.')
    # expanduser('~') avoids a KeyError where the HOME variable is not set.
    parser.add_argument('--obsidian', default='{}/notes/Research'.format(os.path.expanduser('~')), help='where is your Obsidian root.')
    parser.add_argument('--tree', default='Papers', help='subtree of your Obsidian where papers are stored.')
    parser.add_argument('--download_pdf', action='store_true', help='download a local copy of the PDF.')
    args = parser.parse_args()

    if args.verbose:
        # The URL lives on the parsed namespace; a bare `url` is undefined here.
        print('parsing {}'.format(args.url))

    if 'openreview' in args.url:
        if args.verbose:
            print('parsing OpenReview')
        parse_open_review(args.url, args.obsidian, args.tree, download_pdf=args.download_pdf)
    elif 'arxiv' in args.url:
        if args.verbose:
            print('parsing Arxiv')
        parse_arxiv(args.url, args.obsidian, args.tree, download_pdf=args.download_pdf)
    else:
        # Report unsupported URLs instead of exiting silently with no output.
        parser.error('unsupported URL (expected an OpenReview or arXiv link): {}'.format(args.url))
@vzhong
Copy link
Author

vzhong commented Oct 12, 2021

To use OpenReview URLs, you need to set the environment variables OPENREVIEW_USERNAME and OPENREVIEW_PASSWORD.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment