Skip to content

Instantly share code, notes, and snippets.

@cronokirby
Created June 25, 2022 21:39
Show Gist options
  • Save cronokirby/23b3d68542e1e320fd0d0a113361b124 to your computer and use it in GitHub Desktop.
Save cronokirby/23b3d68542e1e320fd0d0a113361b124 to your computer and use it in GitHub Desktop.
Script to download papers from the IACR ePrint archive, saving each paper as a PDF together with a Markdown note of its metadata
from bs4 import BeautifulSoup
from dataclasses import dataclass
import os
from typing import List
import re
import requests
import sys
# Request headers sent with every HTTP request this script makes; the
# User-Agent is that of a desktop Chrome browser.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
}

# Number of bytes requested per chunk when streaming a download (4 KiB).
CHUNK_SIZE = 4096
def download_html(url: str, timeout: float = 30.0) -> bytes:
    """Fetch *url* and return the raw response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The undecoded bytes of the response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(
        url, allow_redirects=True, headers=HEADERS, timeout=timeout
    )
    # Fail here with a clear HTTP error instead of handing an error page to
    # the HTML parser, where it would surface as an obscure attribute error.
    response.raise_for_status()
    return response.content
def download_pdf(url: str, destination: str, timeout: float = 30.0) -> None:
    """Stream the file at *url* to the local path *destination*.

    Args:
        url: Address of the file to download.
        destination: Path of the file to create (overwritten if it exists).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
        OSError: If *destination* cannot be opened for writing.
    """
    # The context manager releases the streamed connection even on errors.
    with requests.get(
        url, headers=HEADERS, stream=True, timeout=timeout
    ) as response:
        # Check the status before opening the file so that an error page is
        # never saved to disk under the destination name.
        response.raise_for_status()
        with open(destination, "wb") as fp:
            for chunk in response.iter_content(CHUNK_SIZE):
                fp.write(chunk)
def split_oxford(s: str) -> List[str]:
    """Split a human-readable list of names into its individual items.

    Handles the Oxford-comma form ("A, B, and C"), the two-item form
    ("A and B"), the form without the final comma ("A, B and C") and a
    single item ("A").

    Args:
        s: The list as one string, items separated by ", " and/or " and ".

    Returns:
        The items in their original order; an empty list for blank input.
    """
    if not s.strip():
        return []
    parts = s.split(", ")
    last = parts[-1]
    if last.startswith("and "):
        # Oxford comma: the final item arrives as "and C".
        parts[-1] = last[len("and "):]
    else:
        # No comma before the conjunction: the final chunk is "B and C".
        head, sep, tail = last.rpartition(" and ")
        if sep:
            parts[-1:] = [head, tail]
    return parts
@dataclass
class PaperInfo:
    """Metadata describing one paper, as scraped from its web page."""

    url: str  # address of the page the metadata was scraped from
    title: str
    authors: List[str]
    abstract: str
    year: str  # year component of the "received" date, kept as text
    month: str  # month component of the "received" date, kept as text

    @staticmethod
    def _safe_name(name: str) -> str:
        """Replace path separators so *name* stays a single path component."""
        return name.replace("/", "-").replace("\\", "-")

    def markdown(self) -> str:
        """Return the body of the Markdown note for this paper.

        The note holds wiki-style links for the year, the source and each
        author, followed by the abstract under an "Abstract" heading.
        """
        return "\n".join(
            [
                f"[[{self.year}]]",
                f"[[IACR ePrint]] {self.url}",
                ", ".join(f"[[{x}]]" for x in self.authors),
                "",
                "# Abstract",
                "",
                self.abstract,
            ]
        )

    def markdown_title(self) -> str:
        """Return the title in a form usable as the note's file name.

        ": " is rewritten to " - ", and path separators are replaced so the
        title cannot escape the target folder or name a missing directory.
        """
        return self._safe_name(self.title.replace(": ", " - "))

    def file_name(self) -> str:
        """Return the PDF file name: "(year-month) title - authors.pdf".

        Path separators anywhere in the name are replaced with "-".
        """
        return self._safe_name(
            f'({self.year}-{self.month}) {self.title} - {", ".join(self.authors)}.pdf'
        )

    @staticmethod
    def from_html(url: str, html) -> "PaperInfo":
        """Build a PaperInfo by scraping the HTML of a paper's page.

        The page is expected to hold the title in the first ``<h3>`` inside
        ``<main>``, followed by sibling ``<p>`` elements for the author list
        and the abstract (in that order), and a text node mentioning
        "received" that carries a ``YYYY-MM`` date.

        Args:
            url: Address the HTML was fetched from; stored unchanged.
            html: The page markup, as accepted by BeautifulSoup.

        Raises:
            RuntimeError: If the title, the received date, or the author and
                abstract paragraphs cannot be located.
        """
        soup = BeautifulSoup(html, "html.parser")

        body = soup.body
        main_el = body.main if body is not None else None
        title_el = main_el.h3 if main_el is not None else None
        if title_el is None:
            raise RuntimeError("no title heading for paper found")
        # Collapse whitespace so a title wrapped across lines in the markup
        # still yields a clean single-line title (it ends up in file names).
        title = " ".join(title_el.get_text().split())

        # Next, find the year and month by looking for when the paper was
        # received. Several text nodes may mention "received" (the abstract
        # could, for instance), so take the first that also carries a date.
        date_match = None
        for text in soup.find_all(string=re.compile("received")):
            date_match = re.search(r"(\d{4})-(\d{2})", text)
            if date_match is not None:
                break
        if date_match is None:
            raise RuntimeError("no received date for paper found")
        year, month = date_match.groups()

        title_siblings = title_el.find_next_siblings("p")
        if len(title_siblings) < 2:
            raise RuntimeError("no author list and abstract found after the title")
        # get_text() gathers the whole paragraph, so inline markup (links,
        # emphasis) no longer truncates the text at the first child node.
        authors = split_oxford(" ".join(title_siblings[0].get_text().split()))
        abstract = title_siblings[1].get_text().strip()
        return PaperInfo(url, title, authors, abstract, year, month)
def main(paper_url: str, markdown_folder: str, paper_folder: str) -> None:
    """Download one paper: write its Markdown note and save its PDF.

    Args:
        paper_url: Address of the paper's page; the PDF is fetched from the
            same address with ".pdf" appended.
        markdown_folder: Existing folder that receives the Markdown note.
        paper_folder: Existing folder that receives the PDF.
    """
    html = download_html(paper_url)
    paper = PaperInfo.from_html(paper_url, html)
    print(f'Downloaded metadata for "{paper.title}"')

    markdown_path = os.path.join(markdown_folder, f"{paper.markdown_title()}.md")
    print("Writing markdown to ", markdown_path)
    # Abstracts routinely contain non-ASCII characters; pin the encoding so
    # the write does not depend on the platform's default codec.
    with open(markdown_path, "w", encoding="utf-8") as fp:
        fp.write(paper.markdown())

    paper_path = os.path.join(paper_folder, paper.file_name())
    print("Saving paper to ", paper_path)
    # Tolerate a trailing slash on the page URL when deriving the PDF URL.
    paper_pdf_url = paper_url.rstrip("/") + ".pdf"
    download_pdf(paper_pdf_url, paper_path)
if __name__ == "__main__":
    # main() takes exactly three positional arguments; print a usage line
    # instead of a TypeError traceback when the count is wrong.
    if len(sys.argv) != 4:
        sys.exit(f"usage: {sys.argv[0]} PAPER_URL MARKDOWN_FOLDER PAPER_FOLDER")
    main(*sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment