Skip to content

Instantly share code, notes, and snippets.

@gauravssnl
Forked from cronokirby/eprintdl.py
Created January 5, 2023 15:04
Show Gist options
  • Save gauravssnl/546dfe447caaefe1ccb9fb90cf9110ec to your computer and use it in GitHub Desktop.
Save gauravssnl/546dfe447caaefe1ccb9fb90cf9110ec to your computer and use it in GitHub Desktop.
Script to download papers from the IACR ePrint archive as PDF files, together with Markdown notes
from bs4 import BeautifulSoup
from dataclasses import dataclass
import os
from typing import List
import re
import requests
import sys
# Request headers attached to every HTTP request this script makes; the
# User-Agent value is a desktop Chrome identification string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.77 Safari/537.36"
    ),
}

# Number of bytes requested per chunk when streaming a PDF to disk (4 KiB).
CHUNK_SIZE = 4096
def download_html(url: str, timeout: float = 30.0) -> bytes:
    """Fetch *url* and return the raw (undecoded) response body.

    Redirects are followed and the module-level ``HEADERS`` are sent with
    the request.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled server.

    Returns:
        The response body as bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx or 5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(
        url, allow_redirects=True, headers=HEADERS, timeout=timeout
    )
    # Fail here rather than handing an error page to the HTML parser, where
    # it would surface as a confusing scraping error.
    response.raise_for_status()
    return response.content
def download_pdf(url: str, destination: str, timeout: float = 30.0) -> None:
    """Stream the document at *url* into the file *destination*.

    The body is written in ``CHUNK_SIZE``-byte pieces so the whole PDF never
    has to be held in memory.

    Args:
        url: Address of the PDF to download.
        destination: Path of the file to create (overwritten if it exists).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx or 5xx status.
        requests.RequestException: On connection failures or timeouts.
        OSError: If *destination* cannot be opened for writing.
    """
    # The context manager releases the streamed connection even when writing
    # fails part-way through.
    with requests.get(
        url, headers=HEADERS, stream=True, timeout=timeout
    ) as response:
        # Check the status before touching the disk so an HTML error page is
        # never saved under a ".pdf" name.
        response.raise_for_status()
        with open(destination, "wb") as fp:
            for chunk in response.iter_content(CHUNK_SIZE):
                fp.write(chunk)
def split_oxford(s: str) -> List[str]:
    """Split a human-readable list of names into its individual items.

    Handles the Oxford-comma form ("A, B, and C"), the form without the
    final comma ("A, B and C"), and lists joined only by "and" ("A and B").

    >>> split_oxford("Alice, Bob, and Carol")
    ['Alice', 'Bob', 'Carol']
    >>> split_oxford("Alice and Bob")
    ['Alice', 'Bob']

    Args:
        s: The list as a single string, items separated by ", ".

    Returns:
        The items in their original order. A string without any separator
        is returned as a one-element list.
    """
    parts = s.split(", ")
    last = parts[-1]
    if last.startswith("and "):
        # Oxford comma: the final chunk is "and <name>".
        parts[-1] = last[len("and "):]
    elif " and " in last:
        # No comma before "and": the final chunk still holds several names.
        parts[-1:] = last.split(" and ")
    return parts
@dataclass
class PaperInfo:
    """Metadata for a single paper, scraped from its web page."""

    url: str  # address of the paper's page
    title: str
    authors: List[str]
    abstract: str
    year: str  # first "-"-separated field of the page's "received" text
    month: str  # second "-"-separated field of the page's "received" text

    @staticmethod
    def _sanitize(name: str) -> str:
        """Return *name* with path separators replaced by "-".

        Keeps a title such as "AES/GCM" from being interpreted as a
        directory when the name is joined onto a folder path.
        """
        return name.replace("/", "-").replace("\\", "-")

    def markdown(self) -> str:
        """Render the Markdown note body for this paper.

        The note holds wiki-style ``[[...]]`` links for the year, the source
        and each author, followed by an "Abstract" section.
        """
        author_links = ", ".join(f"[[{x}]]" for x in self.authors)
        lines = [
            f"[[{self.year}]]",
            f"[[IACR ePrint]] {self.url}",
            author_links,
            "",
            "# Abstract",
            "",
            self.abstract,
        ]
        return "\n".join(lines)

    def markdown_title(self) -> str:
        """Return the title in a form usable as the Markdown file's name.

        ": " becomes " - " and path separators become "-".
        """
        return self._sanitize(self.title.replace(": ", " - "))

    def file_name(self) -> str:
        """Return the file name under which the paper's PDF is stored.

        The name has the form "(YEAR-MONTH) TITLE - AUTHOR, AUTHOR.pdf",
        with any path separators replaced by "-".
        """
        authors = ", ".join(self.authors)
        stem = f"({self.year}-{self.month}) {self.title} - {authors}"
        return f"{self._sanitize(stem)}.pdf"

    @classmethod
    def from_html(cls, url, html) -> "PaperInfo":
        """Build a ``PaperInfo`` by scraping a paper's HTML page.

        Args:
            url: Address the page was fetched from; stored unchanged.
            html: The page markup, as accepted by ``BeautifulSoup``.

        Returns:
            The metadata extracted from the page.

        Raises:
            RuntimeError: If the title, the authors/abstract paragraphs or
                the received date cannot be located in the page.
        """
        soup = BeautifulSoup(html, "html.parser")

        # The title is the first <h3> inside <body><main>; any missing level
        # shows up as an attribute lookup on None.
        try:
            title_el = soup.body.main.h3
        except AttributeError:
            title_el = None
        if title_el is None or not title_el.contents:
            raise RuntimeError("no title for paper found")
        title = str(title_el.contents[0]).strip()

        # Next, find the year and month by looking for when the paper was received
        received_info = soup.find(string=re.compile("received"))
        if received_info is None:
            raise RuntimeError("no received date for paper found")

        # The first two <p> siblings after the title hold authors and abstract.
        title_siblings = title_el.find_next_siblings("p")
        if len(title_siblings) < 2:
            raise RuntimeError("no authors or abstract for paper found")
        authors = split_oxford(title_siblings[0].contents[0].strip())
        abstract = title_siblings[1].contents[0].strip()

        received_info_parts = received_info.split("-")
        if len(received_info_parts) < 2:
            raise RuntimeError("no received date for paper found")
        # Strip surrounding whitespace so it cannot leak into file names.
        year = received_info_parts[0].strip()
        month = received_info_parts[1].strip()
        return cls(url, title, authors, abstract, year, month)
def main(paper_url: str, markdown_folder: str, paper_folder: str):
    """Download one paper: write a Markdown note and save the PDF.

    Args:
        paper_url: Address of the paper's page; the PDF is fetched from the
            same address with ".pdf" appended.
        markdown_folder: Existing directory that receives the ".md" note.
        paper_folder: Existing directory that receives the PDF.
    """
    html = download_html(paper_url)
    paper = PaperInfo.from_html(paper_url, html)
    print(f'Downloaded metadata for "{paper.title}"')

    markdown_path = os.path.join(markdown_folder, f"{paper.markdown_title()}.md")
    print("Writing markdown to ", markdown_path)
    # Abstracts regularly contain non-ASCII characters; an explicit encoding
    # keeps the write independent of the platform's default locale.
    with open(markdown_path, "w", encoding="utf-8") as fp:
        fp.write(paper.markdown())

    paper_path = os.path.join(paper_folder, paper.file_name())
    print("Saving paper to ", paper_path)
    download_pdf(paper_url + ".pdf", paper_path)
if __name__ == "__main__":
    # main() takes exactly three positional arguments; report a usage line
    # instead of letting a wrong count surface as a TypeError traceback.
    if len(sys.argv) != 4:
        sys.exit(f"usage: {sys.argv[0]} PAPER_URL MARKDOWN_FOLDER PAPER_FOLDER")
    main(*sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment