-
-
Save gauravssnl/546dfe447caaefe1ccb9fb90cf9110ec to your computer and use it in GitHub Desktop.
Script to download a paper from the IACR ePrint archive: saves the PDF and generates a markdown metadata note.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
from dataclasses import dataclass | |
import os | |
from typing import List | |
import re | |
import requests | |
import sys | |
# Browser-like User-Agent header sent with every request — presumably so the
# server treats us like a normal browser rather than a script; TODO confirm
# eprint.iacr.org actually requires this.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
}
# Size (4 KiB) of each chunk streamed to disk when downloading the PDF.
CHUNK_SIZE = 1 << 12
def download_html(url: str) -> bytes:
    """Fetch *url* (following redirects) and return the raw response body.

    Raises:
        requests.HTTPError: on a non-2xx status, so a bad paper URL fails
            loudly instead of silently parsing an error page.
        requests.Timeout: if the server does not respond within 30 seconds
            (the original had no timeout and could hang forever).
    """
    r = requests.get(url, allow_redirects=True, headers=HEADERS, timeout=30)
    r.raise_for_status()
    return r.content
def download_pdf(url: str, destination: str) -> None:
    """Stream the file at *url* to *destination* in CHUNK_SIZE pieces.

    Uses the response as a context manager: with ``stream=True`` the
    connection is only released when the response is closed, so the
    original version leaked the connection. Also adds a timeout and a
    status check so a 404 does not get saved as a bogus PDF.
    """
    with requests.get(url, headers=HEADERS, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(destination, "wb") as fp:
            for chunk in r.iter_content(CHUNK_SIZE):
                fp.write(chunk)
def split_oxford(s: str) -> List[str]:
    """Split an English author list into individual names.

    Generalizes the original, which only handled the Oxford-comma form
    "A, B, and C": now also splits "A and B" and the non-Oxford
    "A, B and C". Previously handled inputs produce identical results.

    Args:
        s: author list such as "Alice, Bob, and Carol".

    Returns:
        List of individual names; empty list for an empty string.
    """
    if not s:
        return []
    # Separators: ", " optionally followed by "and ", or a bare " and ".
    return [part for part in re.split(r",\s*(?:and\s+)?|\s+and\s+", s) if part]
@dataclass
class PaperInfo:
    """Metadata scraped from one IACR ePrint paper landing page."""

    url: str            # landing-page URL of the paper
    title: str
    authors: List[str]  # individual author names, in page order
    abstract: str
    year: str           # taken from the page's "received" date
    month: str

    def markdown(self) -> str:
        """Render the paper as a markdown note with wiki-style [[links]]."""
        lines = [
            f"[[{self.year}]]",
            f"[[IACR ePrint]] {self.url}",
            ", ".join(f"[[{author}]]" for author in self.authors),
            "",
            "# Abstract",
            "",
            self.abstract,
        ]
        return "\n".join(lines)

    def markdown_title(self) -> str:
        """Title with ": " replaced so it is usable as a markdown filename."""
        return self.title.replace(": ", " - ")

    def file_name(self) -> str:
        """PDF filename in the form "(year-month) title - author1, author2.pdf"."""
        return (
            f'({self.year}-{self.month}) {self.title} - {", ".join(self.authors)}.pdf'
        )

    @staticmethod
    def from_html(url, html):
        """Parse an ePrint paper page into a PaperInfo.

        Args:
            url: the page URL, stored verbatim on the result.
            html: raw HTML bytes/str of the landing page.

        Raises:
            RuntimeError: if the "received" date text is missing or does not
                contain the expected "-"-separated date.
        """
        soup = BeautifulSoup(html, "html.parser")
        title_el = soup.body.main.h3
        # str() detaches the title from the parse tree; a bare NavigableString
        # keeps the entire soup alive via its parent references.
        title = str(title_el.contents[0])
        # Locate the text node mentioning when the paper was received.
        # `string=` replaces the deprecated `text=` keyword in bs4.
        received_info = soup.find(string=re.compile("received"))
        if received_info is None:
            raise RuntimeError("no received date for paper found")
        title_siblings = title_el.find_next_siblings("p")
        authors = split_oxford(title_siblings[0].contents[0].strip())
        abstract = title_siblings[1].contents[0].strip()
        # Assumes the received text starts "YYYY-MM-..." — TODO confirm
        # against a live page; guard instead of raising a bare IndexError.
        received_info_parts = received_info.split("-")
        if len(received_info_parts) < 2:
            raise RuntimeError(
                f"unexpected received date format: {received_info!r}"
            )
        year = received_info_parts[0]
        month = received_info_parts[1]
        return PaperInfo(url, title, authors, abstract, year, month)
def main(paper_url: str, markdown_folder: str, paper_folder: str) -> None:
    """Download one ePrint paper: write a markdown note and save its PDF.

    Args:
        paper_url: landing-page URL; the PDF is fetched from ``paper_url + ".pdf"``.
        markdown_folder: existing directory for the generated ``.md`` note.
        paper_folder: existing directory for the downloaded PDF.
    """
    html = download_html(paper_url)
    paper = PaperInfo.from_html(paper_url, html)
    print(f'Downloaded metadata for "{paper.title}"')

    markdown_path = os.path.join(markdown_folder, f"{paper.markdown_title()}.md")
    print("Writing markdown to ", markdown_path)
    # Explicit encoding: titles and abstracts routinely contain non-ASCII.
    with open(markdown_path, "w", encoding="utf-8") as fp:
        fp.write(paper.markdown())

    paper_path = os.path.join(paper_folder, paper.file_name())
    print("Saving paper to ", paper_path)
    # Reuse the computed path (the original rebuilt the join a second time).
    download_pdf(paper_url + ".pdf", paper_path)
if __name__ == "__main__":
    # Fail with a readable usage line instead of an opaque TypeError when the
    # wrong number of arguments is given.
    if len(sys.argv) != 4:
        print(
            f"usage: {sys.argv[0]} <paper_url> <markdown_folder> <paper_folder>",
            file=sys.stderr,
        )
        sys.exit(1)
    main(*sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment