Skip to content

Instantly share code, notes, and snippets.

Created August 29, 2017 04:55
Show Gist options
  • Save andrewfowlie/4e556851384c35f2031b080341f70d19 to your computer and use it in GitHub Desktop.
Save andrewfowlie/4e556851384c35f2031b080341f70d19 to your computer and use it in GitHub Desktop.
Convert arXiv webpage into markdown entry for webpage in Hugo academic theme
Convert arXiv code into markdown entry for webpage in Hugo academic theme
Main usage, e.g.,
>>> markdown('1607.06608')
to scrape into a publication for Hugo
academic theme.
import requests
import datetime
import collections
import sys
import re
from lxml import html
# URL templates; "{}" is replaced by the DOI / arXiv identifier. A bare "{}"
# would hand requests a schemeless string and produce a DOI "link" that is
# not a URL.
URL_DOI = "https://doi.org/{}"
URL_ARXIV = "https://arxiv.org/abs/{}"

# strptime format of the date as it is parsed from the arXiv dateline,
# e.g. "22 Jul 2016".
DATE_ARXIV = "%d %b %Y"
# strftime format of the date written to the markdown front matter.
DATE_MD = "%Y-%m-%d"

# Record holding the fields of one arXiv entry. The same type is used for
# the raw scraped values and for the markdown-formatted values.
FETCH = collections.namedtuple('arXiv', 'date title authors abstract url_pdf url_doi')
# Front matter of a Hugo academic theme publication entry. The {placeholders}
# are filled by str.format with values that are already quoted/escaped, so
# they are inserted verbatim. The "+++" fences delimit TOML front matter and
# the [[url_custom]] table turns name/url into a custom link button.
TEMPLATE = """+++
abstract = {abstract}
abstract_short = ""
authors = {authors}
date = {date}
image_preview = ""
math = true
publication_types = ["2"]
publication = ""
publication_short = ""
selected = false
title = {title}
url_code = ""
url_dataset = ""
url_pdf = {url_pdf}
url_project = ""
url_slides = ""
url_video = ""

[[url_custom]]
name = "DOI"
url = {url_doi}
+++
"""
def wrap_quote(item):
    """
    Convert an item to a double-quoted string.

    Embedded double quotes are escaped so that the result remains a single
    quoted string. Backslashes are left untouched; callers that need them
    escaped must do so before calling this function.

    :param item: Object to be quoted; converted with str()
    :returns: Argument stripped of leading and trailing whitespace and
        wrapped in quotation marks
    """
    text = str(item).strip().replace('"', '\\"')
    return '"{}"'.format(text)
def parse_date(date):
    """
    Extract the submission date from an arXiv dateline.

    The first date of the form "22 Jul 2016" found in the text is used, so
    surrounding brackets, leading whitespace and any trailing revision
    information, e.g. "(v1), last revised 5 Oct 2016", are ignored.

    :param date: Dateline text, e.g. "(Submitted on 22 Jul 2016)"
    :returns: Date in format for Hugo academic theme, wrapped in quotation marks
    :raises ValueError: If no date is found in the text
    """
    match = re.search(r"\d{1,2}\s+[A-Za-z]{3}\s+\d{4}", date)
    if match is None:
        raise ValueError("no submission date found in {!r}".format(date))
    parsed = datetime.datetime.strptime(match.group(0), DATE_ARXIV)
    return wrap_quote(parsed.strftime(DATE_MD))
def parse_author(author):
    """
    Reorder an author name from "Lastname, Firstname" to "Firstname Lastname".

    Only the first comma separates the last name from the rest, so no part
    of the name is dropped when there are further commas. A name without a
    comma (e.g. a collaboration) is returned as is, stripped of surrounding
    whitespace.

    :param author: Author in format "Lastname, Firstname"
    :returns: Author in format Firstname Lastname
    """
    last, comma, first = author.partition(",")
    last = last.strip()
    first = first.strip()
    if not comma or not first:
        # Nothing to reorder: single name or dangling comma.
        return last
    return "{} {}".format(first, last)
def parse_tex(tex):
    """
    Escape text containing LaTeX for use in the markdown front matter.

    Each backslash is doubled, each underscore is prefixed by two
    backslashes and newlines are replaced by spaces. The backslashes must be
    doubled first, otherwise the ones added in front of underscores would be
    doubled again.

    :param tex: Text possibly containing LaTeX, e.g. a title or abstract
    :returns: LaTeX code but with special characters escaped for markdown,
        wrapped in quotation marks
    """
    escaped = (
        tex.replace("\\", "\\\\")
        .replace("_", "\\\\_")
        .replace("\n", " ")
    )
    return wrap_quote(escaped)
def parse_doi(url_doi):
    """
    Format a DOI as a quoted URL.

    :param url_doi: DOI, or an empty value if there is none
    :returns: URL for DOI, if there is one, wrapped in quotation marks;
        otherwise an empty quoted string
    """
    # Strip before testing so that a whitespace-only value counts as "no DOI"
    # rather than producing a link that consists of the URL prefix alone.
    doi = url_doi.strip() if url_doi else ""
    if not doi:
        return '""'
    return wrap_quote(URL_DOI.format(doi))
def fetch_raw(code):
    """
    Download the arXiv page for a code and scrape the entry from it.

    :param code: arXiv code, e.g. 1607.06608
    :returns: Data scraped from arXiv page, as a FETCH record of raw strings
        (authors is a list of strings; url_doi is "" if the page has no DOI)
    :raises requests.HTTPError: If the page request returns an error status
    :raises IndexError: If a required element is missing from the page
    """
    url_arxiv = URL_ARXIV.format(code)
    # A timeout prevents the script from hanging forever on a stalled connection.
    page = requests.get(url_arxiv, timeout=30)
    # Fail here with a clear HTTP error rather than later with an IndexError
    # when scraping an error page.
    page.raise_for_status()
    tree = html.fromstring(page.content)

    def meta(name):
        # Select the content attribute by name instead of relying on the
        # order in which the attributes of the <meta> tag appear.
        return tree.xpath('//meta[@name="{}"]/@content'.format(name))

    date = tree.xpath('//div[@class="dateline"]/text()')[0]
    title = tree.xpath('//h1[@class="title mathjax"]/text()')[0]
    abstract = tree.xpath('//blockquote[@class="abstract mathjax"]/text()')[-1]
    url_pdf = meta("citation_pdf_url")[0]
    authors = meta("citation_author")

    # The DOI is optional: not every entry has one.
    doi = meta("citation_doi")
    url_doi = doi[0] if doi else ""

    return FETCH(date, title, authors, abstract, url_pdf, url_doi)
def fetch_markdown(code):
    """
    Scrape an arXiv entry and format every field for the markdown template.

    :param code: arXiv code, e.g. 1607.06608
    :returns: Data scraped from arXiv page in markdown format, as a FETCH
        record in which every field is a string ready to be inserted verbatim
    """
    data = fetch_raw(code)

    # Build the list of authors from individually quoted names rather than
    # from the repr of a Python list, so that the quoting does not depend on
    # how Python chooses to print strings.
    names = [wrap_quote(parse_author(a)) for a in data.authors]
    authors = "[{}]".format(", ".join(names))

    return FETCH(
        date=parse_date(data.date),
        title=parse_tex(data.title),
        authors=authors,
        abstract=parse_tex(data.abstract),
        url_pdf=wrap_quote(data.url_pdf),
        url_doi=parse_doi(data.url_doi),
    )
def markdown(code):
    """
    Make the markdown entry of a publication from its arXiv code.

    :param code: arXiv code, e.g. 1607.06608
    :returns: Markdown entry for publication in Hugo academic theme
    """
    data = fetch_markdown(code)
    # The field names of the record match the placeholders in TEMPLATE.
    return TEMPLATE.format(**data._asdict())
if __name__ == "__main__":
    # Command-line use: print the markdown entry of the arXiv code given as
    # the first argument.
    if len(sys.argv) < 2:
        sys.exit("usage: {} ARXIV_CODE".format(sys.argv[0]))
    code = sys.argv[1]
    # Calling print with a single parenthesised argument works on both
    # Python 2 and Python 3.
    print(markdown(code))
Copy link

@gcushen users of your great theme with publications on the arXiv may find this useful. I used it to scrape the data from my publications into my markdown entries for my webpage.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment