Created
December 5, 2020 18:40
-
-
Save goerz/dd4e50862348949f8c05fce4596eb13b to your computer and use it in GitHub Desktop.
Python script for generating markdown/Obsidian summaries of arXiv papers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Create markdown summaries from arXiv identifiers. | |
Test with:: | |
pytest arxiv_summarize.py -s -x --doctest-modules | |
See ``--help`` for usage summary:: | |
python arxiv_summarize.py --help | |
""" | |
# MIT License | |
# | |
# Copyright (c) 2020 Michael Goerz | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in | |
# all copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
# SOFTWARE. | |
import collections | |
import itertools | |
import logging | |
import os | |
import re | |
import select | |
import sys | |
import unicodedata | |
from pathlib import Path | |
from textwrap import indent | |
import arxiv # https://github.com/lukasschwab/arxiv.py | |
import click | |
from click.testing import CliRunner | |
__version__ = '0.1.0'

# fmt: off
# These are all the primary categories present in the OAI ArXiv metadata
CATEGORIES = [
    "acc-phys", "adap-org", "alg-geom", "ao-sci", "astro-ph", "atom-ph",
    "bayes-an", "chao-dyn", "chem-ph", "cmp-lg", "comp-gas", "cond-mat", "cs",
    "dg-ga", "funct-an", "gr-qc", "hep-ex", "hep-lat", "hep-ph", "hep-th",
    "math", "math-ph", "mtrl-th", "nlin", "nucl-ex", "nucl-th", "patt-sol",
    "physics", "plasm-ph", "q-alg", "q-bio", "quant-ph", "solv-int",
    "supr-con", "eess", "econ", "q-fin", "stat"
]

# All subcategories with more than 2 capital letters (not SG, SI, SP, etc)
SUB_CATEGORIES = [
    'acc-ph', 'ao-ph', 'app-ph', 'atm-clus', 'atom-ph', 'bio-ph', 'chem-ph',
    'class-ph', 'comp-ph', 'data-an', 'dis-nn', 'ed-ph', 'flu-dyn', 'gen-ph',
    'geo-ph', 'hist-ph', 'ins-det', 'med-ph', 'mes-hall', 'mtrl-sci',
    'optics', 'other', 'plasm-ph', 'pop-ph', 'quant-gas', 'soc-ph', 'soft',
    'space-ph', 'stat-mech', 'str-el', 'supr-con'
]
# fmt: on

# Regexes adapted from
# https://github.com/mattbierbaum/arxiv-public-datasets/blob/master/arxiv_public_data/regex_arxiv.py
RE_CATEGORIES = r'(?:{})(?:(?:[.][A-Z]{{2}})|(?:{}))?'.format(
    r'|'.join(CATEGORIES), r'|'.join(SUB_CATEGORIES)
)
RE_DATE = r'(?:\d{2}[01]\d)'  # YYMM
RE_VERSION = r'(?:v[1-9]\d*)?'
RE_NUM_NEW = r'(?:\d{4,5})'
RE_NUM_OLD = r'(?:\d{3})'
# matches: 1612.00001 1203.0023v2
RE_ID_NEW = r'(?:{date}\.{number}{version})'.format(
    date=RE_DATE, number=RE_NUM_NEW, version=RE_VERSION
)
RX_ID_NEW = re.compile(
    r'(?P<date>{date})\.(?P<number>{number})(?P<version>{version})'.format(
        date=RE_DATE, number=RE_NUM_NEW, version=RE_VERSION
    )
)
# matches: hep-th/11030234 cs/0112345v2 cs.AI/0112345v2
RE_ID_OLD = r'(?:{cat}/{date}{number}{version})'.format(
    cat=RE_CATEGORIES, date=RE_DATE, number=RE_NUM_OLD, version=RE_VERSION
)
RX_ID_OLD = re.compile(
    r'(?P<cat>{cat})/(?P<date>{date})(?P<number>{number})(?P<version>{version})'.format(
        cat=RE_CATEGORIES, date=RE_DATE, number=RE_NUM_OLD, version=RE_VERSION
    )
)
RX_ID = re.compile(r'(?:%s|%s)' % (RE_ID_OLD, RE_ID_NEW))


class ArxivID:
    """An arXiv ID.

    This wraps around strings that are valid arXiv IDs in the format
    described in https://arxiv.org/help/arxiv_identifier. Both "old-style"
    IDs (submissions up to March 2007), e.g.::

        >>> id = ArxivID("math.GT/0309136")
        >>> id.date
        '0309'
        >>> id.year
        2003
        >>> id.month
        9
        >>> id.num
        136
        >>> id.number
        '136'
        >>> id.version  # None
        >>> id.category
        'math.GT'
        >>> id.archive
        'math'
        >>> id.subject
        'GT'

    and "new-style" IDs (submissions after March 2007), e.g.::

        >>> id = ArxivID("1501.00001v1")
        >>> id.date
        '1501'
        >>> id.year
        2015
        >>> id.month
        1
        >>> id.num
        1
        >>> id.number
        '00001'
        >>> id.version
        1
        >>> id.category  # None
        >>> id.archive  # None
        >>> id.subject  # None

    are supported. There is no guarantee that an instantiated ArxivID
    actually exists, only that it has the correct format.

    The IDs are converted to a canonical format, e.g. by padding the article
    number with the appropriate number of zeros (4 digits before 2015, 5
    digits starting from 2015)::

        >>> ArxivID("0706.00001v2")  # extra '0'
        ArxivID('0706.0001v2')

    An ArxivID may be used in a string context::

        >>> print("arXiv:%s" % ArxivID("1501.00001v1"))
        arXiv:1501.00001v1

    Two IDs compare as equal if they canonically refer to the same
    submission. For old-style IDs, the subject-identifier is not taken into
    account::

        >>> ArxivID("math.GT/0309136") == "math/0309136"
        True

    Also, an arXiv ID that does not contain an explicit version identifier
    compares as equal to all versions of the same submission.

        >>> ArxivID('1501.00001') == ArxivID('1501.00001v1')
        True
        >>> ArxivID('1501.00001') == ArxivID('1501.00001v2')
        True

    Args:
        string (str): String from which to extract the ID
        search (bool): If True, search for the first valid arXiv ID in
            `string`. If False `string` must be a valid arXiv ID by itself.

    Attributes:
        date (str): The year and month of the arXiv submission in the format
            "YYMM"
        year (int): The full 4-digit integer year of the arXiv submission
        month (int): The month of the arXiv submission
        num (int): The number of the arXiv submission, as an integer
        number (str): The number of the arXiv submission as a string with the
            canonical zero-padding (3 digits up to March 2007, 4 digits up to
            December 2014, 5 digits for anything newer)
        version (int or None): The version number of the submission. A value
            of None refers to the "latest" version.
        category (str or None): The full "category", e.g. "math.GT". None for
            new-style IDs (since March 2007).
        archive (str or None): The archive name, e.g. "math". May be
            identical to the `category` when the `subject` is redundant or
            optional. None for new-style IDs (since March 2007).
        subject (str or None): Optional subject identifier, e.g. "GT"

    Raises:
        ValueError: If `string` does not contain a valid arXiv ID.
    """

    def __init__(self, string, search=True):
        parse_mtd = 'match'
        if search:
            parse_mtd = 'search'
        match_new = getattr(RX_ID_NEW, parse_mtd)(str(string))
        match_old = getattr(RX_ID_OLD, parse_mtd)(str(string))
        if match_new:
            self._style = 'new'
            self.date = match_new.group('date')
            self.year = 2000 + int(self.date[0:2])
            self.month = int(self.date[2:4])
            self.num = int(match_new.group('number'))
            # arXiv switched from 4-digit to 5-digit article numbers with
            # the January 2015 issue (1501.xxxxx), cf.
            # https://arxiv.org/help/arxiv_identifier. Submissions from
            # 2007-2014 canonically use 4 digits.
            if self.year < 2015:
                self.number = "%04d" % self.num
            else:
                self.number = "%05d" % self.num
            try:
                # The 'version' group always matches (possibly the empty
                # string); strip the leading "v".
                self.version = int(match_new.group('version')[1:])
            except (TypeError, ValueError):
                self.version = None
            self.category = None
            self.archive = None
            self.subject = None
        elif match_old:
            self._style = 'old'
            self.date = match_old.group('date')
            # Old-style IDs span 1991-2007; a leading '0' in YY means 200x.
            if self.date[0] == '0':
                self.year = 2000 + int(self.date[0:2])
            else:
                self.year = 1900 + int(self.date[0:2])
            self.month = int(self.date[2:4])
            self.num = int(match_old.group('number'))
            self.number = "%03d" % self.num
            try:
                self.version = int(match_old.group('version')[1:])
            except (TypeError, ValueError):
                self.version = None
            self.category = match_old.group('cat')
            self.archive = self.category.split(".")[0]
            self.subject = None
            if "." in self.category:
                self.subject = self.category.split(".")[-1]
        else:
            raise ValueError("Invalid arXiv ID: %r" % string)

    def __str__(self):
        """Return the canonical string form of the ID."""
        v = ""
        if self.version is not None:
            v = "v%d" % self.version
        if self._style == 'new':
            return "%s.%s%s" % (self.date, self.number, v)
        else:
            return "%s/%s%s%s" % (self.category, self.date, self.number, v)

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, str(self))

    def __eq__(self, other):
        """Check whether both IDs canonically refer to the same submission.

        Strings are coerced to :class:`ArxivID` first. The version is only
        compared when both sides specify one.
        """
        if not isinstance(other, self.__class__):
            other = self.__class__(other)
        if self.version is not None and other.version is not None:
            if self.version != other.version:
                return False
        attribs = ['date', 'num', 'archive']
        # we don't compare the subject, as that seems to be non-canonical
        return all([getattr(self, a) == getattr(other, a) for a in attribs])

    def __hash__(self):
        # Hash on the same attributes that __eq__ always compares. The
        # version cannot be part of the hash: an unversioned ID compares as
        # equal to every versioned ID of the same submission, and equal
        # objects must have equal hashes.
        return hash((self.date, self.num, self.archive))

    @classmethod
    def find_all(cls, string):
        """Find arXiv IDs in the given `string`.

        The IDs are returned as a list in the order that they appear in
        `string`, with duplicates removed.
        """
        if sys.version_info < (3, 7, 0):
            logger = logging.getLogger(__name__)
            logger.warning(
                "IDs may not be sorted properly. Run on Python >=3.7."
            )
        # Counter keys preserve insertion order (Python >= 3.7), giving us
        # ordered de-duplication of the raw string matches.
        return [
            cls(k) for k in collections.Counter(RX_ID.findall(string)).keys()
        ]
# Markdown template for a single arXiv submission. The placeholder names
# must match keys of the dict returned by `arxiv_fmt_replacements`. The
# leading newline only aligns the template in the source; `lstrip` removes
# it again.
MKD_FMT = r'''
{title_prefix}{title}
{authors_and_separated}
{arxiv_url}
{tags}
{abstract}
{links}
'''.lstrip()
def flatten(list_):
    """Flatten the given list.

    Nested lists and tuples are expanded recursively; all other elements
    are kept as-is, in their original order.
    """

    def _walk(items):
        for item in items:
            if isinstance(item, (list, tuple)):
                yield from _walk(item)
            else:
                yield item

    return list(_walk(list_))
def make_filename(string, allow_unicode=False, slugify=False):
    """Turn `string` into a "safe" filename.

    Slashes ("/") become hyphens ("-"). Unless `allow_unicode` is True, the
    text is transliterated to ASCII (accents are stripped, any remaining
    non-ASCII characters are dropped). Characters outside a conservative
    whitelist are removed and surrounding whitespace is stripped. With
    ``slugify=True`` the result is additionally lowercased, most punctuation
    is dropped, and runs of whitespace/hyphens collapse into single hyphens.
    """
    text = str(string)
    if allow_unicode:
        # Canonical composition, keeping the full Unicode range.
        text = unicodedata.normalize('NFKC', text)
    else:
        # Decompose accented characters, then drop everything non-ASCII.
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode('ascii')
    text = text.replace("/", "-")
    text = re.sub(r'[^\w\s()\[\],."\'?+-]', '', text).strip()
    if not slugify:
        return text
    text = re.sub(r'[()\[\],"\'?+]', '', text.lower())
    return re.sub(r'[-\s]+', '-', text)
def _sanitize_single_line(string): | |
string = string.replace("\n", " ") | |
string = re.sub(r'\s{2,}', ' ', string) | |
return string | |
def _sanitize_paragraphs(string):
    """Remove unnecessary line breaks.

    Single newlines that hard-wrap a paragraph are replaced by spaces, while
    newlines adjacent to other whitespace -- in particular the blank lines
    separating paragraphs -- are left untouched.
    """
    # Matches only "soft" line breaks: a lone newline strictly inside the
    # string with non-whitespace on both sides.
    rx = re.compile(
        r'''
        (?<!\A)  # not at the beginning of string
        (?<!\n|\s)  # not preceded by another newline or whitespace
        \n
        (?!\n|\s|\Z)  # not followed by another newline, whitespace, or
        # end of string
        ''',
        re.X,
    )
    # Splitting at the soft breaks and re-joining with spaces unwraps the
    # paragraphs without touching paragraph boundaries.
    string = " ".join(rx.split(string))
    return string
def _tags(query_result):
    """Return a list of tags based on the tags in the arXiv `query_result`."""
    # Split dotted category terms (e.g. "cond-mat.str-el") into their
    # components, always include "arxiv", and emit a sorted, de-duplicated
    # list of hashtags.
    terms = ['arxiv']
    for tag in query_result.tags:
        terms.append(tag.term.split("."))
    return ["#%s" % term for term in sorted(set(flatten(terms)))]
def _links(query_result): | |
"""Return a list of markdown-formatted links for the `query_result`. | |
This includes only titled links, which will usually be the PDF and the DOI | |
link to any published version of the article. | |
""" | |
return [ | |
"[%s](%s)" % (link.title, link.href) | |
for link in query_result.links | |
if 'title' in link | |
] | |
def arxiv_fmt_replacements(query_id, query_result, header_level=2):
    """Return replacements for formatting a string with arXiv query results.

    The result is a dictionary with replacement to be used in a string's
    `format` method.

    The resulting dict contains the following keys:

    * `title_prefix`: Markdown title prefix for a header of level
      `header_level`, e.g. "## " for ``header_level=2``.
    * `arxiv_comment`: The comment field of the arXiv submission; typically
      the number of pages.
    * `arxiv_url`: The URL of the abstract on arxiv.org
    * `doi`: The DOI of the paper if available, or None
    * `journal_reference`: The Journal reference of the paper if available,
      or None
    * `pdf_url`: The URL for the PDF file on arxiv.org
    * `published`: The date on which the arXiv version was originally
      published
    * `abstract`: The abstract text
    * `title`: The title of the submission
    * `updated`: The date on which the arXiv version was last updated
    * `authors`: List of author names
    * `authors_and_separated`: A string containing all authors separated by
      "and"
    * `authors_comma_separated`: A string containing all authors separated
      by commas
    * `tags`: A string containing a list of tags separated by spaces,
      including "arxiv" and tags based on the arXiv submission categories
    * `links`: A string containing markdown-formatted links to the PDF and
      the published version of the article, if available.
    * `id`: The canonical arXiv ID, as reported by the arXiv API
    * `query_id`: The arXiv ID as searched for (value of the `query_id`
      argument)
    """
    # Fields copied verbatim from the API response record; missing fields
    # default to None. NOTE(review): assumes `query_result` supports a
    # dict-like `.get`, as the feedparser-based records returned by the
    # `arxiv` package do -- confirm when upgrading that dependency.
    keys = [
        'arxiv_comment',
        'arxiv_url',
        'doi',
        'journal_reference',
        'pdf_url',
        'published',
        'updated',
        'authors',
    ]
    replacements = {k: query_result.get(k, None) for k in keys}
    replacements.update(
        dict(
            title_prefix="#" * header_level + " ",  # e.g. "## " for level 2
            title=_sanitize_single_line(query_result.title),
            # Protect a literal " and " inside a single author's name so it
            # cannot be confused with the separator between two authors.
            # (The braces survive: `str.format` substitutes placeholders in
            # the template, not in the replacement values.)
            authors_and_separated=" and ".join(
                [
                    name.replace(" and ", " {and} ")
                    for name in query_result.authors
                ]
            ),
            authors_comma_separated=", ".join(query_result.authors),
            abstract=_sanitize_paragraphs(query_result.summary),
            tags=" ".join(_tags(query_result)),
            links=" ".join(_links(query_result)),
            query_id=query_id,
            # Re-parse the ID from the API response to obtain its canonical
            # form (may differ from `query_id`, e.g. in its version).
            id=str(ArxivID(query_result.id)),
        )
    )
    return replacements
def _arxiv_query(id_list):
    """Wrapper around `arxiv.query` to deal with missing IDs.

    Returns a tuple ``(ids, responses)`` restricted to those entries of
    `id_list` for which the arXiv API returned a record, preserving the
    order of `id_list`.
    """
    responses = arxiv.query(id_list=[str(id) for id in id_list])
    if len(responses) == len(id_list):  # all IDs found
        for (id, response) in zip(id_list, responses):
            # I can't find anywhere that the arXiv API *guarantees" that the
            # results are returned in the same order as the query IDs, so we'll
            # double check
            assert ArxivID(id) == ArxivID(response.id)
        return id_list, responses
    else:  # not all IDs found
        # Some queried IDs have no response. Re-associate each query ID with
        # its response by cycling through the responses at most
        # len(responses) times; IDs with no matching response are dropped.
        response_cycle = itertools.cycle(responses)
        responses_filtered = []
        id_list_filtered = []
        for id in id_list:
            attempts = len(responses)
            while attempts > 0:
                response = next(response_cycle)
                if ArxivID(id) == ArxivID(response.id):
                    id_list_filtered.append(id)
                    responses_filtered.append(response)
                    break
                attempts -= 1
            else:
                # while/else: the loop exhausted without `break`, i.e. no
                # response matched this ID.
                logger = logging.getLogger(__name__)
                logger.error("Cannot find ID %s", id)
        return id_list_filtered, responses_filtered
def arxiv_to_markdown(
    *arxiv_ids,
    indent_level=0,
    block_spacing=2,
    header_level=2,
    write_to_files=None,
    slugify=True,
    append=True,
    obsidian=False,
):
    """Given a list of ArxivIDs, generate and return markdown summaries.

    Each ID will be rendered into a "block". Each block will start with the
    title of the paper, formatted as a markdown header with the given
    `header_level`. Multiple blocks that are written to the same output file
    or returned as a result are separated by `block_spacing` number of empty
    lines.

    If `write_to_files` is given, the block for each ID is written to a
    filename based on `write_to_files`. The filename is obtained by
    formatting `write_to_files` with replacements, e.g. "{id}" is replaced
    by the ID and "{title}" is replaced by the manuscript title, and
    "{authors_and_separated}" is replaced by a list of the author names.
    Other fields are those available in the arXiv API response record. The
    filename is then sanitized through :func:`make_filename`. If `slugify`
    is True, non-ASCII letters are dropped from the filename, spaces are
    replaced by hyphens, and the filename is converted to lowercase. A
    summary bullet list in markdown format linking to the written files is
    generated and returned as a multi-line string result. If `obsidian` is
    True, a special syntax appropriate for the Obsidian note-taking software
    will be used.

    If `write_to_files` is None, the blocks for multiple IDs will be
    concatenated (with `block_spacing`) and returned as a multi-line string
    result.

    In either case, the returned result will be indented with 4 spaces per
    `indent_level`.
    """
    logger = logging.getLogger(__name__)
    # Drop any IDs that the arXiv API does not know about (logged as errors
    # inside _arxiv_query).
    arxiv_ids, responses = _arxiv_query(id_list=arxiv_ids)
    prefix = ""
    if write_to_files is None:
        # Only directly returned markdown is indented; blocks written to
        # files stay flush-left (the *summary* is indented further below).
        prefix = " " * indent_level
    separator = "\n" * block_spacing
    blocks = []
    block_replacements = []
    for (query_id, response) in zip(arxiv_ids, responses):
        replacements = arxiv_fmt_replacements(
            query_id, response, header_level=header_level
        )
        block_replacements.append(replacements)
        blocks.append(indent(MKD_FMT.format(**replacements), prefix))
    outfiles = set()  # files already written to during this call
    if write_to_files is None:
        return separator.join(blocks)
    else:  # write blocks to file, generate and return summary
        logger.debug("write_to_files = %s", write_to_files)
        summary_items = []
        tuples = tuple(zip(arxiv_ids, responses, block_replacements, blocks))
        for (id, response, replacements, block) in tuples:
            outfile = Path(
                make_filename(
                    write_to_files.format(**replacements),
                    allow_unicode=(not slugify),
                    slugify=slugify,
                )
            )
            logger.debug("Writing block for %s to %s", id, outfile)
            if append and outfile.is_file():
                # Pre-existing file in append mode: treat it as "already
                # written to" so a separator is inserted before our block.
                outfiles.add(outfile)
            elif outfile not in outfiles:
                # Overwrite mode: remove a pre-existing file once, before
                # the first block targets it. NOTE: `missing_ok` requires
                # Python >= 3.8.
                outfile.unlink(missing_ok=True)
            with outfile.open(mode="a", encoding="utf8") as out_fh:
                if outfile in outfiles:
                    out_fh.write(separator)
                out_fh.write(block)
            outfiles.add(outfile)
            replacements['outfile'] = outfile
            replacements['outfile_stem'] = outfile.stem
            if obsidian:
                # Obsidian wiki-link: [[file-stem|displayed title]]
                summary_items.append(
                    "* [[{outfile_stem}|{title}]]".format(**replacements)
                )
            else:
                summary_items.append(
                    "* [{title}]({outfile})".format(**replacements)
                )
        prefix = " " * indent_level
        return indent("\n".join(summary_items), prefix) + "\n"
# Expected result of ``arxiv_to_markdown('2011.12310', '1902.11284')`` with
# default settings, used by the tests below. The two blocks are separated by
# ``block_spacing=2`` blank lines. NOTE(review): this is a snapshot of live
# arXiv API data and will break if those records are ever updated.
TEST_OUTPUT = r"""
## Prediction of Toric Code Topological Order from Rydberg Blockade
Ruben Verresen and Mikhail D. Lukin and Ashvin Vishwanath
http://arxiv.org/abs/2011.12310v1
#arxiv #atom-ph #cond-mat #physics #quant-gas #quant-ph #str-el
The physical realization of $\mathbb Z_2$ topological order as encountered in the paradigmatic toric code has proven to be an elusive goal. We show that this phase of matter can be created in a two-dimensional array of strongly interacting Rydberg atoms. Our proposal makes use of atoms localized on the sites of a ruby lattice, coupled via a Rydberg blockade mechanism. First, we show that the blockade model effectively realizes a monomer-dimer model on the kagome lattice with a single-site kinetic term, and we obtain its phase diagram using the numerical density matrix renormalization group method. We find a topological quantum liquid (TQL) as evidenced by multiple measures including (i) a continuous transition between two featureless phases, (ii) a topological entanglement entropy of $\ln 2$ as measured in various geometries, (iii) degenerate topological ground states and (iv) the expected modular matrix from ground state overlap. Next, we show that the TQL can persist upon including realistic, algebraically-decaying van der Waals interactions $V(r) \sim 1/r^6$. Moreover, we can directly access the topological loop operators of this model, which can be measured experimentally using a dynamic protocol, providing a "smoking gun" experimental signature of the TQL phase. Finally, we show how to trap an emergent anyon and realize different topological boundary conditions, and we discuss the implications for exploring fault-tolerant quantum memories.
[pdf](http://arxiv.org/pdf/2011.12310v1)


## Krotov: A Python implementation of Krotov's method for quantum optimal control
Michael H. Goerz and Daniel Basilewitsch and Fernando Gago-Encinas and Matthias G. Krauss and Karl P. Horn and Daniel M. Reich and Christiane P. Koch
http://arxiv.org/abs/1902.11284v6
#arxiv #quant-ph
We present a new open-source Python package, krotov, implementing the quantum optimal control method of that name. It allows to determine time-dependent external fields for a wide range of quantum control problems, including state-to-state transfer, quantum gate implementation and optimization towards an arbitrary perfect entangler. Krotov's method compares to other gradient-based optimization methods such as gradient-ascent and guarantees monotonic convergence for approximately time-continuous control fields. The user-friendly interface allows for combination with other Python packages, and thus high-level customization. The package is being developed at https://github.com/qucontrol/krotov
[doi](http://dx.doi.org/10.21468/SciPostPhys.7.6.080) [pdf](http://arxiv.org/pdf/1902.11284v6)
""".lstrip()
def test_arxiv_to_markdown():
    """Test of `arxiv_to_markdown` function.

    NOTE(review): queries the live arXiv API, so this test needs network
    access and depends on the queried records staying unchanged.
    """
    md = arxiv_to_markdown('2011.12310', '1902.11284')
    assert md[-1] == "\n"
    assert md == TEST_OUTPUT
def test_arxiv_id_regex():
    """Test the regexes for extracting arXiv identifiers."""
    # Local names chosen so as not to shadow the builtins `id` and `input`.
    for id_str in ['1612.00001', '1203.0023v2']:
        assert re.match(RE_ID_NEW, id_str)
        assert RX_ID_NEW.match(id_str)
    for id_str in ['hep-th/11030234', 'cs/0112345v2', 'cs.AI/0112345v2']:
        assert re.match(RE_ID_OLD, id_str)
        assert RX_ID_OLD.match(id_str)
    # IDs embedded in surrounding text (URLs, "arxiv:" prefixes) must be
    # found in order of appearance.
    text = (
        "http://arxiv.org/abs/1902.11284v6 2011.12310 arxiv:math.GT/0309136"
    )
    arxiv_ids = RX_ID.findall(text)
    assert arxiv_ids == ['1902.11284v6', '2011.12310', 'math.GT/0309136']
def test_arxiv_id_extraction():
    """Test that duplicate IDs are excluded in `ArxivID.find_all`."""
    # Local name chosen so as not to shadow the builtin `input`.
    text = "http://arxiv.org/abs/1902.11284v6,1203.0023v2,1902.11284v6 1203.0023v2 1612.00001 cs/0112345v2"
    assert ArxivID.find_all(text) == [
        ArxivID("1902.11284v6"),
        ArxivID("1203.0023v2"),
        ArxivID("1612.00001"),
        ArxivID("cs/0112345v2"),
    ]
def test_write_to_files(caplog):
    """Test the ``--write-to-files`` option of the command line interface.

    NOTE(review): queries the live arXiv API (network access required). The
    expected summary lines are indented by 8 spaces because of
    ``--indent-level 2``.
    """
    runner = CliRunner(mix_stderr=False)
    # Scenario 1: all blocks appended to a single fixed file; STDOUT is a
    # bullet list linking to that file.
    with runner.isolated_filesystem():
        result = runner.invoke(
            main,
            [
                '--debug',
                '--write-to-files',
                'abstracts.md',
                '--indent-level',
                '2',
            ],
            input='2011.12310v1 1902.11284v6',
        )
        assert result.exit_code == 0
        files = list(Path(".").glob("*.*"))
        assert Path("abstracts.md") in files
        abstracts_md = Path("abstracts.md").read_text()
        expected = """\
        * [Prediction of Toric Code Topological Order from Rydberg Blockade](abstracts.md)
        * [Krotov: A Python implementation of Krotov's method for quantum optimal control](abstracts.md)\n"""
        assert result.output == expected
        assert abstracts_md == TEST_OUTPUT
    # Scenario 2: one file per ID, named after the queried ID; an ID that
    # does not exist must be logged as an error, not abort the run.
    with runner.isolated_filesystem():
        result = runner.invoke(
            main,
            [
                '--debug',
                '--write-to-files',
                '{query_id}.md',
                '--indent-level',
                '2',
            ],
            input='2011.12310v1 1902.11284v6 cs/0112345v2 math.GT/0309136',
            # note: cs/0112345v2 does not exist
        )
        files = list(Path(".").glob("*.*"))
        assert result.exit_code == 0
        error_msgs = [
            x.message
            for x in caplog.get_records(when="call")
            if x.levelno == logging.ERROR
        ]
        assert result.exit_code == 0  # (redundant with the check above)
        assert len(error_msgs) == 1
        assert error_msgs[0] == "Cannot find ID cs/0112345v2"
        assert Path('2011.12310v1.md') in files
        assert Path('1902.11284v6.md') in files
        # the "/" of old-style IDs becomes "-" in the filename
        assert Path('math.GT-0309136.md') in files
        expected = """\
        * [Prediction of Toric Code Topological Order from Rydberg Blockade](2011.12310v1.md)
        * [Krotov: A Python implementation of Krotov's method for quantum optimal control](1902.11284v6.md)
        * [Regular points in affine Springer fibers](math.GT-0309136.md)\n"""
        assert result.output == expected
    # Scenario 3: like scenario 2, but the summary uses Obsidian wiki-links.
    with runner.isolated_filesystem():
        result = runner.invoke(
            main,
            [
                '--debug',
                '--write-to-files',
                '{query_id}.md',
                '--indent-level',
                '2',
                '--obsidian',
            ],
            input='2011.12310v1 1902.11284v6 cs/0112345v2 math.GT/0309136',
            # note: cs/0112345v2 does not exist
        )
        files = list(Path(".").glob("*.*"))
        assert result.exit_code == 0
        expected = """\
        * [[2011.12310v1|Prediction of Toric Code Topological Order from Rydberg Blockade]]
        * [[1902.11284v6|Krotov: A Python implementation of Krotov's method for quantum optimal control]]
        * [[math.GT-0309136|Regular points in affine Springer fibers]]\n"""
        assert result.output == expected
    # Scenario 4: filenames generated from response metadata (author list
    # and title).
    with runner.isolated_filesystem():
        result = runner.invoke(
            main,
            [
                '--debug',
                '--write-to-files',
                '{authors_comma_separated} - {title}.md',
                '--indent-level',
                '2',
            ],
            input='2011.12310v1 1902.11284v6 cs/0112345v2 math.GT/0309136',
            # note: cs/0112345v2 does not exist
        )
        files = list(Path(".").glob("*.*"))
        assert result.exit_code == 0
        expected = """\
        * [Prediction of Toric Code Topological Order from Rydberg Blockade](Ruben Verresen, Mikhail D. Lukin, Ashvin Vishwanath - Prediction of Toric Code Topological Order from Rydberg Blockade.md)
        * [Krotov: A Python implementation of Krotov's method for quantum optimal control](Michael H. Goerz, Daniel Basilewitsch, Fernando Gago-Encinas, Matthias G. Krauss, Karl P. Horn, Daniel M. Reich, Christiane P. Koch - Krotov A Python implementation of Krotov's method for quantum optimal control.md)
        * [Regular points in affine Springer fibers](Mark Goresky, Robert Kottwitz, Robert MacPherson - Regular points in affine Springer fibers.md)\n"""
        assert result.output == expected
def has_stdin_data(timeout=0.1):
    """Return True if data is detected on stdin with `timeout` (in seconds)."""
    try:
        readable, _, _ = select.select([sys.stdin], [], [], timeout)
    except OSError:
        # stdin cannot be select()ed (e.g. not a real file descriptor);
        # conservatively report that there is data.
        return True
    return bool(readable)
@click.command()
@click.help_option('--help', '-h')
@click.version_option(version=__version__)
@click.option('--debug', is_flag=True, help='enable debug logging')
@click.option(
    '--indent-level',
    type=click.IntRange(0, None),
    default=0,
    help=_sanitize_paragraphs(
        # typo fix: "indedation" -> "indentation"
        """The indentation level. The STDOUT output will be indented by four
        spaces per indentation level."""
    ),
    show_default=True,
)
@click.option(
    '--block-spacing',
    type=click.IntRange(0, None),
    default=2,
    help=_sanitize_paragraphs(
        """The number of blank lines between markdown blocks for different
        arXiv IDs."""
    ),
    show_default=True,
)
@click.option(
    '--header-level',
    type=click.IntRange(0, 5),
    default=2,
    help=_sanitize_paragraphs(
        """The level of the markdown heading to be used for the title of each
        block. That is, the number of '#' symbols preceding the title."""
    ),
    show_default=True,
)
@click.option(
    '--write-to-files',
    '-o',
    type=click.Path(dir_okay=False),
    help=_sanitize_paragraphs(
        """A pattern for filenames to which to append the markdown block for
        each arXiv ID. The pattern may contain any of the fields returned by
        the arxiv_fmt_replacements function (see source code). For example,
        "{query_id}.md" will write each block to a file named after its arXiv
        ID. If given in combination with --no-append, the output file is
        overwritten."""
    ),
)
@click.option(
    '--slugify/--no-slugify',
    default=False,
    help=_sanitize_paragraphs(
        """If given in combination with --write-to-files, indicates whether
        filenames should be "slugified" (replace spaces with hyphens, allow
        only select ASCII characters, convert to lowercase)."""
    ),
)
@click.option(
    '--append/--no-append',
    default=True,
    help=_sanitize_paragraphs(
        """If given in combination with --write-to-files, whether to append to
        the output file (default) or whether to overwrite it."""
    ),
)
@click.option(
    '--obsidian/--no-obsidian',
    default=False,
    help=_sanitize_paragraphs(
        """If given in combination with --write-to-files, whether to use
        Obsidian syntax for linking to the generated files or standard markdown
        syntax."""
    ),
)
@click.argument('ids', nargs=-1)
def main(
    debug,
    indent_level,
    block_spacing,
    header_level,
    write_to_files,
    slugify,
    append,
    obsidian,
    ids,
):
    """Create markdown summaries of given arXiv identifiers.

    The arXiv identifiers are extracted from the positional arguments as
    well as from any text piped into the program on STDIN. If "-" is given
    as the only positional argument, the program will wait for input from
    STDIN.

    Example arXiv IDs are 2011.12310v1, 1902.11284 or (prior to 2007)
    math.GT/0309136. The script will search for anything that has the
    correct format, and ignore the remaining input. Thus, you may e.g. pipe
    a list of arXiv URLs into the script as input.

    By default, the script will write to STDOUT a block of markdown for each
    arXiv ID, starting with the manuscript title as a markdown header (see
    --header-level), followed by author information, the abstract, and
    relevant links. Multiple blocks are separated by blank lines (see
    --block-spacing).

    If the `--write-to-files` option is given, the markdown blocks are
    instead written to output files, and a markdown-formatted bullet list
    with the manuscript titles and links to the written files is written to
    STDOUT.
    """
    logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)
        logger.debug("Enabled debug output")
    # `text` collects everything in which to search for arXiv IDs (renamed
    # from `input`, which shadowed the builtin).
    if has_stdin_data() or ids == ("-",):
        logger.debug("Reading from pipe")
        stdin_text = click.get_text_stream('stdin')
        text = stdin_text.read().replace("\n", " ")
    else:
        logger.debug("Not connected to pipe")
        text = ""
    text += " " + " ".join(ids)
    logger.debug("INPUT: %s", text)
    ids = ArxivID.find_all(text)
    logger.debug("FOUND IDS: %s", ids)
    markdown = arxiv_to_markdown(
        *ids,
        indent_level=indent_level,
        block_spacing=block_spacing,
        header_level=header_level,
        write_to_files=write_to_files,
        slugify=slugify,
        append=append,
        obsidian=obsidian,
    )
    # `markdown` already ends in a newline where appropriate.
    click.echo(markdown, nl=False)
if __name__ == "__main__":
    main()  # click parses sys.argv and supplies all parameters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment