Skip to content

Instantly share code, notes, and snippets.

@goerz
Created December 5, 2020 18:40
Show Gist options
  • Save goerz/dd4e50862348949f8c05fce4596eb13b to your computer and use it in GitHub Desktop.
Python script for generating markdown/Obsidian summaries of arXiv files
#!/usr/bin/env python
"""Create markdown summaries from arXiv identifiers.
Test with::
pytest arxiv_summarize.py -s -x --doctest-modules
See ``--help`` for usage summary::
python arxiv_summarize.py --help
"""
# MIT License
#
# Copyright (c) 2020 Michael Goerz
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import collections
import itertools
import logging
import os
import re
import select
import sys
import unicodedata
from pathlib import Path
from textwrap import indent
import arxiv # https://github.com/lukasschwab/arxiv.py
import click
from click.testing import CliRunner
__version__ = '0.1.0'  # script version, reported via the click --version option
# fmt: off
# These are all the primary categories present in the OAI ArXiv metadata
CATEGORIES = [
    "acc-phys", "adap-org", "alg-geom", "ao-sci", "astro-ph", "atom-ph",
    "bayes-an", "chao-dyn", "chem-ph", "cmp-lg", "comp-gas", "cond-mat", "cs",
    "dg-ga", "funct-an", "gr-qc", "hep-ex", "hep-lat", "hep-ph", "hep-th",
    "math", "math-ph", "mtrl-th", "nlin", "nucl-ex", "nucl-th", "patt-sol",
    "physics", "plasm-ph", "q-alg", "q-bio", "quant-ph", "solv-int",
    "supr-con", "eess", "econ", "q-fin", "stat"
]

# All subcategories with more than 2 capital letters (not SG, SI, SP, etc)
SUB_CATEGORIES = [
    'acc-ph', 'ao-ph', 'app-ph', 'atm-clus', 'atom-ph', 'bio-ph', 'chem-ph',
    'class-ph', 'comp-ph', 'data-an', 'dis-nn', 'ed-ph', 'flu-dyn', 'gen-ph',
    'geo-ph', 'hist-ph', 'ins-det', 'med-ph', 'mes-hall', 'mtrl-sci',
    'optics', 'other', 'plasm-ph', 'pop-ph', 'quant-gas', 'soc-ph', 'soft',
    'space-ph', 'stat-mech', 'str-el', 'supr-con'
]
# fmt: on

# Regexes adapted from
# https://github.com/mattbierbaum/arxiv-public-datasets/blob/master/arxiv_public_data/regex_arxiv.py
RE_CATEGORIES = r'(?:{})(?:(?:[.][A-Z]{{2}})|(?:{}))?'.format(
    r'|'.join(CATEGORIES), r'|'.join(SUB_CATEGORIES)
)
RE_DATE = r'(?:\d{2}[01]\d)'  # YYMM
RE_VERSION = r'(?:v[1-9]\d*)?'
RE_NUM_NEW = r'(?:\d{4,5})'
RE_NUM_OLD = r'(?:\d{3})'

# matches: 1612.00001 1203.0023v2
RE_ID_NEW = r'(?:{date}\.{number}{version})'.format(
    date=RE_DATE, number=RE_NUM_NEW, version=RE_VERSION
)
RX_ID_NEW = re.compile(
    r'(?P<date>{date})\.(?P<number>{number})(?P<version>{version})'.format(
        date=RE_DATE, number=RE_NUM_NEW, version=RE_VERSION
    )
)

# matches: hep-th/11030234 cs/0112345v2 cs.AI/0112345v2
RE_ID_OLD = r'(?:{cat}/{date}{number}{version})'.format(
    cat=RE_CATEGORIES, date=RE_DATE, number=RE_NUM_OLD, version=RE_VERSION
)
RX_ID_OLD = re.compile(
    r'(?P<cat>{cat})/(?P<date>{date})(?P<number>{number})(?P<version>{version})'.format(
        cat=RE_CATEGORIES, date=RE_DATE, number=RE_NUM_OLD, version=RE_VERSION
    )
)

# Matches either an old-style or a new-style ID (non-capturing, so that
# `findall` returns the full matched ID strings).
RX_ID = re.compile(r'(?:%s|%s)' % (RE_ID_OLD, RE_ID_NEW))


class ArxivID:
    """An arXiv ID.

    This wraps around strings that are valid arXiv IDs in the format described
    in https://arxiv.org/help/arxiv_identifier. Both "old-style" IDs
    (submissions up to March 2007), e.g.::

        >>> id = ArxivID("math.GT/0309136")
        >>> id.date
        '0309'
        >>> id.year
        2003
        >>> id.month
        9
        >>> id.num
        136
        >>> id.number
        '136'
        >>> id.version  # None
        >>> id.category
        'math.GT'
        >>> id.archive
        'math'
        >>> id.subject
        'GT'

    and "new-style" IDs (submissions after March 2007), e.g.::

        >>> id = ArxivID("1501.00001v1")
        >>> id.date
        '1501'
        >>> id.year
        2015
        >>> id.month
        1
        >>> id.num
        1
        >>> id.number
        '00001'
        >>> id.version
        1
        >>> id.category  # None
        >>> id.archive  # None
        >>> id.subject  # None

    are supported. There is no guarantee that an instantiated ArxivID actually
    exists, only that it has the correct format.

    The IDs are converted to a canonical format, e.g. by padding the article
    number with the appropriate number of zeros (4 digits before 2015, 5
    digits starting from 2015)::

        >>> ArxivID("0706.00001v2")  # extra '0'
        ArxivID('0706.0001v2')

    An ArxivID may be used in a string context::

        >>> print("arXiv:%s" % ArxivID("1501.00001v1"))
        arXiv:1501.00001v1

    Two IDs compare as equal if they canonically refer to the same submission.
    For old-style IDs, the subject-identifier is not taken into account::

        >>> ArxivID("math.GT/0309136") == "math/0309136"
        True

    Also, an arXiv ID that does not contain an explicit version identifier
    compares as equal to all versions of the same submission.

        >>> ArxivID('1501.00001') == ArxivID('1501.00001v1')
        True
        >>> ArxivID('1501.00001') == ArxivID('1501.00001v2')
        True

    Args:
        string (str): String from which to extract the ID
        search (bool): If True, search for the first valid arXiv ID in
            `string`. If False `string` must be a valid arXiv ID by itself.

    Attributes:
        date (str): The year and month of the arXiv submission in the format
            "YYMM"
        year (int): The full 4-digit integer year of the arXiv submission
        month (int): The month of the arXiv submission
        num (int): The number of the arXiv submission, as an integer
        number (str): The number of the arXiv submission as a string with the
            canonical zero-padding (3 digits up to March 2007, 4 digits up to
            December 2014, 5 digits for anything newer)
        version (int or None): The version number of the submission. A value
            of None refers to the "latest" version.
        category (str or None): The full "category", e.g. "math.GT". None for
            new-style IDs (since March 2007).
        archive (str or None): The archive name, e.g. "math". May be identical
            to the `category` when the `subject` is redundant or optional.
            None for new-style IDs (since March 2007).
        subject (str or None): Optional subject identifier, e.g. "GT"

    Raises:
        ValueError: If `string` does not contain a valid arXiv ID.
    """

    def __init__(self, string, search=True):
        parse_mtd = 'match'
        if search:
            parse_mtd = 'search'
        match_new = getattr(RX_ID_NEW, parse_mtd)(str(string))
        match_old = getattr(RX_ID_OLD, parse_mtd)(str(string))
        if match_new:
            self._style = 'new'
            self.date = match_new.group('date')
            # new-style IDs only exist from April 2007, so the century is
            # always 2000
            self.year = 2000 + int(self.date[0:2])
            self.month = int(self.date[2:4])
            self.num = int(match_new.group('number'))
            # BUGFIX: article numbers have 4 digits through December 2014 and
            # 5 digits starting January 2015 (see the class docstring and
            # https://arxiv.org/help/arxiv_identifier). The previous check
            # (`year <= 2012`) wrongly padded 2013/2014 IDs to 5 digits.
            if self.year < 2015:
                self.number = "%04d" % self.num
            else:
                self.number = "%05d" % self.num
            try:
                # the version group is '' when no version is present, so
                # int('') raises ValueError and we fall back to None
                self.version = int(match_new.group('version')[1:])
            except (TypeError, ValueError):
                self.version = None
            self.category = None
            self.archive = None
            self.subject = None
        elif match_old:
            self._style = 'old'
            self.date = match_old.group('date')
            # old-style IDs span 1991-2007, so the century depends on the
            # leading digit of the YYMM date
            if self.date[0] == '0':
                self.year = 2000 + int(self.date[0:2])
            else:
                self.year = 1900 + int(self.date[0:2])
            self.month = int(self.date[2:4])
            self.num = int(match_old.group('number'))
            self.number = "%03d" % self.num
            try:
                self.version = int(match_old.group('version')[1:])
            except (TypeError, ValueError):
                self.version = None
            self.category = match_old.group('cat')
            self.archive = self.category.split(".")[0]
            self.subject = None
            if "." in self.category:
                self.subject = self.category.split(".")[-1]
        else:
            raise ValueError("Invalid arXiv ID: %r" % string)

    def __str__(self):
        v = ""
        if self.version is not None:
            v = "v%d" % self.version
        if self._style == 'new':
            return "%s.%s%s" % (self.date, self.number, v)
        else:
            return "%s/%s%s%s" % (self.category, self.date, self.number, v)

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, str(self))

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            try:
                other = self.__class__(other)
            except ValueError:
                # `other` contains no arXiv ID at all; defer to the default
                # comparison (resulting in False) instead of raising from `==`
                return NotImplemented
        if self.version is not None and other.version is not None:
            if self.version != other.version:
                return False
        attribs = ['date', 'num', 'archive']
        # we don't compare the subject, as that seems to be non-canonical
        return all([getattr(self, a) == getattr(other, a) for a in attribs])

    def __hash__(self):
        # Defining __eq__ alone would make instances unhashable. This hash is
        # consistent with __eq__: equality requires matching `date`, `num`,
        # and `archive` (version is ignored when absent on either side), so
        # equal IDs always share this hash value.
        return hash((self.date, self.num, self.archive))

    @classmethod
    def find_all(cls, string):
        """Find arXiv IDs in the given `string`.

        The IDs are returned as a list in the order that they appear in
        `string`, with duplicates removed.
        """
        if sys.version_info < (3, 7, 0):
            # Counter (dict) key order is only guaranteed on Python >= 3.7
            logger = logging.getLogger(__name__)
            logger.warning(
                "IDs may not be sorted properly. Run on Python >=3.7."
            )
        return [
            cls(k) for k in collections.Counter(RX_ID.findall(string)).keys()
        ]
# Template for one markdown block per arXiv ID; the placeholders are filled
# from the dict returned by `arxiv_fmt_replacements`. The leading newline of
# the literal is removed by `lstrip`.
# NOTE(review): blank lines between the fields may have been lost in this
# paste -- confirm against the expected output in TEST_OUTPUT.
MKD_FMT = r'''
{title_prefix}{title}
{authors_and_separated}
{arxiv_url}
{tags}
{abstract}
{links}
'''.lstrip()
def flatten(list_):
    """Return a flat list with all elements of the (nested) input list.

    Nested lists and tuples are expanded in depth-first order; all other
    elements (including strings) are kept as-is.
    """
    flat = []
    pending = [iter(list_)]
    while pending:
        try:
            item = next(pending[-1])
        except StopIteration:
            pending.pop()
            continue
        if isinstance(item, (list, tuple)):
            pending.append(iter(item))
        else:
            flat.append(item)
    return flat
def make_filename(string, allow_unicode=False, slugify=False):
    """Turn `string` into a "safe" filename.

    Convert to ASCII if `allow_unicode` is False. Convert slashes ("/") to
    hyphens ("-"). Remove characters that aren't alphanumerics, underscores,
    hyphens, spaces, or periods, and strip leading and trailing whitespace.
    If `slugify` is True, also convert to lowercase, convert spaces to
    hyphens and strip out most punctuation marks.
    """
    text = str(string)
    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r'[^\w\s()\[\],."\'?+-]', '', text.replace("/", "-"))
    text = text.strip()
    if slugify:
        text = re.sub(r'[()\[\],"\'?+]', '', text.lower())
        text = re.sub(r'[-\s]+', '-', text)
    return text
def _sanitize_single_line(string):
string = string.replace("\n", " ")
string = re.sub(r'\s{2,}', ' ', string)
return string
def _sanitize_paragraphs(string):
"""Remove unnecessary line breaks."""
rx = re.compile(
r'''
(?<!\A) # not at the beginning of string
(?<!\n|\s) # not preceded by another newline or whitespace
\n
(?!\n|\s|\Z) # not followed by another newline, whitespace, or
# end of string
''',
re.X,
)
string = " ".join(rx.split(string))
return string
def _tags(query_result):
    """Return a list of tags based on the tags in the arXiv `query_result`.

    Each category term is split on "." (so e.g. "cond-mat.str-el" yields
    both "cond-mat" and "str-el"), the literal tag "arxiv" is always
    included, and every entry of the sorted, de-duplicated result is
    prefixed with "#".
    """
    category_parts = itertools.chain.from_iterable(
        tag.term.split(".") for tag in query_result.tags
    )
    return ["#%s" % term for term in sorted({'arxiv', *category_parts})]
def _links(query_result):
"""Return a list of markdown-formatted links for the `query_result`.
This includes only titled links, which will usually be the PDF and the DOI
link to any published version of the article.
"""
return [
"[%s](%s)" % (link.title, link.href)
for link in query_result.links
if 'title' in link
]
def arxiv_fmt_replacements(query_id, query_result, header_level=2):
    """Return replacements for formatting a string with arXiv query results.

    The result is a dictionary with replacements to be used in a string's
    `format` method. It contains the following keys:

    * `title_prefix`: Markdown title prefix for a header of level
      `header_level`, e.g. "## " for ``header_level=2``.
    * `arxiv_comment`: The comment field of the arXiv submission; typically
      the number of pages.
    * `arxiv_url`: The URL of the abstract on arxiv.org
    * `doi`: The DOI of the paper if available, or None
    * `journal_reference`: The Journal reference of the paper if available,
      or None
    * `pdf_url`: The URL for the PDF file on arxiv.org
    * `published`: The date on which the arXiv version was originally
      published
    * `abstract`: The abstract text
    * `title`: The title of the submission
    * `updated`: The date on which the arXiv version was last updated
    * `authors`: List of author names
    * `authors_and_separated`: A string containing all authors separated by
      "and"
    * `authors_comma_separated`: A string containing all authors separated by
      commas
    * `tags`: A string containing a list of tags separated by spaces,
      including "arxiv" and tags based on the arXiv submission categories
    * `links`: A string containing markdown-formatted links to the PDF and
      the published version of the article, if available.
    * `id`: The canonical arXiv ID, as reported by the arXiv API
    * `query_id`: The arXiv ID as searched for (value of the `query_id`
      argument)
    """
    # Fields copied directly from the API response record (None if missing)
    replacements = {
        key: query_result.get(key, None)
        for key in (
            'arxiv_comment',
            'arxiv_url',
            'doi',
            'journal_reference',
            'pdf_url',
            'published',
            'updated',
            'authors',
        )
    }
    # Escape a literal " and " within an author name so that it cannot be
    # confused with the " and " separator used between authors.
    escaped_names = [
        name.replace(" and ", " {and} ") for name in query_result.authors
    ]
    replacements['title_prefix'] = "#" * header_level + " "
    replacements['title'] = _sanitize_single_line(query_result.title)
    replacements['authors_and_separated'] = " and ".join(escaped_names)
    replacements['authors_comma_separated'] = ", ".join(query_result.authors)
    replacements['abstract'] = _sanitize_paragraphs(query_result.summary)
    replacements['tags'] = " ".join(_tags(query_result))
    replacements['links'] = " ".join(_links(query_result))
    replacements['query_id'] = query_id
    replacements['id'] = str(ArxivID(query_result.id))
    return replacements
def _arxiv_query(id_list):
    """Wrapper around `arxiv.query` to deal with missing IDs.

    Query the arXiv API for all IDs in `id_list` and return a tuple of two
    parallel lists ``(ids, responses)``. Any ID for which the API returned
    no record is dropped from the result and logged as an error.
    """
    responses = arxiv.query(id_list=[str(id) for id in id_list])
    if len(responses) == len(id_list):  # all IDs found
        for (id, response) in zip(id_list, responses):
            # I can't find anywhere that the arXiv API *guarantees* that the
            # results are returned in the same order as the query IDs, so
            # we'll double check
            assert ArxivID(id) == ArxivID(response.id)
        return id_list, responses
    else:  # not all IDs found
        # Pair up each requested ID with its response (if any) by cycling
        # through the responses at most once per ID.
        response_cycle = itertools.cycle(responses)
        responses_filtered = []
        id_list_filtered = []
        for id in id_list:
            attempts = len(responses)
            while attempts > 0:
                response = next(response_cycle)
                if ArxivID(id) == ArxivID(response.id):
                    id_list_filtered.append(id)
                    responses_filtered.append(response)
                    break
                attempts -= 1
            else:
                # while-else: no response matched this ID after a full cycle
                logger = logging.getLogger(__name__)
                logger.error("Cannot find ID %s", id)
        return id_list_filtered, responses_filtered
def arxiv_to_markdown(
    *arxiv_ids,
    indent_level=0,
    block_spacing=2,
    header_level=2,
    write_to_files=None,
    slugify=True,
    append=True,
    obsidian=False,
):
    """Given a list of ArxivIDs, generate and return markdown summaries.

    Each ID will be rendered into a "block". Each block will start with the
    title of the paper, formatted as a markdown header with the given
    `header_level`. Multiple blocks that are written to the same output file
    or returned as a result are separated by `block_spacing` number of empty
    lines.

    If `write_to_files` is given, the block for each ID is written to a
    filename based on `write_to_files`. The filename is obtained by
    formatting `write_to_files` with replacements, e.g. "{id}" is replaced by
    the ID, "{title}" is replaced by the manuscript title, and
    "{authors_and_separated}" is replaced by a list of the author names.
    Other fields are those available in the arXiv API response record. The
    filename is then sanitized through :func:`make_filename`. If `slugify` is
    True, non-ASCII letters are dropped from the filename, spaces are
    replaced by hyphens, and the filename is converted to lowercase. A
    summary bullet list in markdown format linking to the written files is
    generated and returned as a multi-line string result. If `obsidian` is
    True, a special syntax appropriate for the Obsidian note-taking software
    will be used.

    If `write_to_files` is None, the blocks for multiple IDs will be
    concatenated (with `block_spacing`) and returned as a multi-line string
    result.

    In either case, the returned result will be indented with 4 spaces per
    `indent_level`.
    """
    logger = logging.getLogger(__name__)
    # Drop any IDs that the arXiv API cannot resolve (logged as errors)
    arxiv_ids, responses = _arxiv_query(id_list=arxiv_ids)
    prefix = ""
    if write_to_files is None:
        # BUGFIX: four spaces per indentation level, as promised by this
        # docstring and by the --indent-level CLI help (the pasted source
        # showed a single space, presumably a whitespace-mangling artifact)
        prefix = "    " * indent_level
    separator = "\n" * block_spacing
    blocks = []
    block_replacements = []
    for (query_id, response) in zip(arxiv_ids, responses):
        replacements = arxiv_fmt_replacements(
            query_id, response, header_level=header_level
        )
        block_replacements.append(replacements)
        blocks.append(indent(MKD_FMT.format(**replacements), prefix))
    outfiles = set()  # files already written to (or pre-existing, appending)
    if write_to_files is None:
        return separator.join(blocks)
    else:  # write blocks to file, generate and return summary
        logger.debug("write_to_files = %s", write_to_files)
        summary_items = []
        tuples = tuple(zip(arxiv_ids, responses, block_replacements, blocks))
        for (id, response, replacements, block) in tuples:
            outfile = Path(
                make_filename(
                    write_to_files.format(**replacements),
                    allow_unicode=(not slugify),
                    slugify=slugify,
                )
            )
            logger.debug("Writing block for %s to %s", id, outfile)
            if append and outfile.is_file():
                # Pre-existing file in append mode: mark it so a separator is
                # written before the first block we append to it.
                outfiles.add(outfile)
            elif outfile not in outfiles:
                # Overwrite mode (first use of this file): start fresh.
                outfile.unlink(missing_ok=True)
            with outfile.open(mode="a", encoding="utf8") as out_fh:
                if outfile in outfiles:
                    out_fh.write(separator)
                out_fh.write(block)
            outfiles.add(outfile)
            replacements['outfile'] = outfile
            replacements['outfile_stem'] = outfile.stem
            if obsidian:
                summary_items.append(
                    "* [[{outfile_stem}|{title}]]".format(**replacements)
                )
            else:
                summary_items.append(
                    "* [{title}]({outfile})".format(**replacements)
                )
        prefix = "    " * indent_level
        return indent("\n".join(summary_items), prefix) + "\n"
# Expected markdown output of `arxiv_to_markdown` for the two IDs used in
# `test_arxiv_to_markdown`, with default settings.
# NOTE(review): blank lines between the fields of each block may have been
# lost in this paste -- confirm against the MKD_FMT template.
TEST_OUTPUT = r"""
## Prediction of Toric Code Topological Order from Rydberg Blockade
Ruben Verresen and Mikhail D. Lukin and Ashvin Vishwanath
http://arxiv.org/abs/2011.12310v1
#arxiv #atom-ph #cond-mat #physics #quant-gas #quant-ph #str-el
The physical realization of $\mathbb Z_2$ topological order as encountered in the paradigmatic toric code has proven to be an elusive goal. We show that this phase of matter can be created in a two-dimensional array of strongly interacting Rydberg atoms. Our proposal makes use of atoms localized on the sites of a ruby lattice, coupled via a Rydberg blockade mechanism. First, we show that the blockade model effectively realizes a monomer-dimer model on the kagome lattice with a single-site kinetic term, and we obtain its phase diagram using the numerical density matrix renormalization group method. We find a topological quantum liquid (TQL) as evidenced by multiple measures including (i) a continuous transition between two featureless phases, (ii) a topological entanglement entropy of $\ln 2$ as measured in various geometries, (iii) degenerate topological ground states and (iv) the expected modular matrix from ground state overlap. Next, we show that the TQL can persist upon including realistic, algebraically-decaying van der Waals interactions $V(r) \sim 1/r^6$. Moreover, we can directly access the topological loop operators of this model, which can be measured experimentally using a dynamic protocol, providing a "smoking gun" experimental signature of the TQL phase. Finally, we show how to trap an emergent anyon and realize different topological boundary conditions, and we discuss the implications for exploring fault-tolerant quantum memories.
[pdf](http://arxiv.org/pdf/2011.12310v1)
## Krotov: A Python implementation of Krotov's method for quantum optimal control
Michael H. Goerz and Daniel Basilewitsch and Fernando Gago-Encinas and Matthias G. Krauss and Karl P. Horn and Daniel M. Reich and Christiane P. Koch
http://arxiv.org/abs/1902.11284v6
#arxiv #quant-ph
We present a new open-source Python package, krotov, implementing the quantum optimal control method of that name. It allows to determine time-dependent external fields for a wide range of quantum control problems, including state-to-state transfer, quantum gate implementation and optimization towards an arbitrary perfect entangler. Krotov's method compares to other gradient-based optimization methods such as gradient-ascent and guarantees monotonic convergence for approximately time-continuous control fields. The user-friendly interface allows for combination with other Python packages, and thus high-level customization. The package is being developed at https://github.com/qucontrol/krotov
[doi](http://dx.doi.org/10.21468/SciPostPhys.7.6.080) [pdf](http://arxiv.org/pdf/1902.11284v6)
""".lstrip()
def test_arxiv_to_markdown():
    """Test `arxiv_to_markdown` against the live arXiv API."""
    markdown = arxiv_to_markdown('2011.12310', '1902.11284')
    assert markdown[-1] == "\n"
    assert markdown == TEST_OUTPUT
def test_arxiv_id_regex():
    """Test the regexes for extracting arXiv identifiers."""
    new_style = ['1612.00001', '1203.0023v2']
    old_style = ['hep-th/11030234', 'cs/0112345v2', 'cs.AI/0112345v2']
    for candidate in new_style:
        assert re.match(RE_ID_NEW, candidate)
        assert RX_ID_NEW.match(candidate)
    for candidate in old_style:
        assert re.match(RE_ID_OLD, candidate)
        assert RX_ID_OLD.match(candidate)
    text = (
        "http://arxiv.org/abs/1902.11284v6 2011.12310 arxiv:math.GT/0309136"
    )
    assert RX_ID.findall(text) == [
        '1902.11284v6',
        '2011.12310',
        'math.GT/0309136',
    ]
def test_arxiv_id_extraction():
    """Test that duplicate IDs are excluded in arxiv_ids."""
    text = (
        "http://arxiv.org/abs/1902.11284v6,1203.0023v2,1902.11284v6"
        " 1203.0023v2 1612.00001 cs/0112345v2"
    )
    expected = [
        ArxivID(raw)
        for raw in (
            '1902.11284v6',
            '1203.0023v2',
            '1612.00001',
            'cs/0112345v2',
        )
    ]
    assert ArxivID.find_all(text) == expected
def test_write_to_files(caplog):
    """Test the CLI end-to-end with the --write-to-files option.

    NOTE(review): this test invokes the live arXiv API through the CLI; the
    expected strings below may have lost leading whitespace in this paste --
    confirm against the actual CLI output.
    """
    runner = CliRunner(mix_stderr=False)
    # Case 1: all blocks are appended to one fixed file (abstracts.md)
    with runner.isolated_filesystem():
        result = runner.invoke(
            main,
            [
                '--debug',
                '--write-to-files',
                'abstracts.md',
                '--indent-level',
                '2',
            ],
            input='2011.12310v1 1902.11284v6',
        )
        assert result.exit_code == 0
        files = list(Path(".").glob("*.*"))
        assert Path("abstracts.md") in files
        abstracts_md = Path("abstracts.md").read_text()
        expected = """\
* [Prediction of Toric Code Topological Order from Rydberg Blockade](abstracts.md)
* [Krotov: A Python implementation of Krotov's method for quantum optimal control](abstracts.md)\n"""
        assert result.output == expected
        assert abstracts_md == TEST_OUTPUT
    # Case 2: one file per ID; a missing ID must be reported as an error
    with runner.isolated_filesystem():
        result = runner.invoke(
            main,
            [
                '--debug',
                '--write-to-files',
                '{query_id}.md',
                '--indent-level',
                '2',
            ],
            input='2011.12310v1 1902.11284v6 cs/0112345v2 math.GT/0309136',
            # note: cs/0112345v2 does not exist
        )
        files = list(Path(".").glob("*.*"))
        assert result.exit_code == 0
        error_msgs = [
            x.message
            for x in caplog.get_records(when="call")
            if x.levelno == logging.ERROR
        ]
        assert result.exit_code == 0
        assert len(error_msgs) == 1
        assert error_msgs[0] == "Cannot find ID cs/0112345v2"
        assert Path('2011.12310v1.md') in files
        assert Path('1902.11284v6.md') in files
        # the "/" in the old-style ID is converted to "-" by make_filename
        assert Path('math.GT-0309136.md') in files
        expected = """\
* [Prediction of Toric Code Topological Order from Rydberg Blockade](2011.12310v1.md)
* [Krotov: A Python implementation of Krotov's method for quantum optimal control](1902.11284v6.md)
* [Regular points in affine Springer fibers](math.GT-0309136.md)\n"""
        assert result.output == expected
    # Case 3: like case 2, but with Obsidian-style [[...]] summary links
    with runner.isolated_filesystem():
        result = runner.invoke(
            main,
            [
                '--debug',
                '--write-to-files',
                '{query_id}.md',
                '--indent-level',
                '2',
                '--obsidian',
            ],
            input='2011.12310v1 1902.11284v6 cs/0112345v2 math.GT/0309136',
            # note: cs/0112345v2 does not exist
        )
        files = list(Path(".").glob("*.*"))
        assert result.exit_code == 0
        expected = """\
* [[2011.12310v1|Prediction of Toric Code Topological Order from Rydberg Blockade]]
* [[1902.11284v6|Krotov: A Python implementation of Krotov's method for quantum optimal control]]
* [[math.GT-0309136|Regular points in affine Springer fibers]]\n"""
        assert result.output == expected
    # Case 4: filename pattern built from authors and title
    with runner.isolated_filesystem():
        result = runner.invoke(
            main,
            [
                '--debug',
                '--write-to-files',
                '{authors_comma_separated} - {title}.md',
                '--indent-level',
                '2',
            ],
            input='2011.12310v1 1902.11284v6 cs/0112345v2 math.GT/0309136',
            # note: cs/0112345v2 does not exist
        )
        files = list(Path(".").glob("*.*"))
        assert result.exit_code == 0
        expected = """\
* [Prediction of Toric Code Topological Order from Rydberg Blockade](Ruben Verresen, Mikhail D. Lukin, Ashvin Vishwanath - Prediction of Toric Code Topological Order from Rydberg Blockade.md)
* [Krotov: A Python implementation of Krotov's method for quantum optimal control](Michael H. Goerz, Daniel Basilewitsch, Fernando Gago-Encinas, Matthias G. Krauss, Karl P. Horn, Daniel M. Reich, Christiane P. Koch - Krotov A Python implementation of Krotov's method for quantum optimal control.md)
* [Regular points in affine Springer fibers](Mark Goresky, Robert Kottwitz, Robert MacPherson - Regular points in affine Springer fibers.md)\n"""
        assert result.output == expected
def has_stdin_data(timeout=0.1):
    """Return True if data is detected on stdin with `timeout` (in seconds)."""
    try:
        readable, _, _ = select.select([sys.stdin], [], [], timeout)
    except OSError:
        # stdin is not selectable (e.g. closed or a pseudo-file); report
        # True, as the original implementation did
        return True
    return bool(readable)
@click.command()
@click.help_option('--help', '-h')
@click.version_option(version=__version__)
@click.option('--debug', is_flag=True, help='enable debug logging')
@click.option(
    '--indent-level',
    type=click.IntRange(0, None),
    default=0,
    # BUGFIX: user-visible typo "indedation" -> "indentation"
    help=_sanitize_paragraphs(
        """The indentation level. The STDOUT output will be indented by four
spaces per indentation level."""
    ),
    show_default=True,
)
@click.option(
    '--block-spacing',
    type=click.IntRange(0, None),
    default=2,
    help=_sanitize_paragraphs(
        """The number of blank lines between markdown blocks for different
arXiv IDs."""
    ),
    show_default=True,
)
@click.option(
    '--header-level',
    type=click.IntRange(0, 5),
    default=2,
    help=_sanitize_paragraphs(
        """The level of the markdown heading to be used for the title of each
block. That is, the number of '#' symbols preceding the title."""
    ),
    show_default=True,
)
@click.option(
    '--write-to-files',
    '-o',
    type=click.Path(dir_okay=False),
    help=_sanitize_paragraphs(
        """A pattern for filenames to which to append the markdown block for
each arXiv ID. The pattern may contain any of the fields returned by
the arxiv_fmt_replacements function (see source code). For example,
"{query_id}.md" will write each block to a file named after its arXiv
ID. If given in combination with --no-append, the output file is
overwritten."""
    ),
)
@click.option(
    '--slugify/--no-slugify',
    default=False,
    help=_sanitize_paragraphs(
        """If given in combination with --write-to-files, indicates whether
filenames should be "slugified" (replace spaces with hyphens, allow
only select ASCII characters, convert to lowercase)."""
    ),
)
@click.option(
    '--append/--no-append',
    default=True,
    help=_sanitize_paragraphs(
        """If given in combination with --write-to-files, whether to append to
the output file (default) or whether to overwrite it."""
    ),
)
@click.option(
    '--obsidian/--no-obsidian',
    default=False,
    help=_sanitize_paragraphs(
        """If given in combination with --write-to-files, whether to use
Obsidian syntax for linking to the generated files or standard markdown
syntax."""
    ),
)
@click.argument('ids', nargs=-1)
def main(
    debug,
    indent_level,
    block_spacing,
    header_level,
    write_to_files,
    slugify,
    append,
    obsidian,
    ids,
):
    """Create markdown summaries of given arXiv identifiers.

    The arXiv identifiers are extracted from the positional arguments as well
    as from any text piped into the program on STDIN. If "-" is given as the
    only positional argument, the program will wait for input from STDIN.

    Example arXiv IDs are 2011.12310v1, 1902.11284 or (prior to 2007)
    math.GT/0309136. The script will search for anything that has the correct
    format, and ignore the remaining input. Thus, you may e.g. pipe a list of
    arXiv URLs into the script as input.

    By default, the script will write to STDOUT a block of markdown for each
    arXiv ID, starting with the manuscript title as a markdown header (see
    --header-level), followed by author information, the abstract, and
    relevant links. Multiple blocks are separated by blank lines (see
    --block-spacing).

    If the `--write-to-files` option is given, the markdown blocks are
    instead written to output files, and a markdown-formatted bullet list
    with the manuscript titles and links to the written files is written to
    STDOUT.
    """
    logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)
        logger.debug("Enabled debug output")
    # `text` (renamed from `input`, which shadowed the builtin) collects all
    # raw text in which to search for arXiv IDs
    if has_stdin_data() or ids == ("-",):
        logger.debug("Reading from pipe")
        stdin_text = click.get_text_stream('stdin')
        text = stdin_text.read().replace("\n", " ")
    else:
        logger.debug("Not connected to pipe")
        text = ""
    text += " " + " ".join(ids)
    logger.debug("INPUT: %s", text)
    ids = ArxivID.find_all(text)
    logger.debug("FOUND IDS: %s", ids)
    markdown = arxiv_to_markdown(
        *ids,
        indent_level=indent_level,
        block_spacing=block_spacing,
        header_level=header_level,
        write_to_files=write_to_files,
        slugify=slugify,
        append=append,
        obsidian=obsidian,
    )
    # nl=False: the markdown already ends in a newline
    click.echo(markdown, nl=False)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment