Created
March 29, 2017 17:41
-
-
Save brazilbean/3ebb31324f6dad212817b3663c7a0219 to your computer and use it in GitHub Desktop.
Custom Jupyter Notebook Pre-processors
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''Specialized Preprocessors''' | |
import nbconvert, nbformat, re, sys | |
from nbconvert.preprocessors import ExecutePreprocessor, Preprocessor | |
from traitlets import Dict, Unicode | |
from textwrap import dedent | |
from warnings import warn | |
def warn_deprecated(msg):
    """Emit *msg* as a DeprecationWarning.

    ``stacklevel=2`` makes the warning point at the code that called this
    helper rather than at the helper itself.
    """
    category = DeprecationWarning
    warn(msg, category, stacklevel=2)
class RemoveCodePreprocessor(Preprocessor):
    """Drop every code cell and every markdown cell flagged with ``!``.

    A markdown cell is flagged when its first whitespace-separated token is
    made up only of ``#`` characters (a header marker) and its second token
    is exactly ``!``, e.g. ``## ! internal notes``.
    """

    def _should_remove(self, cell):
        """Return True when *cell* must be left out of the notebook."""
        if cell.cell_type == 'code':
            return True
        if cell.cell_type != 'markdown':
            return False

        tokens = cell.source.split()
        # Empty cells and bare headers such as "##" have fewer than two
        # tokens. Indexing them would raise IndexError, so such cells are
        # simply kept.
        if len(tokens) < 2:
            return False

        header, flag = tokens[0], tokens[1]
        return all(ch == '#' for ch in header) and flag == '!'

    def preprocess(self, notebook, resources):
        """Skip code cells and special markdown cells.

        Returns the ``(notebook, resources)`` pair with ``notebook.cells``
        replaced by the filtered list.
        """
        notebook.cells = [cell for cell in notebook.cells
                          if not self._should_remove(cell)]
        return notebook, resources
class FilterCodePreprocessor(Preprocessor):
    """Drop code cells that carry a ``##hidecell`` marker."""

    def _should_filter(self, cell):
        """Tell whether *cell* is a code cell tagged for hiding.

        The marker is matched case-insensitively anywhere in the source.
        """
        if cell.cell_type != 'code':
            return False
        marker = re.search('##hidecell', cell.source, re.I)
        return marker is not None

    def preprocess(self, notebook, resources):
        """Keep only the cells that are not tagged for hiding."""
        kept = []
        for cell in notebook.cells:
            if not self._should_filter(cell):
                kept.append(cell)
        notebook.cells = kept
        return notebook, resources
class ClearEmptyRawCellsPreprocessor(Preprocessor):
    """Strip raw cells whose source is the empty string."""

    def preprocess(self, notebook, resources):
        """Return ``(notebook, resources)`` without the empty raw cells."""
        def keep(cell):
            # A cell survives unless it is both raw and empty.
            return cell.cell_type != 'raw' or cell.source != ''

        notebook.cells = list(filter(keep, notebook.cells))
        return notebook, resources
class ArgumentSubstitutionPreprocessor(Preprocessor):
    """Substitute ``<<name>>`` placeholders in code cells."""

    # Placeholder name -> replacement text; tagged as configurable.
    args = Dict(Unicode(),
                default_value={}).tag(config=True)

    def replace_variables(self, source, variables):
        """Replace each ``<<variablename>>`` in *source* with its stored value.

        Parameters
        ----------
        source : str
            Text to scan for placeholders.
        variables : mapping
            Placeholder name -> replacement text.

        Returns
        -------
        str
            The substituted text. A placeholder whose name is not in
            *variables* is replaced by the bare name (the angle brackets are
            dropped). If the substitution raises ``TypeError`` a warning is
            written to stderr and *source* is returned unchanged.
        """
        try:
            replaced = re.sub(
                "<<(.*?)>>",
                lambda m: variables.get(m.group(1), m.group(1)),
                source)
        except TypeError:
            # Best effort: report on stderr and leave the cell untouched.
            print("WARNING: unable to perform replacement in cell: {}".format(source),
                  file=sys.stderr)
            replaced = source
        return replaced

    def preprocess_cell(self, cell, resources, index):
        """Apply the configured substitutions to a single code cell.

        Parameters
        ----------
        cell : NotebookNode cell
            Notebook cell being processed
        resources : dictionary
            Additional resources used in the conversion process.  Allows
            preprocessors to pass variables into the Jinja engine.
        index : int
            Index of the cell being processed (see base.py)

        Returns
        -------
        tuple
            ``(cell, resources)``; only code cells are modified, and only
            when ``args`` is non-empty.
        """
        if cell.cell_type == "code" and self.args:
            cell.source = self.replace_variables(cell.source, self.args)
        return cell, resources
class ExecuteCodeMarkdownPreprocessor(ExecutePreprocessor):
    """Execute code cells and ``{{...}}`` snippets embedded in markdown cells.

    Every cell is first handed to a ``MetaCodePreprocessor``; cells it drops
    are replaced by an empty raw cell. Code cells are then executed, and the
    inline snippets of markdown cells are executed with their rendered
    results stored in ``cell.metadata['variables']``.
    """

    def __init__(self, **kw):
        self.sections = {'default': True}  # maps section ID to true or false
        # Placeholder returned in place of every cell that is dropped.
        self.EmptyCell = nbformat.v4.nbbase.new_raw_cell("")
        self.MetaCodeProcessor = MetaCodePreprocessor(self)
        super().__init__(**kw)

    def preprocess_cell(self, cell, resources, cell_index):
        """Process a single cell. See base.py for details.

        To process all cells see :meth:`preprocess`.

        Returns ``(cell, resources)``. The returned cell is the empty raw
        placeholder when the metacode processor dropped the input cell.
        A ``TimeoutError`` is reported on stderr and re-raised.
        """
        try:
            cell, resources = self.MetaCodeProcessor.process_cell(cell, resources)
            if cell is None:
                return self.EmptyCell, resources
            if cell.cell_type == 'code':
                return self.preprocess_code_cell(cell, resources, cell_index)
            if cell.cell_type == 'markdown':
                return self.preprocess_markdown_cell(cell, resources, cell_index)
            # Any other cell type passes through untouched.
            return cell, resources
        except TimeoutError:
            print("Timeout on execution of cell: {}".format(cell.source), file=sys.stderr, flush=True)
            raise

    def preprocess_code_cell(self, cell, resources, cell_index):
        """Execute a code cell and attach its outputs.

        Raises ``CellExecutionError`` on the first error output unless
        ``allow_errors`` is set.
        """
        # NOTE(review): assumes run_cell returns the list of outputs; this
        # API differs between nbconvert releases -- confirm for the pinned one.
        outputs = self.run_cell(cell)
        cell.outputs = outputs

        if not self.allow_errors:
            for out in outputs:
                if out.output_type == 'error':
                    pattern = u"""\
                        An error occurred while executing the following cell:
                        ------------------
                        {cell.source}
                        ------------------
                        {out.ename}: {out.evalue}
                        """
                    msg = dedent(pattern).format(out=out, cell=cell)
                    raise nbconvert.preprocessors.execute.CellExecutionError(msg)
        return cell, resources

    def preprocess_markdown_cell(self, cell, resources, cell_index):
        """Execute every ``{{code}}`` snippet found in a markdown cell.

        The first output of each snippet that can be converted is stored in
        ``cell['metadata']['variables']`` under the snippet's source text.
        The markdown source itself is not modified here.
        """
        cell['metadata']['variables'] = {}
        for m in re.finditer("{{(.*?)}}", cell.source):
            # Run the snippet as if it were a stand-alone code cell.
            fakecell = nbformat.v4.nbbase.new_code_cell(m.group(1))
            fakecell, resources = self.preprocess_code_cell(fakecell, resources, cell_index)

            # Keep the first output that has a usable representation.
            for output in fakecell.outputs:
                html = self.convert_output_to_html(output)
                if html is not None:
                    cell['metadata']['variables'][fakecell.source] = html
                    break
        return cell, resources

    def convert_output_to_html(self, output):
        '''Convert IOpub output to HTML

        Returns the converted text, or None when the output type or all of
        its MIME types are unsupported.

        See https://github.com/ipython-contrib/IPython-notebook-extensions/blob/master/nbextensions/usability/python-markdown/main.js
        '''
        kind = output['output_type']
        if kind == 'error':
            return '**' + output.ename + '**: ' + output.evalue
        if kind != 'execute_result' and kind != 'display_data':
            return None

        data = output.data
        # MIME types are tried in a fixed priority order.
        if 'text/latex' in data:
            return data['text/latex']
        if 'image/svg+xml' in data:
            # Not supported (the referenced JS notes that an embedded SVG
            # still gets eaten by the sanitizer).
            return None
        if 'image/jpeg' in data:
            return '<img src="data:image/jpeg;base64,' + data['image/jpeg'] + '"/>'
        if 'image/png' in data:
            return '<img src="data:image/png;base64,' + data['image/png'] + '"/>'
        if 'text/markdown' in data:
            return data['text/markdown']
        if 'text/html' in data:
            return data['text/html']
        if 'text/plain' in data:
            text = data['text/plain']
            # Strip <p>...</p> wrappers, then single quotes around the text.
            text = re.sub(r'<p>([\s\S]*?)<\/p>', r'\1', text)
            text = re.sub(r"'([\s\S]*?)'", r'\1', text)
            return text
        # Some MIME type we don't support.
        return None
class MetaCodePreprocessor:
    """Interpret ``#$`` metacode tags that include or exclude runs of cells.

    The processor keeps one flag, ``_in_exclude_mode``. Tag cells update the
    flag and are always removed; ordinary cells are removed while the flag
    is set.
    """

    def __init__(self, cell_runner):
        """Store *cell_runner*, whose ``run_cell(cell)`` evaluates conditions."""
        self._in_exclude_mode = False
        self.cell_runner = cell_runner

    def _output_to_bool(self, output):
        '''Convert cell execution output to a boolean'''
        if not output:
            # Empty string is false
            return False
        try:
            # Return the boolean version of evaluating the string
            # Should handle cases like "True", "False", "0", "1", etc.
            # NOTE(review): eval runs on text produced by the notebook's own
            # code; do not feed it output from untrusted notebooks.
            return bool(eval(output))
        except (NameError, SyntaxError):
            # The output did not comprise a valid python expression
            # (e.g. "<Foo object at 0x...>"), so treat it as truthy.
            return True

    def _evaluate_cell(self, cell):
        '''Evaluate the cell and return True or False'''
        # Execute the cell source to see whether the section should be kept.
        # The first output that has a text/plain representation decides.
        outputs = self.cell_runner.run_cell(cell)
        for output in outputs:
            if "data" in output and "text/plain" in output.data:
                return self._output_to_bool(output.data["text/plain"])
        # Default to True when nothing usable was produced.
        return True

    def process_cell(self, cell, resources):
        '''Identify and process metacode tags

        Metacode tags are found on the first line of the cell and start with "#$"

        Returns ``(None, resources)`` when the cell is a tag or falls inside
        an excluded block, otherwise ``(cell, resources)``.
        '''
        # The argument part is optional so that bare tags such as "#$ else"
        # or "#$ endif" are recognised as well.
        m = re.match(r'\#\$\s+(\w+)(?:\s+(.+))?', cell.source)
        if not m:
            if self._in_exclude_mode:
                return None, resources
            return cell, resources

        command = m.group(1)
        args = m.group(2) or ''

        if command == 'if':
            # Start if block
            self._in_exclude_mode = not self._evaluate_cell(cell)
        elif command == 'else':
            # Start else block
            self._in_exclude_mode = not self._in_exclude_mode
        elif command == 'end':
            # End block; slicing avoids IndexError on missing/blank arguments
            if args.split()[:1] == ['if']:
                # End if/else block
                self._in_exclude_mode = False
            else:
                print("Unrecognized metacode end tag: " + m.group(0))
        elif command == 'endif':
            # Deprecated: end if/else block
            warn_deprecated("'endif' is deprecated. Use 'end if'")
            self._in_exclude_mode = False
        elif command == 'section':
            # Deprecated functionality kept for backwards compatibility
            warn_deprecated("'section' tag is deprecated. Use 'if' and 'end if'")
            if 'start' in args.lower():
                # Start section block
                self._in_exclude_mode = not self._evaluate_cell(cell)
            elif 'end' in args.lower():
                # End section block
                self._in_exclude_mode = False
        else:
            # Unrecognized tag
            print("Unrecognized metacode tag: " + m.group(0))

        # All cells with metacode tags are removed
        return None, resources
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Here's what I wrote recently for that purpose:
This little preprocessor is available on PyPI as jupybeans. For example:
pip install jupybeans
jupyter nbconvert "notebook.ipynb" --Exporter.preprocessors jupybeans.RemoveSkip --to html_embed