Last active
May 2, 2018 06:54
-
-
Save soamaven/4de1727f76790b574342bd6231402843 to your computer and use it in GitHub Desktop.
Parser and converter for jupyter markdown cells for nbconvert
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""HTML Image handling for embedded images in markdown cells.""" | |
#----------------------------------------------------------------------------- | |
# Copyright (c) 2013, the IPython Development Team. | |
# | |
# Distributed under the terms of the Modified BSD License. | |
# | |
# The full license is in the file COPYING.txt, distributed with this software. | |
#----------------------------------------------------------------------------- | |
#----------------------------------------------------------------------------- | |
# Imports | |
#----------------------------------------------------------------------------- | |
import base64
import os.path
import re

from ipython_genutils.py3compat import PY3

if PY3:
    from html.parser import HTMLParser
else:
    from HTMLParser import HTMLParser
#----------------------------------------------------------------------------- | |
# Functions | |
#----------------------------------------------------------------------------- | |
__all__ = ['img2base64']


def img2base64(s):
    """Embed local images referenced by HTML ``src`` attributes as data URIs.

    The text is scanned with ``Img2Base64Parser``; for every entry the parser
    reports, the value of the ``src`` attribute is replaced by a base64 data
    URI so the resulting HTML is self-contained.  Everything else in *s*
    (other attributes, surrounding Markdown/HTML) is copied through verbatim.

    The transformation looks like this:

    `<img src="./Images/My_image.png" width="800" height="800" alt="Alt_name" title="Mytitle" align="center" />`

    Becomes

    `<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADIA..." width="800" height="800" alt="Alt_name" title="Mytitle" align="center" />`

    Parameters
    ----------
    s : str
        Markdown/HTML source of a cell.

    Returns
    -------
    str
        *s* with each reported ``src`` value replaced by
        ``data:image/<subtype>;base64,<payload>``.

    Notes
    -----
    Each parser entry is read as ``[[encoded_data, extension], [start, end],
    ...]`` where ``start``/``end`` delimit the ``src`` value inside *s*.
    Any further elements of an entry are ignored here.
    """
    # File extensions whose registered MIME subtype differs from the
    # extension itself; anything else is used as-is.
    mime_subtypes = {'jpg': 'jpeg', 'svg': 'svg+xml', 'tif': 'tiff'}

    parser = Img2Base64Parser()
    parser.feed(s)
    parser.close()

    pieces = []
    startpos = 0
    for img in parser.imglist:
        encoded, extension = img[0]
        src_start, src_end = img[1]
        # b64encode returns bytes; decode instead of trimming the repr,
        # because lstrip("b'") would also eat leading 'b' characters that
        # belong to the base64 payload.
        if isinstance(encoded, bytes):
            encoded = encoded.decode('ascii')
        subtype = mime_subtypes.get(extension, extension)
        pieces.append(s[startpos:src_start])
        pieces.append('data:image/%s;base64,%s' % (subtype, encoded))
        # Always resume right after the src value: only that span is
        # replaced, so a tag without an end marker (e.g. ``<img ...>``)
        # must not cause the remainder of the text to be dropped.
        startpos = src_end
    pieces.append(s[startpos:])
    return u''.join(pieces)
#----------------------------------------------------------------------------- | |
# Classes | |
#----------------------------------------------------------------------------- | |
class Img2Base64Parser(HTMLParser):
    """Image parser.

    Collects, for HTML tags carrying a ``src`` attribute, the base64-encoded
    content of the referenced local file together with the position of the
    ``src`` value in the fed text.

    Each element of ``imglist`` has the layout::

        [[encoded_data, extension], [src_start, src_end], tag_end]

    * ``encoded_data`` -- result of ``base64.b64encode`` on the file content
    * ``extension``    -- lower-cased file extension without the dot
    * ``src_start``/``src_end`` -- absolute offsets of the raw ``src`` value
    * ``tag_end``      -- absolute offset just past the closing of the tag;
      appended once the tag is known to be closed (immediately for void
      elements such as ``img``)

    Tags nested inside a tracked, still-open tag are not processed.  A
    ``src`` that is empty, a ``data:`` URI or contains ``://`` is skipped,
    because it does not name a local file that could be opened.

    Inherits from HTMLParser, overrides:
     - reset
     - feed
     - handle_starttag
     - handle_endtag
    """

    # nesting depth of open tags named like the tracked one (0 = none)
    opentags = None
    # list of found imgs
    imglist = None
    # name of the non-void tag tracked until its end tag, None otherwise
    imgtag = None
    # all text passed to feed() since the last reset()
    data = None

    # Elements that never have an end tag; they are complete as soon as the
    # start tag has been seen.
    _VOID_TAGS = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])
    # '<' followed by the tag name
    _TAG_NAME_RE = re.compile(r'<[^\s/>]*')
    # One attribute: name, optionally followed by a double-quoted,
    # single-quoted or unquoted value (groups 2, 3 and 4 respectively).
    _ATTR_RE = re.compile(
        r'''([^\s"'<>/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]*)))?''')

    def __init__(self):
        self.imglist = []
        self.opentags = 0
        self.imgtag = None
        # HTMLParser.__init__ calls reset(), which initialises self.data.
        HTMLParser.__init__(self)

    def reset(self):
        """Reset the parser position and forget the text fed so far."""
        HTMLParser.reset(self)
        self.data = u''

    def feed(self, data):
        """Feed *data* to the parser, keeping a copy for offset lookups."""
        # getpos() counts across successive feed() calls, so the text has
        # to be accumulated rather than overwritten.
        self.data += data
        HTMLParser.feed(self, data)

    def _tag_offset(self):
        """Return the absolute index in ``self.data`` of the current tag."""
        lineno, column = self.getpos()
        line_start = 0
        for _ in range(lineno - 1):
            line_start = self.data.index('\n', line_start) + 1
        return line_start + column

    def _tag_end(self):
        """Return the absolute index just past the tag being handled."""
        pos = self._tag_offset()
        if self.data.startswith('</', pos):
            # a real end tag: it extends up to the next '>'
            close = self.data.find('>', pos)
            return close + 1 if close != -1 else len(self.data)
        # a start tag or a self-closing tag
        return pos + len(self.get_starttag_text() or '')

    def _src_span(self):
        """Return ``(start, end)`` of the raw src value of the start tag.

        The offsets are absolute indices into ``self.data``.  Returns None
        when the start tag being handled has no ``src`` attribute value.
        """
        raw = self.get_starttag_text()
        if not raw:
            return None
        base = self._tag_offset()
        scan_from = self._TAG_NAME_RE.match(raw).end()
        for match in self._ATTR_RE.finditer(raw, scan_from):
            if match.group(1).lower() != 'src':
                continue
            for group in (2, 3, 4):
                if match.group(group) is not None:
                    start, end = match.span(group)
                    return base + start, base + end
            return None
        return None

    def get_offset(self):
        """Return the absolute start of the current start tag's src value.

        Only meaningful while a start tag is being handled; returns -1 when
        that tag has no ``src`` value.
        """
        span = self._src_span()
        return span[0] if span is not None else -1

    def handle_starttag(self, tag, attrs):
        """Record the file behind the ``src`` attribute of *tag*, if any.

        Raises whatever ``open`` raises when the referenced local file
        cannot be read.
        """
        if self.opentags > 0:
            # inside a tracked tag: only keep count of same-named nesting
            if tag == self.imgtag:
                self.opentags += 1
            return

        source = None
        for name, value in attrs:
            if name.lower() == 'src':
                source = value
                break
        # Nothing to embed for a missing/empty src, an already embedded
        # image or a remote URL.
        if (not source or source.lower().startswith('data:')
                or '://' in source):
            return
        span = self._src_span()
        if span is None:
            return

        with open(source, "rb") as image_file:
            encoded_data = base64.b64encode(image_file.read())
        extension = os.path.splitext(source)[1][1:].strip().lower()
        entry = [[encoded_data, extension], [span[0], span[1]]]
        self.imglist.append(entry)

        if tag in self._VOID_TAGS:
            # No end tag will follow, so the entry is complete right away
            # and the parser stays ready for the next image.
            entry.append(self._tag_end())
        else:
            self.imgtag = tag
            self.opentags = 1

    def handle_endtag(self, tag):
        """Complete the tracked entry when its matching end tag is seen."""
        if self.opentags == 0 or tag != self.imgtag:
            return
        if self.opentags == 1:
            # outermost tracked tag closed: store where it ends
            self.imglist[-1].append(self._tag_end())
            self.imgtag = None
        self.opentags -= 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment