Skip to content

Instantly share code, notes, and snippets.

@soamaven
Last active May 2, 2018 06:54
Show Gist options
  • Save soamaven/4de1727f76790b574342bd6231402843 to your computer and use it in GitHub Desktop.
Save soamaven/4de1727f76790b574342bd6231402843 to your computer and use it in GitHub Desktop.
Parser and converter for Jupyter Markdown cells, for use with nbconvert
"""HTML Image handling for embedded images in markdown cells."""
#-----------------------------------------------------------------------------
# Copyright (c) 2013, the IPython Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
# Imports
#-----------------------------------------------------------------------------
import base64
import os.path
import re

from ipython_genutils.py3compat import PY3

if PY3:
    from html.parser import HTMLParser
else:
    from HTMLParser import HTMLParser
#-----------------------------------------------------------------------------
# Functions
#-----------------------------------------------------------------------------
# Names exported by ``from <this module> import *``.
__all__ = ['img2base64']
def img2base64(s):
    """Embed the images referenced by HTML tags in ``s`` as data URIs.

    The text is scanned with `Img2Base64Parser`; every ``src`` attribute
    value it reports is replaced in place by a
    ``data:image/<subtype>;base64,<payload>`` URI so the result can be
    rendered without the external image files.

    The transformation looks like this:

    `<img src="./Images/My_image.png" width="800" alt="Alt_name" />`

    becomes

    `<img src="data:image/png;base64,iVBORw0KGgo..." width="800" alt="Alt_name" />`

    Parameters
    ----------
    s : str
        Source text (e.g. a Markdown cell) containing HTML tags.

    Returns
    -------
    str
        ``s`` with each reported ``src`` value swapped for a data URI.  All
        other text, including everything after the last image, is copied
        through unchanged.

    Notes
    -----
    The image files are opened by the parser while it runs, so any error it
    raises while reading a file propagates to the caller.
    """
    # File extensions whose MIME subtype is not simply the extension itself.
    mime_subtypes = {
        'jpg': 'jpeg',
        'jpe': 'jpeg',
        'svg': 'svg+xml',
        'tif': 'tiff',
    }

    parser = Img2Base64Parser()
    parser.feed(s)
    parser.close()

    pieces = []
    startpos = 0
    for img in parser.imglist:
        encoded, extension = img[0]
        src_start, src_end = img[1]
        # b64encode returns bytes; decode them instead of trimming the repr
        # ("b'...'"), which also ate payloads that begin with the letter "b".
        if isinstance(encoded, bytes):
            encoded = encoded.decode('ascii')
        pieces.append(s[startpos:src_start])
        pieces.append('data:image/%s;base64,%s'
                      % (mime_subtypes.get(extension, extension), encoded))
        # Always resume right after the replaced value: only the attribute
        # value is rewritten, so whether the tag has a closing counterpart is
        # irrelevant and the remaining text must never be dropped.
        startpos = src_end
    pieces.append(s[startpos:])
    return u''.join(pieces)
#-----------------------------------------------------------------------------
# Classes
#-----------------------------------------------------------------------------
class Img2Base64Parser(HTMLParser):
    """Collect ``src`` attributes of start tags along with their image data.

    For every start tag whose ``src`` attribute names a local file, the file
    is read and base64-encoded and an entry is appended to ``imglist``::

        [[encoded_data, extension], [src_start, src_end], tag_end]

    ``encoded_data`` is the output of ``base64.b64encode``, ``extension`` the
    lower-cased file extension without the dot.  ``src_start``/``src_end``
    delimit the raw attribute value (quotes excluded) in the text passed to
    `feed`, and ``tag_end`` is the index just past the start tag.

    ``src`` values that are empty, already ``data:`` URIs, or carry a URL
    scheme (``http://``, ``https://``, ...) are left alone.  Errors raised by
    ``open`` for a local path that cannot be read are not caught.

    Inherits from HTMLParser, overrides:
    - reset
    - feed
    - handle_starttag
    """

    # list of found imgs
    imglist = None
    # Legacy nesting-tracking state.  Kept only so that existing attribute
    # access keeps working; the parser no longer consults either value.
    opentags = None
    imgtag = None

    # One attribute (name plus optional value) inside a raw start tag; this
    # mirrors the tolerant attribute grammar used by HTMLParser itself.
    _ATTR_RE = re.compile(
        r"""([^\s/>][^\s/=>]*)"""
        r"""(?:\s*=+\s*('[^']*'|"[^"]*"|(?!['"])[^>\s]*))?""")
    # src values that cannot be a local file: data URIs and scheme://... URLs.
    # The scheme needs two or more characters so "C://..." is not mistaken
    # for a URL.
    _NON_LOCAL_SRC = re.compile(r'(?:data:|[a-z][a-z0-9+.-]+://)',
                                re.IGNORECASE)

    def __init__(self):
        self.imglist = []
        self.opentags = 0
        # HTMLParser.__init__ calls reset(), which sets up the text buffers.
        HTMLParser.__init__(self)

    def reset(self):
        """Reset the parser state and forget previously fed text.

        ``imglist`` is deliberately left untouched.
        """
        self.data = ''
        # Absolute index at which each line of the fed text begins.
        self._line_starts = [0]
        HTMLParser.reset(self)

    def feed(self, data):
        """Feed ``data`` to the parser, recording where its lines start.

        HTMLParser reports positions as (line, column) pairs and counts lines
        by ``"\\n"`` only; the recorded line starts let `get_offset` turn
        such a pair into an absolute index, also across several feed calls.
        """
        base = len(self.data)
        newline = data.find('\n')
        while newline != -1:
            self._line_starts.append(base + newline + 1)
            newline = data.find('\n', newline + 1)
        self.data += data
        HTMLParser.feed(self, data)

    def get_offset(self):
        """Return the absolute index in the fed text of the current position.

        While a tag handler runs, this is the index of the tag's ``<``.
        """
        lineno, column = self.getpos()
        return self._line_starts[lineno - 1] + column

    def _find_src_span(self, tag):
        """Return (start, end) of the raw src value within the current tag.

        The offsets are relative to the start of the raw start-tag text.
        Returns None when the tag text is unavailable or the first ``src``
        attribute has no value.
        """
        text = self.get_starttag_text()
        if text is None:
            return None
        # Walk the attributes in order (starting after "<tagname") so that a
        # "src=" occurring inside another attribute's value is never matched.
        for match in self._ATTR_RE.finditer(text, len(tag) + 1):
            if match.group(1).lower() != 'src':
                continue
            if match.group(2) is None:
                return None
            start, end = match.span(2)
            quoted = (end - start >= 2 and text[start] in '"\''
                      and text[end - 1] == text[start])
            if quoted:
                start, end = start + 1, end - 1
            return start, end
        return None

    def handle_starttag(self, tag, attrs):
        """Record the image behind the tag's ``src`` attribute, if any.

        Also invoked for self-closing tags through HTMLParser's default
        ``handle_startendtag``.
        """
        path = next((value for name, value in attrs
                     if name.lower() == 'src'), None)
        if not path or self._NON_LOCAL_SRC.match(path.strip()):
            # No src, empty src, or something that is not a local file.
            return
        span = self._find_src_span(tag)
        if span is None:
            return
        with open(path, "rb") as image_file:
            encoded_data = base64.b64encode(image_file.read())
        extension = os.path.splitext(path)[1][1:].strip().lower()
        tag_start = self.get_offset()
        self.imglist.append([
            [encoded_data, extension],
            [tag_start + span[0], tag_start + span[1]],
            tag_start + len(self.get_starttag_text()),
        ])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment