Skip to content

Instantly share code, notes, and snippets.

@ralsina
Created August 19, 2013 02:38
Show Gist options
  • Save ralsina/6265362 to your computer and use it in GitHub Desktop.
Save ralsina/6265362 to your computer and use it in GitHub Desktop.
A proof-of-concept hyphenator for HTML files using pyphen, based on https://github.com/dmalinovsky/kindle-hyphens/blob/master/main.py
import codecs
import re
import sys

import lxml.html
from lxml import etree
import pyphen
# Language code handed to pyphen.Pyphen(lang=...) when the hyphenator is built.
lang="en"
# Invisible "soft hyphen": renderers may break a line here, otherwise it is hidden.
_SOFT_HYPHEN = u'\u00AD'

# Elements whose content is code or raw data rather than prose; inserting
# invisible characters there would corrupt scripts, styles and listings.
_VERBATIM_TAGS = frozenset(('script', 'style', 'pre', 'code', 'textarea'))

# A "word" is any run of non-whitespace characters (same tokens as str.split()).
_WORD_RE = re.compile(r'\S+', re.UNICODE)


def insert_hyphens(node, hyphenator):
    """Insert soft hyphens into the text of *node* and all of its descendants.

    The tree is modified in place. For every visited node both ``.text`` (the
    text before its first child) and ``.tail`` (the text that follows the
    node's closing tag) are processed, so the tail of *node* itself is
    hyphenated as well.

    Whitespace is preserved exactly as found: only the runs of non-whitespace
    characters are replaced, so newlines, tabs and non-breaking spaces are
    kept and whitespace-only strings are left untouched.

    Nodes that are not real elements (entities, comments, processing
    instructions -- their ``.tag`` is not a string) keep their own text and
    only have their tail processed. Elements listed in ``_VERBATIM_TAGS`` are
    treated the same way and their subtree is not visited.

    Args:
        node: An ElementTree-style node exposing ``tag``, ``text``, ``tail``
            and iteration over its children.
        hyphenator: Object with an ``inserted(word, hyphen=...)`` method that
            returns *word* with *hyphen* placed at its break points.

    Returns:
        None.
    """
    def hyphenate(text):
        # Rewrite each word in place so the surrounding whitespace survives.
        return _WORD_RE.sub(
            lambda match: hyphenator.inserted(match.group(),
                                              hyphen=_SOFT_HYPHEN),
            text)

    # Special nodes carry a factory function in .tag instead of a tag name.
    is_element = isinstance(node.tag, str)
    has_prose = is_element and node.tag.lower() not in _VERBATIM_TAGS

    if has_prose and node.text:
        node.text = hyphenate(node.text)
    # The tail sits outside the node, so it is prose even for skipped nodes.
    if node.tail:
        node.tail = hyphenate(node.tail)

    if has_prose:
        for child in node:
            insert_hyphens(child, hyphenator)
def main(argv=None):
    """Hyphenate an HTML file and write the result to a new file.

    Usage: ``script INPUT.html [OUTPUT.html]``. The output path defaults to
    ``foo.html`` in the current directory.

    Text inside (and directly after) every ``p``, ``div``, ``li`` and ``span``
    element is passed through ``insert_hyphens``. Only the outermost matching
    elements are selected: ``insert_hyphens`` already recurses into
    descendants, so selecting nested matches too would hyphenate the same
    text several times.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 2 when the input path is missing.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.stderr.write('usage: %s INPUT.html [OUTPUT.html]\n' % sys.argv[0])
        return 2
    source = args[0]
    target = args[1] if len(args) > 1 else 'foo.html'

    dom = lxml.html.parse(source)
    hyphenator = pyphen.Pyphen(lang=lang)

    is_block = ' or '.join('self::%s' % tag
                           for tag in ('p', 'div', 'li', 'span'))
    # Matching elements that have no matching ancestor.
    outermost = '//*[%s][not(ancestor::*[%s])]' % (is_block, is_block)
    for node in dom.xpath(outermost):
        insert_hyphens(node, hyphenator)

    with codecs.open(target, 'w', 'utf-8') as f:
        f.write(lxml.html.tostring(dom, encoding='unicode'))
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment