ralsina · August 19, 2013 02:38
diff --git a/hyphenate-poc.py b/hyphenate-poc.py
 import codecs
 import sys

 import lxml.html
 from lxml import etree
 import pyphen

 # Language code handed to pyphen.Pyphen() when the hyphenator is built.
 lang = "en"

 def insert_hyphens(node, hyphenator):
    """Insert soft hyphens into the text of ``node`` and of all its descendants.

    ``node.text`` and ``node.tail`` are rewritten in place: every run of
    non-whitespace characters is passed to ``hyphenator.inserted()`` with
    U+00AD (SOFT HYPHEN) as the hyphen, then the function recurses into
    ``node.iterchildren()``.

    Whitespace runs (newlines, repeated spaces, non-breaking spaces, ...)
    are kept exactly as they were.  Soft hyphens already present in a word
    are removed before it is hyphenated, so calling the function again on
    the same node gives the same result instead of stacking hyphens.

    Only ``.tail`` is touched for:

    * comments, processing instructions and entities, whose ``.tag`` is a
      factory function rather than a tag name;
    * ``<script>`` and ``<style>`` elements, whose ``.text`` is code.

    :param node: lxml-style element exposing ``tag``, ``text``, ``tail``
        and ``iterchildren()``.
    :param hyphenator: object with an ``inserted(word, hyphen=...)`` method
        (e.g. a ``pyphen.Pyphen`` instance).
    :returns: ``None``; the tree is modified in place.
    """
    # Imported here so the function does not depend on a module-level import.
    import re

    soft_hyphen = u'\u00AD'

    textattrs = ('text', 'tail')
    if callable(node.tag) or node.tag in ('script', 'style'):
        # The .text of these nodes is not prose; the tail after them still is.
        textattrs = ('tail',)

    for attr in textattrs:
        text = getattr(node, attr)
        if not text:
            continue
        # A capturing split alternates word, whitespace, word, ...: the even
        # slots are the words, the odd slots the separators kept verbatim.
        tokens = re.split(r'(\s+)', text, flags=re.UNICODE)
        tokens[::2] = [
            hyphenator.inserted(word.replace(soft_hyphen, u''),
                                hyphen=soft_hyphen) if word else word
            for word in tokens[::2]
        ]
        setattr(node, attr, u''.join(tokens))

    for child in node.iterchildren():
        insert_hyphens(child, hyphenator)

 # Parse the HTML file named by the first command-line argument.
 dom = lxml.html.parse(sys.argv[1])
 hyphenator = pyphen.Pyphen(lang=lang)

 # Select only the outermost p/div/li/span elements.  insert_hyphens() walks
 # every descendant itself, so also selecting the nested matches would run
 # the hyphenator over the same text several times.
 tags = ('p', 'div', 'li', 'span')
 is_target = ' or '.join('self::%s' % tag for tag in tags)
 inside_target = ' or '.join('ancestor::%s' % tag for tag in tags)
 for node in dom.xpath("//*[%s][not(%s)]" % (is_target, inside_target)):
    insert_hyphens(node, hyphenator)

 # The output is only written, never read back, so plain 'w' mode suffices.
 with codecs.open('foo.html', 'w', 'utf-8') as f:
    f.write(lxml.html.tostring(dom, encoding='unicode'))
	import codecs
	import sys

	import lxml.html
	from lxml import etree
	import pyphen

	# Language code handed to pyphen.Pyphen() when the hyphenator is built.
	lang = "en"

	def insert_hyphens(node, hyphenator):
	    """Insert soft hyphens into the text of ``node`` and of all its descendants.

	    ``node.text`` and ``node.tail`` are rewritten in place: every run of
	    non-whitespace characters is passed to ``hyphenator.inserted()`` with
	    U+00AD (SOFT HYPHEN) as the hyphen, then the function recurses into
	    ``node.iterchildren()``.

	    Whitespace runs (newlines, repeated spaces, non-breaking spaces, ...)
	    are kept exactly as they were.  Soft hyphens already present in a word
	    are removed before it is hyphenated, so calling the function again on
	    the same node gives the same result instead of stacking hyphens.

	    Only ``.tail`` is touched for:

	    * comments, processing instructions and entities, whose ``.tag`` is a
	      factory function rather than a tag name;
	    * ``<script>`` and ``<style>`` elements, whose ``.text`` is code.

	    :param node: lxml-style element exposing ``tag``, ``text``, ``tail``
	        and ``iterchildren()``.
	    :param hyphenator: object with an ``inserted(word, hyphen=...)`` method
	        (e.g. a ``pyphen.Pyphen`` instance).
	    :returns: ``None``; the tree is modified in place.
	    """
	    # Imported here so the function does not depend on a module-level import.
	    import re

	    soft_hyphen = u'\u00AD'

	    textattrs = ('text', 'tail')
	    if callable(node.tag) or node.tag in ('script', 'style'):
	        # The .text of these nodes is not prose; the tail after them still is.
	        textattrs = ('tail',)

	    for attr in textattrs:
	        text = getattr(node, attr)
	        if not text:
	            continue
	        # A capturing split alternates word, whitespace, word, ...: the even
	        # slots are the words, the odd slots the separators kept verbatim.
	        tokens = re.split(r'(\s+)', text, flags=re.UNICODE)
	        tokens[::2] = [
	            hyphenator.inserted(word.replace(soft_hyphen, u''),
	                                hyphen=soft_hyphen) if word else word
	            for word in tokens[::2]
	        ]
	        setattr(node, attr, u''.join(tokens))

	    for child in node.iterchildren():
	        insert_hyphens(child, hyphenator)

	# Parse the HTML file named by the first command-line argument.
	dom = lxml.html.parse(sys.argv[1])
	hyphenator = pyphen.Pyphen(lang=lang)

	# Select only the outermost p/div/li/span elements.  insert_hyphens() walks
	# every descendant itself, so also selecting the nested matches would run
	# the hyphenator over the same text several times.
	tags = ('p', 'div', 'li', 'span')
	is_target = ' or '.join('self::%s' % tag for tag in tags)
	inside_target = ' or '.join('ancestor::%s' % tag for tag in tags)
	for node in dom.xpath("//*[%s][not(%s)]" % (is_target, inside_target)):
	    insert_hyphens(node, hyphenator)

	# The output is only written, never read back, so plain 'w' mode suffices.
	with codecs.open('foo.html', 'w', 'utf-8') as f:
	    f.write(lxml.html.tostring(dom, encoding='unicode'))