Skip to content

Instantly share code, notes, and snippets.

@mmas
Created June 20, 2016 20:39
Show Gist options
  • Save mmas/f796c73e134565a3d2698e459a3c613a to your computer and use it in GitHub Desktop.
Save mmas/f796c73e134565a3d2698e459a3c613a to your computer and use it in GitHub Desktop.
Truncate an HTML text to a certain number of words
from HTMLParser import HTMLParser
import re
# NOTE: despite the name, this pattern matches *words* (runs of ``\w``), not
# whitespace.  Because the pattern is a capturing group, ``split`` keeps the
# matched words: the result alternates separator, word, separator, ... so the
# words always sit at the odd indexes.  The name is left unchanged in case
# other code imports it.
re_whitespace = re.compile(r'(\w+)')


class WordTruncatedHTMLParser(HTMLParser):
    """
    Truncate an HTML text to a certain number of words.

    Markup is copied to the output until ``maxlength`` words of text have
    been emitted; at that point every element that is still open is closed
    and the rest of the input is ignored.  Tags, entity references and
    punctuation are copied through but do not count as words.

    Based on https://late.am/post/2011/12/02/truncating-html-with-python.html

    Example (the output is wrapped here for readability):

    >>> html = ('<p>one, <span class="bar">two</span>, <label>three'
    ...         '</label>,<label class="baz">four</label>,<div><span>five'
    ...         '</span>,<span>six</span></div></p><p>seven, '
    ...         '<img src="img.png"/>, eight, nine, ten</p><p><span>eleven'
    ...         '</span><span>twelve</span></p>')
    >>> parser = WordTruncatedHTMLParser(maxlength=8)
    >>> parser.feed(html)
    >>> print(parser.close())
    <p>one, <span class="bar">two</span>, <label>three</label>,<label class
    ="baz">four</label>,<div><span>five</span>,<span>six</span></div></p><p
    >seven, <img src="img.png"/>, eight</p>
    """

    # Elements that never take an end tag.  They must not be pushed on the
    # open-tag stack, otherwise the next real end tag looks mismatched and
    # truncation would emit a bogus closing tag such as ``</br>``.
    VOID_ELEMENTS = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])

    def __init__(self, maxlength, *args, **kwargs):
        """
        Create a parser that keeps at most ``maxlength`` words.

        Extra positional and keyword arguments are forwarded unchanged to
        ``HTMLParser.__init__``.
        """
        HTMLParser.__init__(self, *args, **kwargs)
        if 'convert_charrefs' not in kwargs:
            # Python 3's parser defaults to decoding character references
            # and passing them to handle_data, which would write them back
            # out unescaped.  Keep them flowing through handle_entityref /
            # handle_charref instead.  On Python 2 this attribute is unused.
            self.convert_charrefs = False
        self.stack = []             # names of the currently open elements
        self.maxlength = maxlength  # word budget
        self.length = 0             # words emitted so far
        self.out = []               # output fragments, joined by close()

    @staticmethod
    def _escape_attr(value):
        """Escape ``value`` for use inside a double-quoted attribute."""
        # Hand-rolled so the same code runs on Python 2 and Python 3.
        # ``&`` must be replaced first so later entities are not re-escaped.
        return (value.replace('&', '&amp;')
                     .replace('<', '&lt;')
                     .replace('>', '&gt;')
                     .replace('"', '&quot;'))

    def _format_attrs(self, attrs):
        """Serialize ``attrs`` (a list of ``(name, value)`` pairs) with a
        leading space before each attribute; return '' when there are none.
        """
        rendered = []
        for name, value in attrs:
            if value is None:
                # Valueless attribute such as ``<input disabled>``.
                rendered.append(' ' + name)
            else:
                # The parser hands over values already unescaped, so they
                # have to be escaped again on the way out.
                rendered.append(' %s="%s"' % (name, self._escape_attr(value)))
        return ''.join(rendered)

    def _close_open_tags(self):
        """Append an end tag for every open element, innermost first, and
        empty the stack so the tags are never closed twice."""
        while self.stack:
            self.out.append('</%s>' % self.stack.pop())

    def emit(self, thing, count=False):
        """
        Append ``thing`` to the output.

        When ``count`` is true ``thing`` is counted as one word.  Once the
        word budget is reached the open elements are closed.
        """
        if count:
            self.length += 1
        self.out.append(thing)
        if self.length == self.maxlength:
            # Trim right space.
            self.out[-1] = self.out[-1].rstrip()
            # Close out opened tags.
            self._close_open_tags()

    def handle_starttag(self, tag, attrs):
        """Copy a start tag to the output and remember it as open."""
        if self.length == self.maxlength:
            return
        if tag not in self.VOID_ELEMENTS:
            self.stack.append(tag)
        self.emit('<%s%s>' % (tag, self._format_attrs(attrs)))

    def handle_endtag(self, tag):
        """
        Copy an end tag to the output and pop it from the open-tag stack.

        Raises ``Exception`` when ``tag`` is not the innermost open element
        (including when no element is open at all).  End tags of void
        elements are dropped because their start tag was never stacked.
        """
        if self.length == self.maxlength:
            return
        if tag in self.VOID_ELEMENTS:
            return
        if not self.stack or tag != self.stack[-1]:
            raise Exception(
                'end tag %r does not match stack: %r' % (tag, self.stack))
        self.emit('</%s>' % tag)
        del self.stack[-1]

    def handle_startendtag(self, tag, attrs):
        """Copy a self-closing tag (``<img ... />``) to the output."""
        if self.length == self.maxlength:
            return
        self.emit('<%s%s/>' % (tag, self._format_attrs(attrs)))

    def handle_data(self, data):
        """Copy text to the output word by word until the budget is spent."""
        for index, piece in enumerate(re_whitespace.split(data)):
            if self.length == self.maxlength:
                break
            if not piece:
                # split() yields empty separators around adjacent matches.
                continue
            # Odd indexes are the captured words.  Position is used rather
            # than str.isalnum(), which is false for a word that contains an
            # underscore and would let such words through uncounted.
            self.emit(piece, index % 2 == 1)

    def handle_entityref(self, name):
        """Copy a named entity reference (``&name;``) through unchanged."""
        if self.length == self.maxlength:
            return
        self.emit('&%s;' % name)

    def handle_charref(self, name):
        """Copy a numeric character reference (``&#name;``) through."""
        return self.handle_entityref('#%s' % name)

    def close(self):
        """
        Finish parsing and return the truncated HTML as a single string.

        The base class ``close`` is called first so that any input still
        buffered by the parser is processed.  Elements left open because
        the input ended early are then closed.
        """
        HTMLParser.close(self)
        self._close_open_tags()
        return ''.join(self.out)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment