Created
June 20, 2016 20:39
-
-
Save mmas/f796c73e134565a3d2698e459a3c613a to your computer and use it in GitHub Desktop.
Truncate an HTML text to a certain number of words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

try:  # Python 2
    from HTMLParser import HTMLParser
except ImportError:  # Python 3
    from html.parser import HTMLParser
# NOTE: despite its name, this pattern matches runs of *word* characters
# (letters, digits and underscore), not whitespace.  Because the pattern is a
# capturing group, ``re_whitespace.split(text)`` returns the separators and the
# words interleaved: even indexes hold the text between words, odd indexes
# hold the words themselves.  The name is kept for backward compatibility.
re_whitespace = re.compile(r'(\w+)')
class WordTruncatedHTMLParser(HTMLParser):
    """
    Truncate an HTML text to a certain number of words.

    The markup is re-emitted as it is parsed.  Every run of word characters
    found in text nodes counts as one word; once ``maxlength`` words have
    been written, the elements that are still open are closed and the rest
    of the input is ignored.  Comments, doctype declarations and processing
    instructions are not re-emitted.

    Based on https://late.am/post/2011/12/02/truncating-html-with-python.html

    Example:

    >>> parser = WordTruncatedHTMLParser(maxlength=3)
    >>> parser.feed('<p>one, <b>two</b>, three, four</p>')
    >>> print(parser.close())
    <p>one, <b>two</b>, three</p>
    """

    # Elements that never have an end tag.  They are written out but never
    # pushed on the stack of open elements, otherwise the next real end tag
    # would be reported as a mismatch.
    VOID_ELEMENTS = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link',
        'meta', 'param', 'source', 'track', 'wbr',
    ])

    # Capturing group: ``split`` returns separators at even indexes and the
    # matched words at odd indexes.
    _word_re = re.compile(r'(\w+)')

    def __init__(self, maxlength, *args, **kwargs):
        """
        :param maxlength: number of words to keep.
        :param args: forwarded to ``HTMLParser.__init__``.
        :param kwargs: forwarded to ``HTMLParser.__init__``.
        """
        HTMLParser.__init__(self, *args, **kwargs)
        # Python 3 decodes character references into the text by default,
        # which bypasses handle_entityref/handle_charref and would make this
        # class write bare "&" and "<" characters.  Keep the references
        # verbatim unless the caller explicitly asked otherwise.  On Python 2
        # the attribute is simply unused.
        if 'convert_charrefs' not in kwargs:
            self.convert_charrefs = False
        self.stack = []             # names of the currently open elements
        self.maxlength = maxlength
        self.length = 0             # number of words emitted so far
        self.out = []               # output fragments, joined by close()

    def _limit_reached(self):
        """Return True once ``maxlength`` words have been emitted."""
        return self.length == self.maxlength

    @staticmethod
    def _format_attrs(attrs):
        """
        Serialize ``attrs`` (a list of ``(name, value)`` pairs) into the text
        that follows the tag name, including the leading space.

        The parser hands attribute values over already unescaped, so they are
        escaped again here.  Attributes without a value (``value is None``)
        are written as a bare name.
        """
        rendered = []
        for name, value in attrs:
            if value is None:
                rendered.append(' %s' % name)
                continue
            escaped = (value.replace('&', '&amp;')
                            .replace('<', '&lt;')
                            .replace('>', '&gt;')
                            .replace('"', '&quot;'))
            rendered.append(' %s="%s"' % (name, escaped))
        return ''.join(rendered)

    def emit(self, thing, count=False):
        """
        Append ``thing`` to the output.

        If ``count`` is true, ``thing`` is counted as a word; when that word
        reaches the limit, the end tags of all open elements are appended,
        innermost first.  Calls made after the limit was reached are ignored.
        """
        if self._limit_reached():
            return
        self.out.append(thing)
        if not count:
            return
        self.length += 1
        if self._limit_reached():
            # Close out opened tags.
            self.out.extend('</%s>' % tag for tag in reversed(self.stack))

    def handle_starttag(self, tag, attrs):
        if self._limit_reached():
            return
        self.emit('<%s%s>' % (tag, self._format_attrs(attrs)))
        if tag not in self.VOID_ELEMENTS:
            self.stack.append(tag)

    def handle_endtag(self, tag):
        """
        Close the innermost open element.

        :raises Exception: if ``tag`` is not the innermost open element.
        """
        if self._limit_reached():
            return
        if tag in self.VOID_ELEMENTS:
            # Void elements are never on the stack; a stray end tag for one
            # of them is dropped.
            return
        if not self.stack or tag != self.stack[-1]:
            raise Exception(
                'end tag %r does not match stack: %r' % (tag, self.stack))
        self.emit('</%s>' % tag)
        self.stack.pop()

    def handle_startendtag(self, tag, attrs):
        if self._limit_reached():
            return
        self.emit('<%s%s/>' % (tag, self._format_attrs(attrs)))

    def handle_data(self, data):
        for index, token in enumerate(self._word_re.split(data)):
            if self._limit_reached():
                break
            if token:
                # Odd indexes are the regex matches, i.e. the words.
                self.emit(token, count=bool(index % 2))

    def handle_entityref(self, name):
        # References are written back verbatim and do not count as words.
        if self._limit_reached():
            return
        self.emit('&%s;' % name)

    def handle_charref(self, name):
        return self.handle_entityref('#%s' % name)

    def close(self):
        """Process any buffered input and return the truncated HTML."""
        HTMLParser.close(self)
        return ''.join(self.out)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment