Created
August 31, 2016 14:55
-
-
Save kageurufu/47ed6c458aaee709e6de49ac5bb96c72 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
html_to_text
Parses and converts HTML to plain text with formatting to match the original intention.
For example, the following HTML would be converted as follows:
<html>
<head>
<style>
/* This should NOT be in the output */
</style>
<body>
<p>
This is not a quote.
But it is a long paragraph with no line breaks.
</p>
<blockquote>
<p>This is a quote</p>
<p>As well as this</p>
<blockquote>
<p>Quoteception</p>
<blockquote>
<p>Quoteception</p>
<blockquote>
<p>Quoteception</p>
</blockquote>
</blockquote>
</blockquote>
</blockquote>
<ol>
<li>Test</li>
<li>Test</li>
<li>Test</li>
</ol>
<ul>
<li>Test</li>
<li>Test</li>
<li>Test</li>
</ul>
<p>
<a href="mailto:[email protected]">Mailto links should be included</a><br>
<a href="mailto:[email protected]">except where it would cause duplication such as [email protected]</a><br/>
<a href="http://comanage.com">But normal links should</a>
</p>
</body>
</html>
>>> print(html_to_text(__doc__[__doc__.find('<html>'):__doc__.find('</html>') + 7])) # doctest: +NORMALIZE_WHITESPACE
This is not a quote. But it is a long paragraph with no line breaks.
<BLANKLINE>
> This is a quote
>
> As well as this
>
> > Quoteception
> >
> > > Quoteception
> > >
> > > > Quoteception
<BLANKLINE>
0. Test
1. Test
2. Test
<BLANKLINE>
- Test
- Test
- Test
<BLANKLINE>
Mailto links should be included ( [email protected] )
except where it would cause duplication such as [email protected]
But normal links should ( http://comanage.com )
""" | |
from __future__ import unicode_literals | |
import re
import sys

# The HTML parsing modules were renamed in Python 3. Try the Python 2 names
# first (the ones this module originally used) and fall back to the
# Python 3 locations so the module imports on either interpreter.
try:
    from HTMLParser import HTMLParser
    from htmlentitydefs import name2codepoint
except ImportError:
    from html.parser import HTMLParser
    from html.entities import name2codepoint

if sys.version_info < (3, 0, 0):
    # Python 2: make str() produce unicode text throughout this module.
    str = unicode
else:
    # Python 3 has no unichr(); chr() fills the same role there.
    unichr = chr
class HTMLElement(list):
    """A parsed HTML node: a list of children plus a tag name and attributes.

    Children are whatever was passed as ``contents`` or appended later
    (nested elements or plain strings). ``str(element)`` renders the node
    with the formatter registered for its tag, or ``format_plain`` when no
    formatter is registered.
    """

    # Maps a tag name to the callable used to render elements with that tag.
    # The dict is defined on the class, so registrations are shared.
    formatters = {}

    @classmethod
    def formats(cls, tag):
        """Return a decorator registering a function as the formatter for *tag*."""
        def register(renderer):
            cls.formatters[tag] = renderer
            return renderer
        return register

    def __init__(self, tag, attrs=None, contents=()):
        """Create an element.

        tag: the tag name used to look up a formatter.
        attrs: iterable of ``(name, value)`` pairs; a falsy value becomes ``[]``.
        contents: initial children of the element.
        """
        super(HTMLElement, self).__init__(contents)
        self.tag = tag
        self.attrs = attrs if attrs else []

    @staticmethod
    def format_plain(element):
        """Concatenate the string form of every child, with no separator."""
        return ''.join(map(str, element))

    def __repr__(self):
        """Return an HTML-like debugging representation of the element."""
        if not self.tag:
            # Tag-less nodes are shown as their children joined by spaces.
            return ' '.join(self)

        attr_text = ''
        if self.attrs:
            pairs = ['{}="{}"'.format(key, value) for key, value in self.attrs]
            attr_text = ' ' + ' '.join(pairs)

        if not self:
            # No children: use the self-closing form.
            return '<{tag}{attrs}/>'.format(tag=self.tag, attrs=attr_text)

        children = '\n'.join(repr(child) for child in self)
        return "<{tag}{attrs}>\n{body}\n</{tag}>".format(
            tag=self.tag, attrs=attr_text, body=children)

    def __str__(self):
        """Render the element using the formatter registered for its tag."""
        render = self.formatters.get(self.tag, HTMLElement.format_plain)
        return render(self)
@HTMLElement.formats('p')
def p(e):
    """Render a paragraph: the children's text, trimmed, between newlines."""
    inner = ''.join(map(str, e))
    return '\n' + inner.strip() + '\n'
@HTMLElement.formats('blockquote')
def blockquote(e):
    """Render a quote: prefix every line of the trimmed content with '> '.

    Nested quotes are rendered by the same formatter first, so each level
    of nesting adds one more '> ' prefix.
    """
    inner = ''.join(map(str, e)).strip()
    quoted_lines = ['> ' + line for line in inner.split("\n")]
    return '\n' + '\n'.join(quoted_lines) + '\n'
@HTMLElement.formats('a')
def a(e):
    """Render a link as its text followed by the target in parentheses.

    ``mailto:`` targets are shown without the scheme. The parenthesised
    target is omitted when the anchor has no usable ``href`` or when the
    target already appears in the link text, to avoid duplication.

    e: the anchor element; ``e.attrs`` is scanned for the first ``href``.
    Returns the rendered text for the link.
    """
    # A valueless attribute (``<a href>``) is reported with a None value, and
    # an anchor may have no href at all; treat both as "no target".
    href = (next((v for k, v in e.attrs if k == 'href'), None) or '').strip()
    text = HTMLElement.format_plain(e)

    # Require the full "mailto:" scheme prefix so that an href which merely
    # starts with the letters "mailto" cannot break the scheme removal.
    scheme = 'mailto:'
    if href[:len(scheme)].lower() == scheme:
        href = href[len(scheme):].strip()

    if not href or href in text:
        return text.strip()
    return text + ' ( {} )'.format(href)
@HTMLElement.formats('ol')
def ol(e):
    """Render an ordered list, one numbered line per child element.

    Only children that are ``HTMLElement`` instances are listed; numbering
    comes from ``enumerate`` and therefore starts at 0.
    """
    items = [child for child in e if isinstance(child, HTMLElement)]
    lines = ['\n{}. {}'.format(index, item) for index, item in enumerate(items)]
    return ''.join(lines) + '\n'
@HTMLElement.formats('ul')
def ul(e):
    """Render an unordered list, one '- ' bullet line per child element.

    Children that are not ``HTMLElement`` instances are skipped.
    """
    bullets = []
    for child in e:
        if isinstance(child, HTMLElement):
            bullets.append('\n- {}'.format(child))
    return ''.join(bullets) + '\n'
@HTMLElement.formats('br')
def br(e):
    """Render a line break as a single newline; the element itself is unused."""
    newline = '\n'
    return newline
@HTMLElement.formats('script')
@HTMLElement.formats('style')
def null(e):
    """Render nothing: script and style contents are left out of the text."""
    nothing = ''
    return nothing
@HTMLElement.formats('span')
@HTMLElement.formats(None)
def text(e):
    """Render text data (tag ``None``) or a span as whitespace-normalised text.

    The children are joined with single spaces, then every run of
    whitespace (including newlines) is collapsed to one space.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence that
    # newer Python versions warn about; the pattern itself is unchanged.
    return re.sub(r'\s+', ' ', ' '.join(str(s) for s in e))
# unichr() exists only on Python 2; chr() is the Python 3 equivalent.
try:
    _unichr = unichr
except NameError:
    _unichr = chr


class HTMLToTextParser(HTMLParser):
    """HTMLParser subclass that builds a tree of elements from the input.

    Every start tag creates an ``element_class`` instance that is appended
    to the currently open element; ``get_text`` renders the finished tree
    by converting the root element to a string.
    """

    # HTML void elements never have an end tag, so they are added as leaves
    # instead of being pushed on the stack of open elements.
    VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self, element_class=HTMLElement):
        """Create a parser whose nodes are instances of *element_class*.

        element_class: called as ``element_class(tag, attrs)`` for tags and
            ``element_class(None, [], [data])`` for text.
        """
        HTMLParser.__init__(self)
        self.element_class = element_class
        self.root = self.element_class("doc", [])
        # Open elements, outermost first; the root stays at index 0.
        self.stack = [self.root]

    @property
    def top(self):
        """The innermost element that is currently open."""
        return self.stack[-1]

    def handle_starttag(self, tag, attrs):
        """Open a new element, or add a leaf when the tag is a void element."""
        # Void tags written without a trailing slash (e.g. ``<br>``) have no
        # end tag; pushing them would swallow all following content.
        if tag in self.VOID_TAGS:
            return self.handle_startendtag(tag, attrs)
        new_elem = self.element_class(tag, attrs)
        self.top.append(new_elem)
        self.stack.append(new_elem)

    def handle_startendtag(self, tag, attrs):
        """Add a self-closing element as a leaf of the current element."""
        self.top.append(self.element_class(tag, attrs))

    def handle_endtag(self, tag):
        """Close the innermost open element named *tag*.

        Elements left open inside it are closed as well. An end tag with no
        matching open element is ignored, so the root is never popped.
        """
        # Search from the innermost element outwards; index 0 is the root.
        for depth in range(len(self.stack) - 1, 0, -1):
            if getattr(self.stack[depth], 'tag', None) == tag:
                del self.stack[depth:]
                return

    def handle_data(self, data):
        """Add text to the current element, skipping whitespace-only data."""
        if data.strip():
            self.top.append(self.element_class(None, [], [data]))

    def handle_entityref(self, name):
        """Add the character for a named entity such as ``&amp;``.

        Unknown names are kept as literal text rather than dropped. Only the
        name is passed in, so no ';' is re-added after it.
        """
        if name in name2codepoint:
            self.top.append(_unichr(name2codepoint[name]))
        else:
            self.top.append('&' + name)

    def handle_charref(self, name):
        """Add the character for a numeric reference (``&#65;`` / ``&#x41;``).

        A reference that cannot be converted to a character is kept as
        literal text instead of raising.
        """
        try:
            # The hexadecimal marker may be written as 'x' or 'X'.
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            char = _unichr(codepoint)
        except (ValueError, OverflowError):
            char = '&#' + name + ';'
        self.top.append(char)

    def get_text(self):
        """Return the rendered text of everything parsed so far, trimmed."""
        return str(self.root).strip()
def html_to_text(html):
    """Convert an HTML string to formatted plain text.

    html: the HTML markup to convert.
    Returns the text rendered from the parsed element tree, with leading
    and trailing whitespace removed.
    """
    parser = HTMLToTextParser(HTMLElement)
    parser.feed(html)
    try:
        # feed() can hold back trailing input it considers incomplete (for
        # example text ending in "AT&T"); close() makes the parser process it.
        parser.close()
    except Exception:
        # Python 2's parser may raise (HTMLParseError) when the input ends in
        # the middle of a construct. Keep the original best-effort behaviour
        # and return what was parsed up to that point.
        pass
    return parser.get_text()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment