Skip to content

Instantly share code, notes, and snippets.

@jrial
Forked from racitup/html_to_text.py
Last active April 18, 2018 13:04
Show Gist options
  • Save jrial/bed3cd6fa2806af3a048a74208963d63 to your computer and use it in GitHub Desktop.
Save jrial/bed3cd6fa2806af3a048a74208963d63 to your computer and use it in GitHub Desktop.
Extract text from HTML in Python using BeautifulSoup4
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
from bs4 import BeautifulSoup, NavigableString, Tag
# Tags whose text content is never displayed (list taken from the W3C specs).
# 'noscript' is included because we behave as if scripting is enabled.
_NON_DISPLAYED_TAGS = frozenset((
    'area', 'base', 'basefont', 'datalist', 'head', 'link',
    'meta', 'noembed', 'noframes', 'param', 'rp', 'script',
    'source', 'style', 'template', 'track', 'title', 'noscript',
))
# h1, h2 etc...
_HEADING_RE = re.compile(r'^h\d+$')
# No space is inserted between one of these characters and a following link.
_OPENING_PUNCTUATION = """([{"'`"""
# No space is inserted between a link and text starting with one of these.
_CLOSING_PUNCTUATION = """,.!?;:)]}"'`"""


def _is_hidden(element):
    """Return True if any ancestor tag of *element* is a non-displayed one."""
    for parent in element.parents:
        # Exact type check on purpose: only plain Tag ancestors are inspected,
        # subclasses (such as the BeautifulSoup root object) are skipped.
        # pylint: disable=C0123
        if type(parent) is not Tag:
            continue
        if (parent.name in _NON_DISPLAYED_TAGS or
                parent.has_attr('hidden') or
                (parent.name == 'input' and parent.get('type') == 'hidden')):
            return True
    return False


def html_to_text(html):
    """Create a formatted plain-text message from a rendered html page.

    The markup is parsed with BeautifulSoup's 'html.parser'. Every visible
    text node becomes one output entry with its whitespace collapsed:

    * text inside non-displayed tags, ``hidden`` elements and hidden inputs
      is dropped, as are comments, doctypes and other special strings;
    * text directly inside an ``<a>`` tag is rendered as a Markdown link,
      ``[text](href)``, and is glued to the text around it on the same
      entry (without a space next to brackets, quotes and punctuation);
    * the first entry emitted after a ``<p>`` or heading tag gets an extra
      leading newline, except at the very beginning of the document.

    :param html: the html document (or fragment) to convert.
    :return: the extracted entries joined with newlines; an empty string
        when there is no visible text.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Ignore anything in head by starting at <body>. Fragments have no <body>
    # at all; walk the whole soup then (head content is filtered by the
    # non-displayed tag list anyway) instead of failing on None.
    root = soup.body if soup.body is not None else soup
    text = []
    add_newline = False
    for element in root.descendants:
        # We use type and not isinstance since comments, cdata, etc are
        # subclasses that we don't want
        # pylint: disable=C0123
        if type(element) is Tag:
            if element.name == 'p' or _HEADING_RE.match(element.name):
                # Mark as "requires leading newline": paragraphs, headings
                add_newline = True
            continue
        if type(element) is not NavigableString or _is_hidden(element):
            continue
        # remove any multiple and leading/trailing whitespace
        string = ' '.join(element.split())
        if not string:
            continue
        parent = element.parent
        if parent.name == 'a':
            # replace link text with the Markdown link; an anchor without an
            # href is rendered like one with an empty href
            string = u'[{}]({})'.format(string, parent.get('href', ''))
            # concatenate with any non-empty immediately previous string
            before_link = parent.previous_sibling
            if (text and type(before_link) is NavigableString and
                    before_link.strip()):
                glue = '' if text[-1][-1] in _OPENING_PUNCTUATION else ' '
                text[-1] += glue + string
                continue
        elif text and getattr(element.previous_sibling, 'name', None) == 'a':
            # Text right after a link continues the link's entry.
            # Don't put spaces before punctuation and similar stuff
            glue = '' if string[0] in _CLOSING_PUNCTUATION else ' '
            text[-1] += glue + string
            continue
        if add_newline and text:
            # Add extra paragraph/heading formatting newline, except
            # at the very beginning of the document.
            string = '\n' + string
        # The pending paragraph/heading break is consumed by this entry.
        add_newline = False
        text.append(string)
    return '\n'.join(text)
if __name__ == '__main__':
    # Manual smoke test: convert a sample email-style page that exercises
    # hidden content, line breaks and links in various positions, then print
    # the resulting text.
    sample_html = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Hello World!</title>
</head>
<body style="margin:0; padding:0; background-color:#F2F2F2;">
<!--[if !mso]><!-- -->
<img style="min-width:640px; display:block; margin:0; padding:0" class="mobileOff" width="640" height="1" src="/static/spacer.gif">
<!--<![endif]-->
<center>
<table width="100%" border="0" cellpadding="0" cellspacing="0" bgcolor="#F2F2F2">
<tr>
<td align="center" class="mobile" style="font-family:arial, sans-serif; font-size:20px; line-height:26px; font-weight:bold;">
This is some title text.
</td>
</tr>
<script>This is a script</script>
<tr>
<td align="center" class="mobile" style="font-family:arial, sans-serif; font-size:20px; line-height:26px; font-weight:bold;">
<p> Paragraph without
link <br> But with a
line break </p>
</td>
</tr>
<tr>
<td align="center" class="mobile" style="font-family:arial, sans-serif; font-size:20px; line-height:26px; font-weight:bold;">
<a href="http://www.dummy-domain.co.wibble/button-link/">This is a button link &gt;</a>
</td>
</tr>
<style type="text/css">
/* CLIENT-SPECIFIC STYLES */
body, table, td, a { -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; }
table, td { mso-table-lspace: 0pt; mso-table-rspace: 0pt; }
img { -ms-interpolation-mode: bicubic; }
</style>
<script>This is a longer script with embedded tags:
'<p>Example embedded tag with <i class="fa fa-example">icon</i></p>'
</script>
<p hidden>Non-visible paragraph with <i class="fa fa-example">icon</i></p>
<noscript>This is a longer script with embedded tags:
<p>Example embedded text with <i class="fa fa-example">icon</i></p>
</noscript>
<form>
<input id="id_wibble" class="form-control" name="wibble" type="hidden" placeholder="Something here">
<input id="id_email" class="form-control" name="email" type="email" placeholder="Your email address">
</form>
<tr>
<td align="center" class="mobile" style="font-family:arial, sans-serif; font-size:20px; line-height:26px; font-weight:bold;">
<p>Paragraph with embedded link <a href="http://www.dummy-domain.co.wibble/paragraph-link/">This is a link &gt;</a>
and this is a continuation of the paragraph with the link.</p>
</td>
</tr>
<tr>
<td align="center" class="mobile" style="font-family:arial, sans-serif; font-size:20px; line-height:26px; font-weight:bold;">
Some text with link: <a href="http://www.dummy-domain.co.wibble/text-link/">This is a link &gt;</a>
And some text after the link.<br>
Try an empty embedded link<a href="">This is a link &gt;</a>before this text.<br>
Lots of brs:<br><br><br>
after brs
</td>
</tr>
<tr>
<td>
<p><a href="https://www.dummy-domain.co.wibble/">This is a link that starts a paragraph</a>, and this is the paragraph's continuation.</p>
</td>
</tr>
<tr>
<td>
<p>
Let's try some <a href="#punctuation">links</a>, followed by
punctuation {<a href="#braces">or between braces</a>},
(<a href="#brackets">brackets</a>),
&quot;<a href="#quotes">inside quotes</a>&quot;,
`<a href="#backticks">backticks</a>`
and observe correct text flow.
</td>
</tr>
</table>
</center>
</body>
</html>
"""
    rendered_text = html_to_text(sample_html)
    print(rendered_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment