-
-
Save jrial/bed3cd6fa2806af3a048a74208963d63 to your computer and use it in GitHub Desktop.
Extract text from html in python using BeautifulSoup4
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import re | |
from bs4 import BeautifulSoup, NavigableString, Tag | |
# Tags whose text content is never displayed (list taken from the W3C specs).
# 'noscript' is included because we behave as if scripting is enabled.
_NON_DISPLAYED_TAGS = frozenset((
    'area', 'base', 'basefont', 'datalist', 'head', 'link', 'meta', 'noembed',
    'noframes', 'param', 'rp', 'script', 'source', 'style', 'template',
    'track', 'title', 'noscript',
))

# h1, h2 etc...  (raw string: '\d' in a plain literal is an invalid escape)
_HEADING_RE = re.compile(r'^h\d+$')

# A link directly following one of these is glued on without a space.
_OPENING_PUNCTUATION = """([{"'`"""

# Text starting with one of these is glued onto a preceding link without a space.
_CLOSING_PUNCTUATION = """,.!?;:)]}"'`"""


def _is_hidden(element):
    """Returns True if any ancestor tag of element prevents it from being displayed"""
    # We use type and not isinstance so that the BeautifulSoup document object
    # (a Tag subclass) is skipped, as it carries no display semantics.
    # pylint: disable=C0123
    for parent_tag in element.parents:
        if type(parent_tag) != Tag:
            continue
        if (parent_tag.name in _NON_DISPLAYED_TAGS or
                parent_tag.has_attr('hidden') or
                (parent_tag.name == 'input' and parent_tag.get('type') == 'hidden')):
            return True
    return False


def html_to_text(html):
    """Creates a formatted text email message as a string from a rendered html template (page)

    Visible text nodes are whitespace-normalised and emitted one per line.
    Text starting a paragraph or heading gets an extra leading newline (except
    at the very beginning of the document), and link text is rewritten as a
    Markdown link ``[text](href)`` that is merged into the surrounding sentence.

    Args:
        html: The HTML markup to convert.  A full page or a fragment without
            a <body> element are both accepted.

    Returns:
        The extracted text as a single string; empty if nothing is visible.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Ignore anything in head.  Fragments have no <body>, in which case we walk
    # the whole document; head content is still dropped by _is_hidden().
    body = soup.body if soup.body is not None else soup
    text = []
    add_newline = False
    for element in body.descendants:
        # We use type and not isinstance since comments, cdata, etc are subclasses that we don't want
        # pylint: disable=C0123
        if type(element) == Tag:
            if element.name == 'p' or _HEADING_RE.match(element.name):
                # Mark as "requires leading newline": paragraphs, headings
                add_newline = True
            continue
        if type(element) != NavigableString or _is_hidden(element):
            continue
        # remove any multiple and leading/trailing whitespace
        string = ' '.join(element.string.split())
        if not string:
            continue
        if element.parent.name == 'a':
            a_tag = element.parent
            # replace link text with the Markdown link; anchors without an
            # href (e.g. named anchors) get an empty target instead of raising
            string = u'[{}]({})'.format(string, a_tag.get('href', ''))
            # concatenate with any non-empty immediately previous string
            previous = a_tag.previous_sibling
            if (text and type(previous) == NavigableString and
                    previous.string.strip()):
                separator = '' if text[-1][-1] in _OPENING_PUNCTUATION else ' '
                text[-1] += separator + string
                continue
        elif text and getattr(element.previous_sibling, 'name', None) == 'a':
            # Text right after a link continues the same line.  The `text`
            # guard covers a leading link that produced no output (hidden or
            # image-only), where there is nothing to append to.
            # Don't put spaces before punctuation and similar stuff
            separator = '' if string[0] in _CLOSING_PUNCTUATION else ' '
            text[-1] += separator + string
            continue
        if add_newline and text:
            # Add extra paragraph/heading formatting newline, except
            # at the very beginning of the document.
            string = '\n' + string
            add_newline = False
        text.append(string)
    return '\n'.join(text)
if __name__ == '__main__':
    # Manual demo: convert a sample HTML email and print the resulting text.
    # The sample covers the cases html_to_text() treats specially: text in
    # <head>, <script>, <style>, <noscript> and `hidden` elements, <br> line
    # breaks, a link on its own, links embedded in running text (including one
    # with an empty href), a link that starts a paragraph, and links wrapped
    # in braces, brackets, quotes and backticks.
    html = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Hello World!</title>
</head>
<body style="margin:0; padding:0; background-color:#F2F2F2;">
<!--[if !mso]><!-- -->
<img style="min-width:640px; display:block; margin:0; padding:0" class="mobileOff" width="640" height="1" src="/static/spacer.gif">
<!--<![endif]-->
<center>
<table width="100%" border="0" cellpadding="0" cellspacing="0" bgcolor="#F2F2F2">
<tr>
<td align="center" class="mobile" style="font-family:arial, sans-serif; font-size:20px; line-height:26px; font-weight:bold;">
This is some title text.
</td>
</tr>
<script>This is a script</script>
<tr>
<td align="center" class="mobile" style="font-family:arial, sans-serif; font-size:20px; line-height:26px; font-weight:bold;">
<p> Paragraph without
link <br> But with a
line break </p>
</td>
</tr>
<tr>
<td align="center" class="mobile" style="font-family:arial, sans-serif; font-size:20px; line-height:26px; font-weight:bold;">
<a href="http://www.dummy-domain.co.wibble/button-link/">This is a button link ></a>
</td>
</tr>
<style type="text/css">
/* CLIENT-SPECIFIC STYLES */
body, table, td, a { -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; }
table, td { mso-table-lspace: 0pt; mso-table-rspace: 0pt; }
img { -ms-interpolation-mode: bicubic; }
</style>
<script>This is a longer script with embedded tags:
'<p>Example embedded tag with <i class="fa fa-example">icon</i></p>'
</script>
<p hidden>Non-visible paragraph with <i class="fa fa-example">icon</i></p>
<noscript>This is a longer script with embedded tags:
<p>Example embedded text with <i class="fa fa-example">icon</i></p>
</noscript>
<form>
<input id="id_wibble" class="form-control" name="wibble" type="hidden" placeholder="Something here">
<input id="id_email" class="form-control" name="email" type="email" placeholder="Your email address">
</form>
<tr>
<td align="center" class="mobile" style="font-family:arial, sans-serif; font-size:20px; line-height:26px; font-weight:bold;">
<p>Paragraph with embedded link <a href="http://www.dummy-domain.co.wibble/paragraph-link/">This is a link ></a>
and this is a continuation of the paragraph with the link.</p>
</td>
</tr>
<tr>
<td align="center" class="mobile" style="font-family:arial, sans-serif; font-size:20px; line-height:26px; font-weight:bold;">
Some text with link: <a href="http://www.dummy-domain.co.wibble/text-link/">This is a link ></a>
And some text after the link.<br>
Try an empty embedded link<a href="">This is a link ></a>before this text.<br>
Lots of brs:<br><br><br>
after brs
</td>
</tr>
<tr>
<td>
<p><a href="https://www.dummy-domain.co.wibble/">This is a link that starts a paragraph</a>, and this is the paragraph's continuation.</p>
</td>
</tr>
<tr>
<td>
<p>
Let's try some <a href="#punctuation">links</a>, followed by
punctuation {<a href="#braces">or between braces</a>},
(<a href="#brackets">brackets</a>),
"<a href="#quotes">inside quotes</a>",
`<a href="#backticks">backticks</a>`
and observe correct text flow.
</td>
</tr>
</table>
</center>
</body>
</html>
"""
    # Print the converted text so the result can be checked by eye.
    print(html_to_text(html))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment