Created
September 11, 2012 11:35
-
-
Save tejastank/3697749 to your computer and use it in GitHub Desktop.
python HTML2TEXT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
try:  # Python 3
    from html.parser import HTMLParser
except ImportError:  # Python 2
    from HTMLParser import HTMLParser


class HTML2TEXTStripper(HTMLParser):
    """Collect the text content of an HTML document and discard the markup.

    Feed markup with ``feed()`` and read the accumulated text back with
    ``get_data()``. Works on both Python 2 and Python 3.
    """

    # The only named entities that get decoded. Any other ``&name`` is kept
    # verbatim; presumably it is a bare ampersand inside a URL query string
    # (``...?a=1&track=2``) rather than a real entity, and decoding it would
    # corrupt the link text.
    _KNOWN_ENTITIES = {
        'gt': u'>',
        'lt': u'<',
        'nbsp': u'\xa0',
        'amp': u'&',
        'quot': u'"',
        'apos': u"'",
    }

    def __init__(self):
        # Calling reset() alone skips the state the base initialiser sets up
        # (on Python 3 that includes ``convert_charrefs``, which feed()
        # reads). The base initialiser calls reset() itself.
        HTMLParser.__init__(self)
        # Text fragments in document order.
        self.fed = []

    def handle_data(self, d):
        """Record a run of character data ``d`` found between tags."""
        self.fed.append(d)

    def handle_entityref(self, name):
        """Record the text for a named reference ``&name``.

        ``name`` is the entity name without the leading ``&`` or trailing
        ``;``. Whitelisted entities are decoded to their character; anything
        else is written back as ``&name`` unchanged.
        """
        # A lookup table replaces self.unescape(), which newer Python
        # releases no longer provide.
        self.fed.append(self._KNOWN_ENTITIES.get(name, '&%s' % name))

    def handle_charref(self, name):
        """Record the character for a numeric reference such as ``&#65;``.

        ``name`` is the reference without ``&#`` and ``;``: decimal digits,
        or ``x``/``X`` followed by hexadecimal digits. A reference that
        cannot be converted to a character is kept as literal text instead
        of being dropped.
        """
        try:
            to_char = unichr  # Python 2
        except NameError:
            to_char = chr  # Python 3
        try:
            if name[:1] in ('x', 'X'):
                code = int(name[1:], 16)
            else:
                code = int(name, 10)
            text = to_char(code)
        except (ValueError, OverflowError):
            text = '&#%s;' % name
        self.fed.append(text)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return the text that HTML2TEXTStripper collects from ``html``.

    A fresh parser is created for every call, so no state is shared
    between invocations.
    """
    # NOTE(review): the parser is never close()d; input that ends in an
    # unfinished construct may leave text in the parser's buffer - confirm.
    stripper = HTML2TEXTStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text
# Sample HTML e-mail used to exercise strip_tags(). It contains bare
# ampersands in link text and a trailing lone "&".
value = """ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<script></script>
<title>Our new summer product line!</title>
</head>
<body><p>Hello Cesar Pickney,</p>
<p><a href="http://demo.sugarondemand.com/joey_vert/seed1/template60/index.php?entryPoint=campaign_trackerv2&track=ab8dc525-bc09-d94f-3dac-4bb58b43d594&identifier=62038828-d46d-6f2e-fc82-4c33f76d13bd">http://demo.sugarondemand.com/joey_vert/seed1/template60/index.php?entryPoint=campaign_trackerv2&track=ab8dc525-bc09-d94f-3dac-4bb58b43d594&identifier=62038828-d46d-6f2e-fc82-4c33f76d13bd </a></p>
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
<p>- Sincerely</p>
<p>SugarCRM</p>
<p> </p>
<p><a href="http://demo.sugarondemand.com/joey_vert/seed1/template60/index.php?entryPoint=removeme&identifier=62038828-d46d-6f2e-fc82-4c33f76d13bd"> http://demo.sugarondemand.com/joey_vert/seed1/template60/index.php?entryPoint=removeme&identifier=62038828-d46d-6f2e-fc82-4c33f76d13bd </a></p>
<p> </p>
<p> </p><br><IMG HEIGHT='1' WIDTH='1' src='http://demo.sugarondemand.com/joey_vert/seed1/template60/index.php?entryPoint=image&identifier=62038828-d46d-6f2e-fc82-4c33f76d13bd'></body></html>
<table><tr><td>3333333333333333</td></tr></table> &
"""

# The parenthesised single-argument form prints identically on Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3.
print(strip_tags(value))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment