Skip to content

Instantly share code, notes, and snippets.

@bradmontgomery
Last active April 1, 2019 18:30
Show Gist options
  • Save bradmontgomery/1065294 to your computer and use it in GitHub Desktop.
Save bradmontgomery/1065294 to your computer and use it in GitHub Desktop.
Quick & dirty HTML parsers: 1) Strip all the HTML out of a document, leaving just the document's content, 2) A way to remove all element attributes, leaving just clean markup.
import re
try:
from HTMLParser import HTMLParser # python 2
except ImportError:
from html.parser import HTMLParser # python 3
class HTMLStripperParser(HTMLParser):
    """Simple parser that removes all HTML tags from a document.

    Only the text handed to ``handle_data`` is kept; the point is to get
    just the data.

    >>> parser = HTMLStripperParser()
    >>> parser.feed(string_with_html)
    >>> parser.get_content()
    """

    # Class-level default so ``content`` always exists, even when no text
    # has been fed yet (``get_content`` then returns an empty string
    # instead of raising AttributeError).
    content = ""

    def get_content(self):
        """Return the collected text with every whitespace run collapsed
        to a single space.
        """
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        return re.sub(r"\s+", " ", self.content)

    def handle_data(self, data):
        """Append a chunk of text data to ``content``."""
        # The first += reads the class default and binds an instance attribute.
        self.content += data
class AttributeStripper(HTMLParser):
    """A parser that strips out all element attributes.

    Usage:

    >>> html = open('index.html').read()
    >>> parser = AttributeStripper()
    >>> parser.feed(html)
    >>> parser.write('new_file.html')

    If you want to leave some tags as-is (say <a> elements), you can specify
    the `keep_tags` argument:

    >>> parser = AttributeStripper(keep_tags=['a'])

    If you want to keep some attributes, specify them with the `keep_attrs`
    argument:

    >>> parser = AttributeStripper(keep_attrs=['href'])

    If you want to drop the start and end tags of some elements, you can
    specify those with `omit_tags`:

    >>> parser = AttributeStripper(omit_tags=['span'])

    The above example removes every <span> and </span> tag; the text that
    was inside them is still written to the output.

    A tag listed in both `keep_tags` and `omit_tags` is kept.
    """

    def __init__(self, *args, **kwargs):
        """Create the parser.

        Keyword-only options (all optional, ``None`` is treated as empty):
        ``keep_attrs``, ``keep_tags``, ``omit_tags``. Any remaining
        arguments are forwarded to ``HTMLParser``.
        """
        # Output fragments (tags and text), in document order.
        self.elements = []
        # Pop our own options so HTMLParser.__init__ only receives
        # arguments it understands.
        self.keep_attrs = kwargs.pop('keep_attrs', None) or []
        self.keep_tags = kwargs.pop('keep_tags', None) or []
        self.omit_tags = kwargs.pop('omit_tags', None) or []
        super().__init__(*args, **kwargs)

    def handle_endtag(self, tag):
        """Record the end tag unless the tag is omitted (and not kept)."""
        # Mirror handle_starttag: keep_tags takes precedence over omit_tags,
        # so a kept start tag always gets its matching end tag.
        if tag in self.keep_tags or tag not in self.omit_tags:
            self.elements.append("</{0}>".format(tag))

    def _attrstring(self, attrs):
        """given a list of [(attr, value), ...], return a string of the
        format: attr="value".

        Attributes whose value is ``None`` (e.g. ``<input disabled>``) are
        rendered as the bare attribute name. Values are HTML-escaped so
        that quotes and ampersands cannot break the generated markup.
        """
        from html import escape

        rendered = []
        for attr, val in attrs:
            if val is None:
                rendered.append(attr)
            else:
                rendered.append('{0}="{1}"'.format(attr, escape(val, quote=True)))
        return " ".join(rendered)

    def _starttag(self, tag, attrs):
        """Render a start tag; no separating space when there are no attributes."""
        attr_text = self._attrstring(attrs)
        if attr_text:
            return "<{0} {1}>".format(tag, attr_text)
        return "<{0}>".format(tag)

    def handle_starttag(self, tag, attrs):
        """Record the start tag, verbatim for `keep_tags`, otherwise with
        only the attributes named in `keep_attrs`; omitted tags are skipped.
        """
        if tag in self.keep_tags:
            # we want to keep this one as-is
            self.elements.append(self._starttag(tag, attrs))
        elif tag not in self.omit_tags:
            # `elif`, not `if`: a kept tag must not be emitted a second time.
            kept = [(attr, value) for attr, value in attrs
                    if attr in self.keep_attrs]
            self.elements.append(self._starttag(tag, kept))

    def handle_data(self, data):
        """Record text content, trimmed; whitespace-only chunks are skipped."""
        # Strip first so whitespace-only chunks don't add empty entries
        # (which would become blank lines in `cleaned_content`).
        text = data.strip()
        if text:
            self.elements.append(text)

    @property
    def cleaned_content(self):
        """All recorded fragments joined with newlines."""
        return "\n".join(self.elements)

    def write(self, filename):
        """Write `cleaned_content` to `filename`, replacing any existing file."""
        with open(filename, "w") as f:
            f.write(self.cleaned_content)
def strip_attributes(input_file, output_file,
                     keep_tags=None, keep_attrs=None, omit_tags=None):
    """Convenience wrapper around the AttributeStripper class.

    Reads the HTML in `input_file`, runs it through an AttributeStripper
    configured with `keep_tags`, `keep_attrs` and `omit_tags` (each
    defaults to an empty list), and writes the result to `output_file`.
    """
    parser = AttributeStripper(
        keep_tags=keep_tags or [],
        keep_attrs=keep_attrs or [],
        omit_tags=omit_tags or [],
    )
    # Context manager so the input handle is closed even if parsing fails.
    with open(input_file) as source:
        parser.feed(source.read())
    # Flush anything the parser is still buffering at end of input.
    parser.close()
    parser.write(output_file)
if __name__ == "__main__":
    # Command-line entry point: expects exactly an input and an output path.
    from sys import argv

    if len(argv) != 3:
        print("\nUSAGE: python stripperparser <input> <output>\n")
    else:
        input_file, output_file = argv[1:]

        # Clean a Word (docx)-generated html file
        strip_attributes(
            input_file,
            output_file,
            keep_tags=['body', 'p', 'a'],
            keep_attrs=[],
            omit_tags=['meta', 'link', 'style', 'div', 'span'],
        )

        done_message = "DONE! You may now want to run:\n\ttidy -o {} {}".format(
            output_file, output_file)
        print(done_message)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment