Last active
April 1, 2019 18:30
-
-
Save bradmontgomery/1065294 to your computer and use it in GitHub Desktop.
Quick & dirty HTML parsers: 1) Strip all the HTML out of a document, leaving just the document's content, 2) A way to remove all element attributes, leaving just clean markup.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
try: | |
from HTMLParser import HTMLParser # python 2 | |
except ImportError: | |
from html.parser import HTMLParser # python 3 | |
class HTMLStripperParser(HTMLParser):
    """Simple parser that removes all HTML tags from a document.

    Only the text passed to ``handle_data`` is kept; tags, comments and
    declarations are ignored.

    Usage:

    >>> parser = HTMLStripperParser()
    >>> parser.feed(string_with_html)
    >>> parser.get_content()

    Attributes:
        content: the raw text collected so far, in document order. It is
            an empty string until some text has been fed.
    """

    def __init__(self, *args, **kwargs):
        # Call the base initializer explicitly (rather than via ``super``)
        # so this also works with the Python 2 ``HTMLParser`` import
        # fallback used by this file.
        HTMLParser.__init__(self, *args, **kwargs)
        # Always defined, so ``get_content`` is safe to call even when the
        # document contained no text at all.
        self.content = ""

    def get_content(self):
        """Return the collected text with each run of whitespace
        collapsed to a single space."""
        return re.sub(r"\s+", " ", self.content)

    def handle_data(self, data):
        """Append a chunk of document text to ``content``."""
        self.content += data
class AttributeStripper(HTMLParser):
    """A parser that strips out element attributes.

    Usage:

    >>> html = open('index.html').read()
    >>> parser = AttributeStripper()
    >>> parser.feed(html)
    >>> parser.write('new_file.html')

    If you want to leave some tags as-is (say <a> elements), you can specify
    the `keep_tags` argument:

    >>> parser = AttributeStripper(keep_tags=['a'])

    If you want to keep some attributes, specify them with the `keep_attrs`
    argument:

    >>> parser = AttributeStripper(keep_attrs=['href'])

    If you want to drop the start and end tags of some elements from the
    output, you can specify those with `omit_tags`:

    >>> parser = AttributeStripper(omit_tags=['span'])

    The above example removes every <span> and </span> tag. Text found
    inside those elements is still written to the output.

    A tag listed in both `keep_tags` and `omit_tags` is kept.

    Attributes:
        elements: list of output fragments (start tags, end tags and
            stripped text chunks) in document order.
    """

    def __init__(self, *args, **kwargs):
        self.elements = []
        # ``or []`` also tolerates an explicit ``None`` for any option.
        self.keep_attrs = kwargs.pop('keep_attrs', None) or []
        self.keep_tags = kwargs.pop('keep_tags', None) or []
        self.omit_tags = kwargs.pop('omit_tags', None) or []
        # Explicit base call so the class also initializes under the
        # Python 2 ``HTMLParser`` import fallback used by this file.
        HTMLParser.__init__(self, *args, **kwargs)

    def _is_omitted(self, tag):
        """Return True when ``tag`` must not appear in the output."""
        return tag in self.omit_tags and tag not in self.keep_tags

    def _attrstring(self, attrs):
        """Given a list of [(attr, value), ...], return a string of the
        format: attr="value".

        Attributes without a value (``value is None``, e.g. ``disabled``)
        are written as the bare attribute name. ``&`` and ``"`` in values
        are escaped so the generated attribute stays well-formed.
        """
        parts = []
        for attr, val in attrs:
            if val is None:
                parts.append(attr)
            else:
                escaped = val.replace("&", "&amp;").replace('"', "&quot;")
                parts.append('{0}="{1}"'.format(attr, escaped))
        return " ".join(parts)

    def _render_starttag(self, tag, attrs, self_closing=False):
        """Return the markup for a start tag, or None if it is omitted."""
        if tag in self.keep_tags:
            # Keep this one as-is, with every attribute.
            kept = attrs
        elif tag in self.omit_tags:
            return None
        else:
            kept = [(attr, val) for attr, val in attrs
                    if attr in self.keep_attrs]
        pieces = [tag]
        attrstring = self._attrstring(kept)
        if attrstring:
            pieces.append(attrstring)
        if self_closing:
            pieces.append("/")
        return "<{0}>".format(" ".join(pieces))

    def handle_starttag(self, tag, attrs):
        """Record the start tag exactly once, stripped unless kept."""
        markup = self._render_starttag(tag, attrs)
        if markup is not None:
            self.elements.append(markup)

    def handle_startendtag(self, tag, attrs):
        """Record an XHTML-style empty tag (``<br/>``) as a single
        self-closing tag instead of a start tag plus a bogus end tag."""
        markup = self._render_starttag(tag, attrs, self_closing=True)
        if markup is not None:
            self.elements.append(markup)

    def handle_endtag(self, tag):
        """Record the end tag unless the element is omitted."""
        if not self._is_omitted(tag):
            self.elements.append("</{0}>".format(tag))

    def handle_data(self, data):
        """Record a text chunk with surrounding whitespace removed.

        Whitespace-only chunks are skipped so they do not become blank
        lines in the output. The text is stored as received from the
        parser; it is not re-escaped.
        """
        text = data.strip()
        if text:
            self.elements.append(text)

    @property
    def cleaned_content(self):
        """The collected fragments joined with newlines."""
        return "\n".join(self.elements)

    def write(self, filename):
        """Write ``cleaned_content`` to ``filename``, replacing any
        existing file."""
        with open(filename, "w") as f:
            f.write(self.cleaned_content)
def strip_attributes(input_file, output_file,
                     keep_tags=None, keep_attrs=None, omit_tags=None):
    """Convenience function for using the AttributeStripper class.

    Reads the HTML in ``input_file``, runs it through an
    ``AttributeStripper`` and writes the result to ``output_file``.

    Args:
        input_file: path of the HTML file to read.
        output_file: path of the file to write; it is overwritten.
        keep_tags: tag names passed to the parser as ``keep_tags``
            (``None`` means an empty list).
        keep_attrs: attribute names passed to the parser as ``keep_attrs``
            (``None`` means an empty list).
        omit_tags: tag names passed to the parser as ``omit_tags``
            (``None`` means an empty list).
    """
    parser = AttributeStripper(
        keep_tags=keep_tags or [],
        keep_attrs=keep_attrs or [],
        omit_tags=omit_tags or [],
    )
    # Use a context manager so the input file is closed promptly.
    with open(input_file) as source:
        parser.feed(source.read())
    # Signal end-of-input so any text still buffered by the parser is
    # processed before the output is written.
    parser.close()
    parser.write(output_file)
if __name__ == "__main__":
    from sys import argv

    # Expect exactly two arguments: the input path and the output path.
    if len(argv) != 3:
        print("\nUSAGE: python stripperparser <input> <output>\n")
    else:
        input_file, output_file = argv[1], argv[2]

        # Clean a Word (docx)-generated html file
        strip_attributes(
            input_file,
            output_file,
            keep_tags=['body', 'p', 'a'],
            keep_attrs=[],
            omit_tags=['meta', 'link', 'style', 'div', 'span'],
        )

        done_message = "DONE! You may now want to run:\n\ttidy -o {} {}".format(
            output_file, output_file)
        print(done_message)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment