bradmontgomery · April 1, 2019 18:30
diff --git a/stripperparser.py b/stripperparser.py
 import re
 try:
    from HTMLParser import HTMLParser  # python 2
 except ImportError:
    from html.parser import HTMLParser  # python 3


 class HTMLStripperParser(HTMLParser):
    """Parser that discards all HTML markup and keeps only the text.

    Every chunk of text handed to ``handle_data`` is appended to
    ``self.content``; ``get_content`` returns that text with each run of
    whitespace collapsed to a single space.

    >>> parser = HTMLStripperParser()
    >>> parser.feed("<p>Hello   <b>world</b></p>")
    >>> parser.get_content()
    'Hello world'

    """

    # Class-level default so get_content() also works when no text was
    # ever collected (empty or markup-only input) instead of raising
    # AttributeError. Strings are immutable, so ``+=`` in handle_data
    # rebinds an instance attribute and never touches this default.
    content = ""

    def get_content(self):
        """Return the collected text with whitespace runs collapsed."""
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        return re.sub(r"\s+", " ", self.content)

    def handle_data(self, data):
        """Append one chunk of document text to ``self.content``."""
        self.content += data


 class AttributeStripper(HTMLParser):
    """A parser that re-emits a document with element attributes removed.

    Usage:

        >>> html = open('index.html').read()
        >>> parser = AttributeStripper()
        >>> parser.feed(html)
        >>> parser.close()
        >>> parser.write('new_file.html')

    If you want to leave some tags as-is (say <a> elements), you can specify
    the `keep_tags` argument:

        >>> parser = AttributeStripper(keep_tags=['a'])

    If you want to keep some attributes, specify them with the `keep_attrs`
    argument:

        >>> parser = AttributeStripper(keep_attrs=['href'])

    If you want to drop some tags from the output entirely, you can specify
    those with `omit_tags`:

        >>> parser = AttributeStripper(omit_tags=['span'])

    The above example removes every <span> start and end tag. Text found
    between omitted tags is still emitted; only the tags are dropped.

    """

    def __init__(self, *args, **kwargs):
        """Pop the three filter options; forward the rest to HTMLParser.

        Keyword options (each defaults to an empty list; ``None`` is
        treated the same as an empty list):

        keep_attrs -- attribute names preserved on every emitted tag.
        keep_tags -- tag names emitted with all their attributes.
        omit_tags -- tag names whose start/end tags are not emitted.
        """
        # Output fragments (tags and text) in document order.
        self.elements = []
        self.keep_attrs = kwargs.pop('keep_attrs', None) or []
        self.keep_tags = kwargs.pop('keep_tags', None) or []
        self.omit_tags = kwargs.pop('omit_tags', None) or []
        super().__init__(*args, **kwargs)

    def handle_endtag(self, tag):
        """Emit the closing tag unless the tag is in ``omit_tags``."""
        if tag not in self.omit_tags:
            self.elements.append("</{0}>".format(tag))

    def _attrstring(self, attrs):
        """given a list of [(attr, value), ...], return a string of the
        format: attr="value".

        An attribute whose value is None (written without a value in the
        markup, e.g. ``disabled``) is rendered as the bare name.
        """
        parts = []
        for attr, val in attrs:
            if val is None:
                parts.append(attr)
            else:
                parts.append('{0}="{1}"'.format(attr, val))
        return " ".join(parts)

    def _starttag(self, tag, attrs):
        """Render an opening tag, without a stray space when attrs is empty."""
        rendered = self._attrstring(attrs)
        if rendered:
            return "<{0} {1}>".format(tag, rendered)
        return "<{0}>".format(tag)

    def handle_starttag(self, tag, attrs):
        """Emit the opening tag once, filtered according to the options."""
        if tag in self.keep_tags:
            # we want to keep this one as-is
            self.elements.append(self._starttag(tag, attrs))
        elif tag not in self.omit_tags:
            # ``elif``: a kept tag must not be emitted a second time in its
            # stripped form.
            kept = [
                (attr, value) for attr, value in attrs
                if attr in self.keep_attrs
            ]
            self.elements.append(self._starttag(tag, kept))

    def handle_data(self, data):
        """Emit text with surrounding whitespace removed.

        Whitespace-only chunks are skipped so they do not turn into blank
        lines in the output.
        """
        text = data.strip()
        if text:
            self.elements.append(text)

    @property
    def cleaned_content(self):
        """All emitted fragments joined with newlines."""
        return "\n".join(self.elements)

    def write(self, filename):
        """Write ``cleaned_content`` to ``filename``, replacing the file."""
        with open(filename, "w") as f:
            f.write(self.cleaned_content)


 def strip_attributes(input_file, output_file,
                     keep_tags=None, keep_attrs=None, omit_tags=None):
    """Convenience wrapper around the AttributeStripper class.

    Reads the HTML in ``input_file``, feeds it through an
    ``AttributeStripper`` built with the given options, and writes the
    result to ``output_file``.

    input_file -- path of the HTML file to read.
    output_file -- path of the file to write.
    keep_tags, keep_attrs, omit_tags -- passed to ``AttributeStripper``;
        ``None`` means an empty list.
    """
    parser = AttributeStripper(
        keep_tags=keep_tags or [],
        keep_attrs=keep_attrs or [],
        omit_tags=omit_tags or [],
    )
    # Context manager so the input handle is closed promptly, not leaked.
    with open(input_file) as source:
        parser.feed(source.read())
    # feed() may hold back text that follows the last tag; close() forces
    # the parser to process whatever is still buffered.
    parser.close()
    parser.write(output_file)


 if __name__ == "__main__":

    from sys import argv

    # Exactly two arguments are expected: the input and the output path.
    if len(argv) != 3:
        print("\nUSAGE: python stripperparser <input> <output>\n")
    else:
        input_file, output_file = argv[1:]

        # Options suited to HTML exported from Word (docx).
        strip_attributes(
            input_file,
            output_file,
            keep_tags=['body', 'p', 'a'],
            keep_attrs=[],
            omit_tags=['meta', 'link', 'style', 'div', 'span'],
        )

        print(
            "DONE! You may now want to run:\n\ttidy -o {} {}".format(
                output_file, output_file
            )
        )
	import re
	try:
	from HTMLParser import HTMLParser # python 2
	except ImportError:
	from html.parser import HTMLParser # python 3


	class HTMLStripperParser(HTMLParser):
	    """Parser that discards all HTML markup and keeps only the text.

	    Every chunk of text handed to ``handle_data`` is appended to
	    ``self.content``; ``get_content`` returns that text with each run of
	    whitespace collapsed to a single space.

	    >>> parser = HTMLStripperParser()
	    >>> parser.feed("<p>Hello   <b>world</b></p>")
	    >>> parser.get_content()
	    'Hello world'

	    """

	    # Class-level default so get_content() also works when no text was
	    # ever collected (empty or markup-only input) instead of raising
	    # AttributeError. Strings are immutable, so ``+=`` in handle_data
	    # rebinds an instance attribute and never touches this default.
	    content = ""

	    def get_content(self):
	        """Return the collected text with whitespace runs collapsed."""
	        # Raw string: "\s" in a plain literal is an invalid escape sequence.
	        return re.sub(r"\s+", " ", self.content)

	    def handle_data(self, data):
	        """Append one chunk of document text to ``self.content``."""
	        self.content += data


	class AttributeStripper(HTMLParser):
	    """A parser that re-emits a document with element attributes removed.

	    Usage:

	        >>> html = open('index.html').read()
	        >>> parser = AttributeStripper()
	        >>> parser.feed(html)
	        >>> parser.close()
	        >>> parser.write('new_file.html')

	    If you want to leave some tags as-is (say <a> elements), you can specify
	    the `keep_tags` argument:

	        >>> parser = AttributeStripper(keep_tags=['a'])

	    If you want to keep some attributes, specify them with the `keep_attrs`
	    argument:

	        >>> parser = AttributeStripper(keep_attrs=['href'])

	    If you want to drop some tags from the output entirely, you can specify
	    those with `omit_tags`:

	        >>> parser = AttributeStripper(omit_tags=['span'])

	    The above example removes every <span> start and end tag. Text found
	    between omitted tags is still emitted; only the tags are dropped.

	    """

	    def __init__(self, *args, **kwargs):
	        """Pop the three filter options; forward the rest to HTMLParser.

	        Keyword options (each defaults to an empty list; ``None`` is
	        treated the same as an empty list):

	        keep_attrs -- attribute names preserved on every emitted tag.
	        keep_tags -- tag names emitted with all their attributes.
	        omit_tags -- tag names whose start/end tags are not emitted.
	        """
	        # Output fragments (tags and text) in document order.
	        self.elements = []
	        self.keep_attrs = kwargs.pop('keep_attrs', None) or []
	        self.keep_tags = kwargs.pop('keep_tags', None) or []
	        self.omit_tags = kwargs.pop('omit_tags', None) or []
	        super().__init__(*args, **kwargs)

	    def handle_endtag(self, tag):
	        """Emit the closing tag unless the tag is in ``omit_tags``."""
	        if tag not in self.omit_tags:
	            self.elements.append("</{0}>".format(tag))

	    def _attrstring(self, attrs):
	        """given a list of [(attr, value), ...], return a string of the
	        format: attr="value".

	        An attribute whose value is None (written without a value in the
	        markup, e.g. ``disabled``) is rendered as the bare name.
	        """
	        parts = []
	        for attr, val in attrs:
	            if val is None:
	                parts.append(attr)
	            else:
	                parts.append('{0}="{1}"'.format(attr, val))
	        return " ".join(parts)

	    def _starttag(self, tag, attrs):
	        """Render an opening tag, without a stray space when attrs is empty."""
	        rendered = self._attrstring(attrs)
	        if rendered:
	            return "<{0} {1}>".format(tag, rendered)
	        return "<{0}>".format(tag)

	    def handle_starttag(self, tag, attrs):
	        """Emit the opening tag once, filtered according to the options."""
	        if tag in self.keep_tags:
	            # we want to keep this one as-is
	            self.elements.append(self._starttag(tag, attrs))
	        elif tag not in self.omit_tags:
	            # ``elif``: a kept tag must not be emitted a second time in its
	            # stripped form.
	            kept = [
	                (attr, value) for attr, value in attrs
	                if attr in self.keep_attrs
	            ]
	            self.elements.append(self._starttag(tag, kept))

	    def handle_data(self, data):
	        """Emit text with surrounding whitespace removed.

	        Whitespace-only chunks are skipped so they do not turn into blank
	        lines in the output.
	        """
	        text = data.strip()
	        if text:
	            self.elements.append(text)

	    @property
	    def cleaned_content(self):
	        """All emitted fragments joined with newlines."""
	        return "\n".join(self.elements)

	    def write(self, filename):
	        """Write ``cleaned_content`` to ``filename``, replacing the file."""
	        with open(filename, "w") as f:
	            f.write(self.cleaned_content)


	def strip_attributes(input_file, output_file,
	                     keep_tags=None, keep_attrs=None, omit_tags=None):
	    """Convenience wrapper around the AttributeStripper class.

	    Reads the HTML in ``input_file``, feeds it through an
	    ``AttributeStripper`` built with the given options, and writes the
	    result to ``output_file``.

	    input_file -- path of the HTML file to read.
	    output_file -- path of the file to write.
	    keep_tags, keep_attrs, omit_tags -- passed to ``AttributeStripper``;
	        ``None`` means an empty list.
	    """
	    parser = AttributeStripper(
	        keep_tags=keep_tags or [],
	        keep_attrs=keep_attrs or [],
	        omit_tags=omit_tags or [],
	    )
	    # Context manager so the input handle is closed promptly, not leaked.
	    with open(input_file) as source:
	        parser.feed(source.read())
	    # feed() may hold back text that follows the last tag; close() forces
	    # the parser to process whatever is still buffered.
	    parser.close()
	    parser.write(output_file)


	if __name__ == "__main__":

	    from sys import argv

	    # Exactly two arguments are expected: the input and the output path.
	    if len(argv) != 3:
	        print("\nUSAGE: python stripperparser <input> <output>\n")
	    else:
	        input_file, output_file = argv[1:]

	        # Options suited to HTML exported from Word (docx).
	        strip_attributes(
	            input_file,
	            output_file,
	            keep_tags=['body', 'p', 'a'],
	            keep_attrs=[],
	            omit_tags=['meta', 'link', 'style', 'div', 'span'],
	        )

	        print(
	            "DONE! You may now want to run:\n\ttidy -o {} {}".format(
	                output_file, output_file
	            )
	        )