Last active
July 9, 2018 05:54
-
-
Save randomradio/153161b135707288714dffa01b0046db to your computer and use it in GitHub Desktop.
Google document cleanup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bs4 | |
import cssutils | |
import logging | |
import urlparse | |
class HTML_cleaner(object):
    """Clean up HTML exported from Google Docs using BeautifulSoup.

    Highlight (background-color) CSS classes are collected before styles are
    stripped, so highlighted spans keep their class names through cleanup.
    """

    def __init__(self, html_str, parser="html.parser"):
        """Parse html_str into a soup with the given bs4 parser."""
        self.soup = bs4.BeautifulSoup(html_str, parser)
        # selector text (e.g. '.c1') -> background-color value,
        # populated by collect_styles()
        self.highlighting_selectors = {}

    def clean_up(self, steps=None):
        """Clean up the soup with the given steps; return the cleaned soup.

        kwargs:
        steps -- names of cleanup methods to run in order; if empty or None,
                 the full default pipeline runs.
        """
        if not steps:
            self.collect_styles()
            self.remove_unused_classes(preserved=self.highlighting_selectors.keys())
            self.remove_empty_elements()
            self.fix_heading_strongs()
            self.unwrap_valina_spans()
            self.remove_link_redirects()
        else:
            for step in steps:
                # BUG FIX: the original assigned the unbound method object
                # (getattr(HTML_cleaner, step)) to self.soup instead of
                # calling the step on this instance.
                method = getattr(self, step, None)
                if method is None:
                    logging.warning("Step not found: %s", step)
                else:
                    method()
        return self.soup

    def collect_styles(self):
        """Collect highlight classes from stylesheets, then drop <style> tags.

        A "highlight" rule is a single-selector rule whose only property is
        background-color.  Returns a dict of selector text -> color value and
        stores it on self.highlighting_selectors.
        """
        highlighting_selectors = {}
        for style_tag in self.soup.find_all('style', type='text/css'):
            if not style_tag.string:
                continue
            sheet = cssutils.parseString(style_tag.string)
            for rule in sheet:
                if rule.type != cssutils.css.CSSRule.STYLE_RULE:
                    continue
                for selector in rule.selectorList:
                    selectors = selector.selectorText.split(',')
                    # only single-selector, single-property rules qualify
                    if len(selectors) == 1 and rule.style.length == 1:
                        prop_name = rule.style.keys()[0]
                        if prop_name == 'background-color':
                            hex_rule = rule.style.getProperty(prop_name)
                            highlighting_selectors[selectors[0]] = hex_rule.value
            style_tag.decompose()
        self.highlighting_selectors = highlighting_selectors
        return highlighting_selectors

    def remove_ids(self):
        # not implemented yet; kept so clean_up(steps=['remove_ids']) is valid
        pass

    def remove_unused_classes(self, preserved=None):
        """Drop the class attribute from tags with no preserved class.

        kwargs:
        preserved -- iterable of CSS selectors (e.g. '.c1') whose classes
                     must be kept; defaults to none.
        """
        preserved = set(preserved or [])
        for tag in self.soup.find_all(True):
            tag_classes = tag.attrs.get('class')
            if tag_classes is None:
                continue
            if not any('.%s' % name in preserved for name in tag_classes):
                del tag['class']

    def remove_empty_elements(self):
        """Remove every element that contains no visible text.

        NOTE(review): the original also tested ``el.is_empty_tag``, which is
        not a bs4 Tag attribute -- bs4 resolves it as ``el.find('is_empty_tag')``
        and returns None, so that branch never fired (it likely meant
        ``is_empty_element``).  The dead branch is removed; effective
        behavior is unchanged.
        """
        for el in self.soup.find_all():
            if not el.get_text().strip():
                el.extract()

    def unwrap_valina_spans(self):
        """Unwrap <span> tags that carry no class attribute.

        (The method name is a historical typo for "vanilla"; it is kept
        unchanged for compatibility with existing callers.)
        """
        for span in self.soup.find_all('span'):
            if 'class' not in span.attrs:
                span.unwrap()

    def remove_link_redirects(self):
        """Replace Google redirect links with their real target (?q=...)."""
        for a in self.soup.find_all('a'):
            href = a.get('href')
            # BUG FIX: anchors without an href crashed on None.startswith;
            # a redirect URL without a 'q' param raised KeyError.
            if not href or not href.startswith('https://www.google.com/url?q='):
                continue
            parsed_q = urlparse.parse_qs(urlparse.urlparse(href).query).get('q', [])
            if parsed_q:
                a['href'] = parsed_q[0]

    def fix_heading_strongs(self):
        """If a strong tag is a heading's only child, strip it to rely on the
        CSS heading style for consistent font weight.

        Doubly nested strong tags often appear, so repeat until a pass
        unwraps nothing.
        """
        headings = self.soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        stripped = True
        while stripped:
            stripped = False
            for heading in headings:
                if len(heading.contents) == 1 and heading.contents[0].name == 'strong':
                    heading.contents[0].unwrap()
                    stripped = True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bs4 | |
import cssutils | |
import urlparse | |
import logging | |
from HTMLParser import HTMLParser | |
ALLOWED_EMPTY_TAGS = ['td', 'br'] | |
class BaseSanitizer(object):
    """Strip Google Docs export cruft from raw HTML into minimal clean soup."""

    def __init__(self, raw_html):
        """Normalize raw_html (None allowed) and parse it into self.soup."""
        raw_html = '' if raw_html is None else raw_html
        # NOTE(review): the original replace here appeared as a no-op
        # (space -> space); it almost certainly targeted a literal
        # non-breaking space (U+00A0) lost in transcription -- confirm
        # against the original file.
        raw_html = raw_html.replace(u'\xa0', ' ')
        # self-close bare <br> so the parser treats them consistently
        raw_html = raw_html.replace('<br>', '<br/>')
        self.soup = bs4.BeautifulSoup(raw_html, 'html.parser')

    def sanitize(self):
        """Run the full cleanup pipeline and return the cleaned soup."""
        self.strip_styles()
        self.strip_unused_spans()
        self.strip_comments()
        self.filter_html_remove_empty_tags()
        self.remove_element_ids()
        self.remove_element_classes()
        self.remove_link_redirects()
        self.fix_heading_strongs()
        return self.soup

    def remove_link_redirects(self):
        """Replace Google redirect links with their real target (?q=...)."""
        for a in self.soup.find_all('a'):
            href = a.get('href')
            # BUG FIX: anchors without an href crashed on None.startswith;
            # a redirect URL without a 'q' param raised KeyError.
            if not href or not href.startswith('https://www.google.com/url?q='):
                continue
            parsed_q = urlparse.parse_qs(urlparse.urlparse(href).query).get('q', [])
            if parsed_q:
                a['href'] = parsed_q[0]

    def fix_heading_strongs(self):
        """If a strong tag is a heading's only child, strip it to rely on the
        CSS heading style for consistent font weight.

        Doubly nested strong tags often appear, so repeat until a pass
        unwraps nothing.
        """
        headings = self.soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        stripped = True
        while stripped:
            stripped = False
            for heading in headings:
                if len(heading.contents) == 1 and heading.contents[0].name == 'strong':
                    heading.contents[0].unwrap()
                    stripped = True

    def strip_unused_spans(self):
        """Drop spans with no visible text; unwrap every other span."""
        for span in self.soup.find_all('span'):
            if span.get_text().strip():
                span.unwrap()
            else:
                span.decompose()

    def strip_comments(self):
        """Remove all <div> and <sup> elements.

        NOTE(review): presumably these hold Google Docs comment threads and
        footnote markers in this export format -- confirm against sample
        documents, as this drops ALL divs.
        """
        for div in self.soup.find_all('div'):
            div.decompose()
        for sup in self.soup.find_all('sup'):
            sup.decompose()

    def strip_styles(self):
        """Convert bold/italic styling to <strong>/<em>, then drop styles.

        Stylesheets are scanned for classes setting font-weight:700 or
        font-style:italic (header selectors skipped); matching elements are
        renamed and re-wrapped in a tag of their original name, and inline
        style attributes are removed from every element.
        """
        del self.soup['style']
        bold_selectors = []
        italic_selectors = []
        for style_tag in self.soup.find_all('style', type='text/css'):
            if not style_tag.string:
                continue
            sheet = cssutils.parseString(style_tag.string)
            for rule in sheet:
                if rule.type != rule.STYLE_RULE:
                    continue
                # `prop` renamed from `property`, which shadowed the builtin
                for prop in rule.style:
                    if prop.name == 'font-weight' and prop.value == '700':
                        selector = rule.selectorText.replace('.', '')
                        if not selector.startswith('h'):  # skip headers
                            bold_selectors.append(selector)
                        break
                    elif prop.name == 'font-style' and prop.value == 'italic':
                        selector = rule.selectorText.replace('.', '')
                        if not selector.startswith('h'):  # skip headers
                            italic_selectors.append(selector)
                        break
        bold_selector_set = set(bold_selectors)
        italic_selector_set = set(italic_selectors)
        for el in self.soup.find_all():
            style = el.get('style')
            original_name = el.name
            class_set = set(el.get('class') or [])
            if (style and 'font-weight:700' in style) or class_set & bold_selector_set:
                # rename to <strong>, re-wrap in a tag of the original name
                el.name = 'strong'
                el.wrap(self.soup.new_tag(original_name))
            elif (style and 'font-style:italic' in style) or class_set & italic_selector_set:
                el.name = 'em'
                el.wrap(self.soup.new_tag(original_name))
            del el['style']

    def filter_html_remove_empty_tags(self):
        """Remove elements that contain no visible text.

        Tags listed in ALLOWED_EMPTY_TAGS are kept and given a BEL ('\\007')
        placeholder string instead.  Removing an empty child can leave an
        empty parent, so repeat until a full pass removes nothing.
        (Docstring fixed: the original carried a copy-pasted description of
        fix_heading_strongs.)
        """
        stripped = True
        while stripped:
            stripped = False
            for element in self.soup.find_all():
                if element.get_text().strip():
                    continue
                children = [child for child in element.contents if child]
                if element.name in ALLOWED_EMPTY_TAGS:
                    # placeholder so later passes see it as non-empty
                    element.string = '\007'
                elif not children:
                    element.decompose()
                    stripped = True

    def remove_element_ids(self):
        """Delete the id attribute from every element."""
        for node in self.soup.find_all(True):
            if node.get('id'):
                del node['id']

    def remove_element_classes(self):
        """Delete the class attribute from every element.

        BUG FIX: the original used ``setattr(tag, 'class', [])``, which sets
        a Python attribute on the Tag object and leaves the HTML class
        attribute untouched, so classes were never removed.
        """
        for tag in self.soup.find_all(True):
            if 'class' in tag.attrs:
                del tag['class']

    def remove_trailing_brs(self):
        """Unwrap <br> tags nested inside other <br> tags.

        NOTE(review): <br> elements normally have no children, so the inner
        search finds nothing and this is effectively a no-op; kept unchanged
        for interface compatibility (it is not called by sanitize()).
        """
        for node in self.soup.find_all('br'):
            for nested in node.find_all('br'):
                nested.unwrap()
if __name__ == '__main__':
    # Manual smoke test: sanitize the sample export and dump the result.
    # BUG FIX: the file handle was never closed; `with` guarantees cleanup.
    with open("./renderer_test_data/summary.html") as html_file:
        doc = html_file.read()
    sanitizer = BaseSanitizer(doc)
    clean_soup = sanitizer.sanitize()
    print(clean_soup)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.