Last active
August 29, 2015 14:18
-
-
Save StevenMaude/452b592f99e04852e69f to your computer and use it in GitHub Desktop.
Clean up unwanted line breaks in HTML text; takes two arguments: input HTML filename and output name. (Uses lxml 3.4.2; later versions may be OK too.)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
from __future__ import (unicode_literals, print_function, | |
absolute_import, division) | |
import codecs | |
import re | |
import sys | |
import lxml.html | |
import lxml.html.clean | |
def read_file(input_filename):
    """
    Return the entire content of a file as text.

    input_filename -- path of the file to read.

    The file is opened with codecs.open and decoded as UTF-8. Errors raised
    while opening or decoding the file are not caught here.
    """
    with codecs.open(input_filename, 'r', encoding='utf-8') as f:
        return f.read()
def clean_html(root):
    """
    Take root element and return a cleaned root element.

    Removes styles, scripts, comments, links etc. from element
    and its child elements, then appends a newline after every
    <p> and <br> so that paragraph and line breaks survive when
    the tree is later flattened to text.

    root -- lxml.html element to clean.

    Returns the element produced by Cleaner.clean_html(root).

    See http://lxml.de/3.4/api/lxml.html.clean.Cleaner-class.html
    """
    cleaner = lxml.html.clean.Cleaner(style=True)
    cleaned_html = cleaner.clean_html(root)

    # Select relative to the cleaned root and include every depth. The
    # previous expression "*//p" skipped <p> elements that are direct
    # children of the root (e.g. when a multi-element fragment is parsed
    # and wrapped in a single container element).
    breaking_elements = cleaned_html.xpath(
        "descendant-or-self::p|descendant-or-self::br")
    for el in breaking_elements:
        # The tail is the text that follows the element's closing tag;
        # it is None when no text follows.
        el.tail = "\n" + el.tail if el.tail else "\n"
    return cleaned_html
def write_output(output_filename, element):
    """
    Write text from HTML element and all child elements to output file.

    output_filename -- path of the file to create or overwrite.
    element -- object providing a text_content() method that returns the
               text to write (here, an lxml.html element).

    The text is written UTF-8 encoded.
    """
    with codecs.open(output_filename, 'w', encoding='utf-8') as f:
        f.write(element.text_content())
def main():
    """
    Read HTML file and output cleaned text from it.

    Expects two command-line arguments: the input HTML filename and the
    output filename. Exits with a usage message if either is missing.
    """
    if len(sys.argv) < 3:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("Usage: {0} INPUT_HTML OUTPUT_FILE".format(sys.argv[0]))

    content = read_file(sys.argv[1])
    # Collapse every run of line breaks in the raw markup into one space so
    # that only the newlines added for <p>/<br> end up in the output.
    new_content = re.sub(r'(\r\n|\n|\r)+', ' ', content)
    root = lxml.html.fromstring(new_content)
    cleaned_html = clean_html(root)
    write_output(sys.argv[2], cleaned_html)
if __name__ == '__main__':
    # Force UTF-8 for anything written to stdout. The codecs writer emits
    # bytes, so on Python 3 it must wrap the binary buffer underneath the
    # text-mode sys.stdout; where no .buffer attribute exists (Python 2),
    # sys.stdout itself is wrapped as before.
    sys.stdout = codecs.getwriter('utf-8')(
        getattr(sys.stdout, 'buffer', sys.stdout))
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The line break handling is for HTML where line breaks are present in the text e.g.
I actually want the output text to be:
not
which is what you get if you just do text_content with lxml.