Last active
December 26, 2015 11:49
-
-
Save gabalese/7146295 to your computer and use it in GitHub Desktop.
Splits an HTML file into several smaller files, given a tag and an (optional) class name.
Also fixes links in the same document.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
try: | |
import lxml.etree as ET | |
except ImportError: | |
from xml.etree import ElementTree as ET | |
from StringIO import StringIO | |
import sys | |
# Public and system identifiers of the XHTML 1.1 DTD. emptyXHTML() interpolates
# both into the DOCTYPE declaration of every document it generates.
pub_id = "-//W3C//DTD XHTML 1.1//EN"
sys_url = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"
def emptyXHTML():
    """
    Build a skeleton XHTML document and return its root ``<html>`` element.

    The skeleton consists of an XML declaration, a DOCTYPE assembled from the
    module-level ``pub_id`` / ``sys_url`` identifiers, and an ``<html>``
    element in the XHTML namespace holding a single empty ``<body>``.
    """
    # Ask the ElementTree backend to serialize the XHTML namespace as the
    # default (prefix-less) one. This is best-effort: a ValueError raised by
    # the registration is deliberately ignored.
    try:
        ET.register_namespace('', "http://www.w3.org/1999/xhtml")
    except ValueError:
        pass

    pieces = [
        '<?xml version="1.0"?>',
        '<!DOCTYPE html PUBLIC "%s" "%s">' % (pub_id, sys_url),
        '<html xmlns="http://www.w3.org/1999/xhtml"><body></body></html>',
    ]
    markup = pieces[0] + pieces[1] + pieces[2]

    # Parse from an in-memory file object and hand back only the root element.
    parsed = ET.parse(StringIO(markup))
    return parsed.getroot()
def split(file, tag, cssclass=None):
    """
    Split an XHTML document into several smaller documents.

    The children of ``<body>`` are walked in order. Each child whose tag is
    ``tag`` (in the XHTML namespace) and whose ``class`` attribute equals
    ``cssclass`` starts a new output document; every other child is appended
    to the document currently being filled. When ``cssclass`` is None, every
    ``tag`` element starts a new document regardless of its ``class``.

    Parameters:
        file: path or file object accepted by ``ET.parse``. The root element
            is assumed to have ``<head>`` as its first child and ``<body>``
            as its second.
        tag: local (un-namespaced) name of the element to split on.
        cssclass: optional exact value the ``class`` attribute must have.

    Yields:
        The root ``<html>`` element of each generated document, in document
        order. At least one document is always yielded, even for an empty
        body.

    Raises:
        IndexError: if the root element has fewer than two children.
    """
    # Local import keeps this block self-contained; copy is standard library.
    import copy

    doc = ET.parse(file).getroot()
    head = doc[0]
    body = doc[1]
    wanted_tag = "{http://www.w3.org/1999/xhtml}" + tag

    def fresh_doc():
        # Each output document gets its own copy of <head>. Inserting the
        # same element repeatedly would, under lxml (where an element has a
        # single parent), move it away from documents yielded earlier.
        page = emptyXHTML()
        page.insert(0, copy.deepcopy(head))
        return page

    def starts_chunk(element):
        if element.tag != wanted_tag:
            return False
        # No class requested: any element with the tag is a split point.
        return cssclass is None or element.get("class") == cssclass

    new_doc = fresh_doc()
    # Iterate over a snapshot: appending a child to another tree may detach
    # it from ``body`` while the loop is still running.
    for child in list(body):
        # Only close the current document if it already holds content, so a
        # split point at the very start does not produce an empty document.
        if starts_chunk(child) and len(new_doc[1]) != 0:
            yield new_doc
            new_doc = fresh_doc()
        new_doc[1].append(child)
    yield new_doc
def foonotesfix(filelist):
    """
    Repair same-document links after a document was split into several files.

    For every XHTML ``<a>`` whose ``href`` is a bare fragment (``#some-id``),
    the first file in ``filelist`` containing an element with that ``id`` is
    looked up and the ``href`` is rewritten to ``<that file>#some-id``.
    Anchors without an ``href``, hrefs that are not bare fragments, and
    fragments whose id is found in no file are left untouched.

    Parameters:
        filelist: list of paths of the files to fix. Every file is parsed and
            then rewritten in place (UTF-8, with an XML declaration).

    Returns:
        None.
    """
    # Index every id once instead of re-parsing all files for each anchor.
    # setdefault keeps the first file (in filelist order) that owns an id.
    id_to_file = {}
    for path in filelist:
        root = ET.parse(path).getroot()
        for element in root.findall('.//*[@id]'):
            id_to_file.setdefault(element.get("id"), path)

    for path in filelist:
        htmltree = ET.parse(path)
        for anchor in htmltree.findall('.//{http://www.w3.org/1999/xhtml}a'):
            href = anchor.get("href")
            # Named anchors (<a id="...">) carry no href, and only bare
            # fragments refer to a place in the original single document.
            if not href or not href.startswith("#"):
                continue
            owner = id_to_file.get(href[1:])
            if owner is not None:
                anchor.set("href", "%s%s" % (owner, href))
        htmltree.write(path, xml_declaration=True, method="xml", encoding="utf-8")
if __name__ == '__main__':
    # Command line: <file> <tag> [<class>], forwarded positionally to
    # split(file, tag, cssclass=None). Anything else gets the usage message.
    if len(sys.argv) not in (3, 4):
        print("USAGE: python html_splitter.py <file> <tag> [<class>]")
        sys.exit(1)

    filelist = []
    for k, chunk in enumerate(split(*sys.argv[1:])):
        # Output files are named after the input file plus a 5-digit index.
        filename = "%s_%05d.html" % (sys.argv[1], k)
        filelist.append(filename)
        # Write each chunk as soon as it is produced, before asking the
        # generator for the next one.
        ET.ElementTree(chunk).write(filename, xml_declaration=True, method="xml", encoding="utf-8")

    # Once every chunk is on disk, rewrite fragment links across the files.
    foonotesfix(filelist)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment