Skip to content

Instantly share code, notes, and snippets.

@gabalese
Last active December 26, 2015 11:49
Show Gist options
  • Save gabalese/7146295 to your computer and use it in GitHub Desktop.
Save gabalese/7146295 to your computer and use it in GitHub Desktop.
Splits an HTML file into several smaller files, given a tag and an (optional) class name. Also fixes links between the resulting files.
#! /usr/bin/env python
try:
import lxml.etree as ET
except ImportError:
from xml.etree import ElementTree as ET
from StringIO import StringIO
import sys
# Public identifier and system URL of the XHTML 1.1 DTD; emptyXHTML()
# interpolates both into the DOCTYPE declaration of the skeleton document.
pub_id = "-//W3C//DTD XHTML 1.1//EN"
sys_url = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"
def emptyXHTML():
    """Return the root element of a freshly parsed, empty XHTML document.

    The markup is an XML declaration, a DOCTYPE built from the module-level
    ``pub_id`` and ``sys_url``, and an ``html`` element in the XHTML
    namespace that contains a single empty ``body``.
    """
    xhtml_ns = "http://www.w3.org/1999/xhtml"
    markup = "".join([
        '<?xml version="1.0"?>',
        '<!DOCTYPE html PUBLIC "%s" "%s">' % (pub_id, sys_url),
        '<html xmlns="http://www.w3.org/1999/xhtml"><body></body></html>',
    ])
    try:
        # Register XHTML under the empty prefix; an implementation that
        # rejects this with ValueError is tolerated on purpose.
        ET.register_namespace('', xhtml_ns)
    except ValueError:
        pass
    document = ET.parse(StringIO(markup))
    return document.getroot()
def split(file, tag, cssclass=None):
    """Yield XHTML documents cut from ``file`` at every matching element.

    The children of the input's body are distributed, in order, over new
    documents created with ``emptyXHTML()``.  A child whose tag is ``tag``
    (in the XHTML namespace) starts a new document, unless the current
    document is still empty.  Every yielded document carries its own copy
    of the input's head.

    Args:
        file: Path or file object accepted by ``ET.parse``.  The root's
            first child is taken as the head and its second as the body.
        tag: Local name of the element that starts a new part, e.g. "h1".
        cssclass: When given, only elements whose ``class`` attribute equals
            this string start a new part.  When ``None``, the tag alone
            decides (elements without a class attribute match as well).

    Yields:
        The root element of each part; at least one document is always
        yielded, even when the body has no children.

    Raises:
        IndexError: If the root element has fewer than two children.
    """
    import copy  # local import: only this function needs it

    doc = ET.parse(file).getroot()
    head = doc[0]
    body = doc[1]
    wanted_tag = "{http://www.w3.org/1999/xhtml}" + tag

    def starts_new_part(element):
        # A missing cssclass means "split on the tag alone".
        if element.tag != wanted_tag:
            return False
        return cssclass is None or element.get("class") == cssclass

    def new_part():
        part = emptyXHTML()
        # Deep-copy the head: lxml *moves* an element that already has a
        # parent, which would strip the head from previously yielded parts.
        part.insert(0, copy.deepcopy(head))
        return part

    new_doc = new_part()
    # Iterate over a snapshot, because appending a child to another parent
    # may detach it from ``body`` while we are looping.
    for child in list(body):
        if starts_new_part(child) and len(new_doc[1]) != 0:
            yield new_doc
            new_doc = new_part()
        new_doc[1].append(child)
    yield new_doc
def foonotesfix(filelist):
    """Point fragment links at the file that actually contains their target.

    After a document has been split, a link such as ``href="#note1"`` may
    refer to an element that now lives in a different file.  Every XHTML
    ``a`` element whose ``href`` starts with "#" is rewritten to
    ``<file>#<id>``, where ``<file>`` is the first entry of ``filelist``
    containing a descendant element with that ``id``.  Anchors without an
    ``href``, links that are not pure fragments and fragments whose target
    is not found are left untouched.

    Args:
        filelist: Paths of the XHTML files to fix.  Files in which at least
            one link was rewritten are overwritten in place.
    """
    anchor_path = './/{http://www.w3.org/1999/xhtml}a'

    # Index all ids once instead of re-parsing every file for every anchor.
    # setdefault keeps the first file (in filelist order) that owns an id.
    id_owner = {}
    for path in filelist:
        root = ET.parse(path).getroot()
        for element in root.findall('.//*[@id]'):
            id_owner.setdefault(element.get("id"), path)

    for path in filelist:
        htmltree = ET.parse(path)
        changed = False
        for anchor in htmltree.findall(anchor_path):
            href = anchor.get("href")
            # Named anchors have no href; external links are not fragments.
            if not href or not href.startswith("#"):
                continue
            owner = id_owner.get(href[1:])
            if owner is None:
                continue
            anchor.set("href", "%s%s" % (owner, href))
            changed = True
        if changed:
            htmltree.write(path, xml_declaration=True, method="xml", encoding="utf-8")
if __name__ == '__main__':
    # Command line: <file> <tag> [<class>] -- exactly the arguments that
    # split() accepts, so anything else is answered with the usage text.
    if not 3 <= len(sys.argv) <= 4:
        sys.stderr.write("USAGE: python html_splitter.py <file> <tag> [<class>]\n")
        sys.exit(1)
    filelist = []
    # Each part is serialized as soon as the generator produces it; output
    # names are derived from the input file name plus a running number.
    for k, part in enumerate(split(*sys.argv[1:])):
        filename = "%s_%05d.html" % (sys.argv[1], k)
        filelist.append(filename)
        ET.ElementTree(part).write(filename, xml_declaration=True, method="xml", encoding="utf-8")
    # With all parts on disk, repair the links that now cross file borders.
    foonotesfix(filelist)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment