gabalese · December 26, 2015 11:49
diff --git a/html_splitter.py b/html_splitter.py
 #! /usr/bin/env python

 try:
    import lxml.etree as ET
 except ImportError:
    from xml.etree import ElementTree as ET
 from StringIO import StringIO
 import sys

 # Public identifier and system URL of the XHTML 1.1 DTD.  emptyXHTML()
 # interpolates both into the DOCTYPE declaration of every new document.
 pub_id  = "-//W3C//DTD XHTML 1.1//EN"
 sys_url = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"

 def emptyXHTML():
    """Build a minimal XHTML 1.1 document and return its root element.

    The document consists of an XML declaration, a DOCTYPE assembled from
    the module-level ``pub_id`` and ``sys_url``, and an ``<html>`` element
    in the XHTML namespace that contains only an empty ``<body>``.
    """
    # Ask the serializer to use XHTML as the default namespace.  The call
    # may raise ValueError (presumably when the ET implementation rejects
    # an empty prefix -- confirm); that case is deliberately ignored.
    try:
        ET.register_namespace('', "http://www.w3.org/1999/xhtml")
    except ValueError:
        pass
    markup = ''.join([
        '<?xml version="1.0"?>',
        '<!DOCTYPE html PUBLIC "%s" "%s">' % (pub_id, sys_url),
        '<html xmlns="http://www.w3.org/1999/xhtml"><body></body></html>',
    ])
    return ET.parse(StringIO(markup)).getroot()
    
 def split(file, tag, cssclass=None):
    """Split an XHTML document into several documents at a marker element.

    The root of ``file`` is expected to hold ``<head>`` as its first child
    and ``<body>`` as its second.  The direct children of ``<body>`` are
    walked in order; every child that matches ``tag`` (and ``cssclass``,
    when given) starts a new output document.  Each output document is a
    fresh XHTML skeleton carrying its own copy of the original ``<head>``.

    file     -- path or file object accepted by ``ET.parse``.
    tag      -- local element name (without namespace) marking a split.
    cssclass -- if given, only elements whose ``class`` attribute equals
                this value mark a split; if None, every ``tag`` element
                does, whether or not it carries a class.

    Yields the root element of each resulting document.  One document is
    always yielded at the end, even when ``<body>`` has no children.
    """
    import copy

    doc = ET.parse(file).getroot()
    head = doc[0]
    body = doc[1]
    marker = "{http://www.w3.org/1999/xhtml}" + tag

    def fresh_doc():
        # Every document gets its own copy of <head>.  In lxml an element
        # has a single parent, so re-inserting the same <head> would pull
        # it out of a document that was yielded earlier.
        new_doc = emptyXHTML()
        new_doc.insert(0, copy.deepcopy(head))
        return new_doc

    current = fresh_doc()
    # Walk a snapshot: appending a child to another tree may detach it
    # from <body> while the loop is still running.
    for child in list(body):
        is_marker = child.tag == marker and (
            cssclass is None or child.get("class") == cssclass)
        # current[1] is the <body> of the document being filled.  A marker
        # closes it only if something has been collected already, so no
        # empty document is emitted ahead of a leading marker.
        if is_marker and len(current[1]) != 0:
            yield current
            current = fresh_doc()
        current[1].append(child)
    yield current

 def foonotesfix(filelist):
    """Point in-document links at the split file that now holds their target.

    After a document has been split, a link such as ``href="#note1"`` may
    refer to an ``id`` that ended up in a different file.  Every XHTML
    ``<a>`` whose ``href`` starts with ``#`` is rewritten to
    ``<file>#<id>``, where ``<file>`` is the first entry of ``filelist``
    that contains an element with that ``id``.  Anchors without an
    ``href``, links that are not fragment-only, and fragments matching no
    ``id`` are left unchanged.

    filelist -- paths of the XHTML files to fix.  Every file containing at
                least one ``<a>`` element is rewritten in place.
    """
    # Index all ids once instead of re-parsing every file for each anchor.
    # setdefault keeps the first file (in filelist order) declaring an id.
    id_owner = {}
    for path in filelist:
        for element in ET.parse(path).getroot().findall('.//*[@id]'):
            id_owner.setdefault(element.get("id"), path)

    for path in filelist:
        htmltree = ET.parse(path)
        anchors = htmltree.findall('.//{http://www.w3.org/1999/xhtml}a')
        if not anchors:
            continue
        for anchor in anchors:
            href = anchor.get("href")
            # An <a> may have no href at all, and only "#fragment" links
            # refer to ids inside the split documents.
            if not href or not href.startswith("#"):
                continue
            owner = id_owner.get(href[1:])
            if owner is not None:
                anchor.set("href", "%s%s" % (owner, href))
        htmltree.write(path, xml_declaration=True, method="xml", encoding="utf-8")

 if __name__ == '__main__':
    # Command line: <file> <tag> [<class>].  The arguments are handed to
    # split() positionally, so anything outside 2-3 arguments is an error.
    if not 3 <= len(sys.argv) <= 4:
        print("USAGE: python html_splitter.py <file> <tag> [<class>]")
        sys.exit(1)
    filelist = []
    for k, root in enumerate(split(*sys.argv[1:])):
        # Output files are named after the input file plus a running number.
        filename = "%s_%05d.html" % (sys.argv[1], k)
        filelist.append(filename)
        ET.ElementTree(root).write(filename, xml_declaration=True, method="xml", encoding="utf-8")
    # With every part on disk, repair the links that now cross files.
    foonotesfix(filelist)
	#! /usr/bin/env python

	try:
	import lxml.etree as ET
	except ImportError:
	from xml.etree import ElementTree as ET
	from StringIO import StringIO
	import sys

	# Public identifier and system URL of the XHTML 1.1 DTD.  emptyXHTML()
	# interpolates both into the DOCTYPE declaration of every new document.
	pub_id = "-//W3C//DTD XHTML 1.1//EN"
	sys_url = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"

	def emptyXHTML():
	    """Build a minimal XHTML 1.1 document and return its root element.

	    The document consists of an XML declaration, a DOCTYPE assembled
	    from the module-level ``pub_id`` and ``sys_url``, and an ``<html>``
	    element in the XHTML namespace containing only an empty ``<body>``.
	    """
	    doctype_string = '<!DOCTYPE html PUBLIC "%s" "%s">' % (pub_id, sys_url)
	    xml_header = '<?xml version="1.0"?>'
	    xhtml = xml_header + doctype_string + '<html xmlns="http://www.w3.org/1999/xhtml"><body></body></html>'
	    # Ask the serializer to use XHTML as the default namespace.  The call
	    # may raise ValueError (presumably when the ET implementation rejects
	    # an empty prefix -- confirm); that case is deliberately ignored.
	    try:
	        ET.register_namespace('', "http://www.w3.org/1999/xhtml")
	    except ValueError:
	        pass
	    tree = ET.parse(StringIO(xhtml))
	    return tree.getroot()

	def split(file, tag, cssclass=None):
	    """Split an XHTML document into several documents at a marker element.

	    The root of ``file`` is expected to hold ``<head>`` as its first
	    child and ``<body>`` as its second.  The direct children of
	    ``<body>`` are walked in order; every child that matches ``tag``
	    (and ``cssclass``, when given) starts a new output document.  Each
	    output document is a fresh XHTML skeleton carrying its own copy of
	    the original ``<head>``.

	    file     -- path or file object accepted by ``ET.parse``.
	    tag      -- local element name (without namespace) marking a split.
	    cssclass -- if given, only elements whose ``class`` attribute equals
	                this value mark a split; if None, every ``tag`` element
	                does, whether or not it carries a class.

	    Yields the root element of each resulting document.  One document
	    is always yielded at the end, even when ``<body>`` has no children.
	    """
	    import copy

	    doc = ET.parse(file).getroot()
	    head = doc[0]
	    body = doc[1]
	    marker = "{http://www.w3.org/1999/xhtml}" + tag

	    def fresh_doc():
	        # Every document gets its own copy of <head>.  In lxml an element
	        # has a single parent, so re-inserting the same <head> would pull
	        # it out of a document that was yielded earlier.
	        new_doc = emptyXHTML()
	        new_doc.insert(0, copy.deepcopy(head))
	        return new_doc

	    current = fresh_doc()
	    # Walk a snapshot: appending a child to another tree may detach it
	    # from <body> while the loop is still running.
	    for child in list(body):
	        is_marker = child.tag == marker and (
	            cssclass is None or child.get("class") == cssclass)
	        # current[1] is the <body> of the document being filled.  A
	        # marker closes it only if something has been collected already,
	        # so no empty document is emitted ahead of a leading marker.
	        if is_marker and len(current[1]) != 0:
	            yield current
	            current = fresh_doc()
	        current[1].append(child)
	    yield current

	def foonotesfix(filelist):
	    """Point in-document links at the split file that now holds their target.

	    After a document has been split, a link such as ``href="#note1"``
	    may refer to an ``id`` that ended up in a different file.  Every
	    XHTML ``<a>`` whose ``href`` starts with ``#`` is rewritten to
	    ``<file>#<id>``, where ``<file>`` is the first entry of ``filelist``
	    that contains an element with that ``id``.  Anchors without an
	    ``href``, links that are not fragment-only, and fragments matching
	    no ``id`` are left unchanged.

	    filelist -- paths of the XHTML files to fix.  Every file containing
	                at least one ``<a>`` element is rewritten in place.
	    """
	    # Index all ids once instead of re-parsing every file for each
	    # anchor.  setdefault keeps the first file declaring an id.
	    id_owner = {}
	    for path in filelist:
	        for element in ET.parse(path).getroot().findall('.//*[@id]'):
	            id_owner.setdefault(element.get("id"), path)

	    for path in filelist:
	        htmltree = ET.parse(path)
	        anchors = htmltree.findall('.//{http://www.w3.org/1999/xhtml}a')
	        if not anchors:
	            continue
	        for anchor in anchors:
	            href = anchor.get("href")
	            # An <a> may have no href at all, and only "#fragment" links
	            # refer to ids inside the split documents.
	            if not href or not href.startswith("#"):
	                continue
	            owner = id_owner.get(href[1:])
	            if owner is not None:
	                anchor.set("href", "%s%s" % (owner, href))
	        htmltree.write(path, xml_declaration=True, method="xml", encoding="utf-8")

	if __name__ == '__main__':
	    # Command line: <file> <tag> [<class>].  The arguments are handed to
	    # split() positionally, so anything outside 2-3 arguments is an error.
	    if not 3 <= len(sys.argv) <= 4:
	        print("USAGE: python html_splitter.py <file> <tag> [<class>]")
	        sys.exit(1)
	    filelist = []
	    for k, root in enumerate(split(*sys.argv[1:])):
	        # Output files are named after the input file plus a running number.
	        filename = "%s_%05d.html" % (sys.argv[1], k)
	        filelist.append(filename)
	        ET.ElementTree(root).write(filename, xml_declaration=True, method="xml", encoding="utf-8")
	    # With every part on disk, repair the links that now cross files.
	    foonotesfix(filelist)