dunhamsteve · March 25, 2014 05:10
diff --git a/fixfootnotes.py b/fixfootnotes.py
 #!/usr/bin/python

 # Written and placed in the public domain by Steve Dunham

 # Tries to find footnotes in an epub and transform them into iBooks/epub3 popup footnotes.
 #
 # This works with a couple of Terry Pratchett books, it will probably need tweaking for other books.
 # This script tries to:
 #
 # - find the footnote links
 # - ensure that there is nothing but text inside the A tag
 # - add epub:type=noteref to the A tag
 # - ensure that the footnote is in the same html file
 # - make the footnote an ASIDE element with epub:type=footnote
 #
 # I also change the id of the document so iBooks doesn't get confused.
 #
 # TODO:
 # - Remove backlinks
 # - Figure out what to do about nested footnotes. (e.g. Colour of Magic)

 import lxml.etree as ET
 import zipfile
 import os
 from urlparse import urljoin
 from collections import namedtuple

 # One spine entry: `fn` is the archive path of the content document, `doc` its
 # parsed tree, `item` the package-document element whose id matches the
 # itemref, and `ref` that itemref's idref value.
 Document = namedtuple('Document', "fn doc item ref")


 def text(n):
  """Return every text node found under `n`, joined with single spaces."""
  fragments = n.xpath('.//text()')
  separator = ' '
  return separator.join(fragments)

 class Epub(object):
  """An epub archive whose footnote links are rewritten as epub3 popup notes.

  Constructing an instance reads the archive, parses every spine document and
  rewrites the footnote links in memory; writefile() saves the result.

  Attributes:
    zf: the input zipfile.ZipFile.
    content_fn: archive path of the package document named by container.xml.
    content: parsed tree of that package document.
    docs: dict mapping archive path to the Document for each spine entry.
    spine: list of the same Document tuples, in spine order.
  """

  def __init__(self, infile):
    """Read `infile` (anything zipfile.ZipFile accepts) and convert footnotes.

    Lookups that assume a well-formed epub (container.xml, its rootfile
    entry, the items referenced by the spine, the unique-identifier element)
    still raise if the archive lacks them.
    """
    self.zf = zf = zipfile.ZipFile(infile)
    for info in zf.infolist():
      print(info.filename)

    container = ET.parse(zf.open('META-INF/container.xml'))

    ET.register_namespace('epub', 'http://www.idpf.org/2007/ops')

    self.content_fn = content_fn = container.find('//{*}rootfile').get('full-path')

    self.content = content = ET.parse(zf.open(content_fn))

    # Fixes issues with XHTML entities, like &rsquo;
    p = ET.XMLParser(resolve_entities=True, load_dtd=True, ns_clean=True)

    self.docs = {}
    self.spine = []

    for ir in content.findall('//{*}spine/{*}itemref'):
      ref = ir.get('idref')
      item = content.find('//*[@id="%s"]' % ref)
      href = item.get('href')
      # hrefs are relative to the package document
      fn = urljoin(content_fn, href)

      doc = ET.parse(zf.open(fn), p)

      d = Document(fn, doc, item, ref)
      self.docs[fn] = d
      self.spine.append(d)

    idid = content.getroot().get('unique-identifier')
    idel ,= content.xpath('//*[@id="%s"]' % idid)

    # For now I'm tweaking the identifier so ibooks doesn't get confused.
    # An empty identifier element has text None, so fall back to ''.
    idel.text = (idel.text or '') + "+fn"

    # REVIEW - this works for one document, check the rest.
    for d in self.spine:
      self._convert_footnotes(d)

  def _convert_footnotes(self, d):
    """Rewrite the footnote links found in spine document `d`.

    A link is handled when it has an href and its first child element's tag
    ends with 'sup'. The two elements swap roles so the link holds the text.
    When the link points at an element with text in this or a later spine
    document, the link becomes an epub:type=noteref and the target becomes an
    aside with epub:type=footnote, moved to the end of this document's body.
    Links without a fragment, or whose target is not found in a spine
    document, are left as they are after the swap.
    """
    print('*** %s' % d.fn)
    position = self.spine.index(d)

    # This dance forces the epub namespace declaration onto the root element.
    # NOTE: clear() also drops any attributes the root element already had.
    root = d.doc.getroot()
    root.set('{http://www.idpf.org/2007/ops}foo', ' ')
    root.attrib.clear()

    for a in d.doc.findall('//{*}a'):
      if not a.get('href'):
        continue
      cc = a.getchildren()
      if not (cc and cc[0].tag.endswith('sup')):
        continue

      # swap tags - iBooks footnote must have textual content
      inner = cc[0]
      inner.tag = '{http://www.w3.org/1999/xhtml}a'
      a.tag = '{http://www.w3.org/1999/xhtml}sup'
      inner.attrib.update(a.attrib)
      a.attrib.clear()
      a = inner

      # partition() tolerates hrefs that carry no '#fragment' at all
      target_fn, sep, anchor = urljoin(d.fn, a.get('href')).partition('#')
      if not sep or target_fn not in self.docs:
        # not a link to an anchor inside a spine document
        continue

      fdoc = self.docs[target_fn]
      if position > self.spine.index(fdoc):
        # target lives in an earlier document; leave it alone
        continue

      print('%s %s %s' % (target_fn, anchor, fdoc))
      print('CC %s %s' % (a.get('href'), ET.tostring(a, with_tail=False)))
      dest = fdoc.doc.find('//*[@id="%s"]' % anchor)

      # skip dangling anchors and targets without any text
      if dest is None or not text(dest).strip():
        continue

      # ok, we got one, process...
      a.set('{http://www.idpf.org/2007/ops}type', 'noteref')
      a.set('href', '#%s' % anchor)

      # we might want this to be aside wrapped around 'dest', lifting the id
      dest.set('{http://www.idpf.org/2007/ops}type', 'footnote')
      dest.tag = '{http://www.w3.org/1999/xhtml}aside'

      # appending moves the element out of its old place, into this document
      d.doc.find('{*}body').append(dest)

  def writefile(self, fn):
    """Write the archive to `fn`, substituting the rewritten documents.

    Directory entries are skipped. Spine documents and the package document
    are serialized from their in-memory trees; every other member is copied
    unchanged. Each member reuses its original ZipInfo.
    """
    out = zipfile.ZipFile(fn, 'w')
    try:
      for info in self.zf.infolist():
        if info.filename.endswith('/'):
          continue

        print('%s %s' % (info.filename, info.compress_type))
        if info.filename in self.docs:
          data = ET.tostring(self.docs[info.filename].doc)
        elif info.filename == self.content_fn:
          data = ET.tostring(self.content)
        else:
          data = self.zf.read(info.filename)
        out.writestr(info, data)
    finally:
      # close() writes the central directory; without it the zip is unreadable
      out.close()
      
    

 # Command-line entry point: fixfootnotes.py INPUT.epub OUTPUT.epub
 if __name__ == '__main__':
  import sys

  # Both paths are required; exit with a usage hint instead of an IndexError.
  if len(sys.argv) < 3:
    sys.exit('usage: %s INPUT.epub OUTPUT.epub' % sys.argv[0])

  ee = Epub(sys.argv[1])
  ee.writefile(sys.argv[2])
	#!/usr/bin/python

	# Written and placed in the public domain by Steve Dunham

	# Tries to find footnotes in an epub and transform them into iBooks/epub3 popup footnotes.
	#
	# This works with a couple of Terry Pratchett books, it will probably need tweaking for other books.
	# This script tries to:
	#
	# - find the footnote links
	# - ensure that there is nothing but text inside the A tag
	# - add epub:type=noteref to the A tag
	# - ensure that the footnote is in the same html file
	# - make the footnote an ASIDE element with epub:type=footnote
	#
	# I also change the id of the document so iBooks doesn't get confused.
	#
	# TODO:
	# - Remove backlinks
	# - Figure out what to do about nested footnotes. (e.g. Colour of Magic)

	import lxml.etree as ET
	import zipfile
	import os
	from urlparse import urljoin
	from collections import namedtuple

	# One spine entry: `fn` is the archive path of the content document, `doc` its
	# parsed tree, `item` the package-document element whose id matches the
	# itemref, and `ref` that itemref's idref value.
	Document = namedtuple('Document', "fn doc item ref")


	def text(n):
	  """Return every text node found under `n`, joined with single spaces."""
	  return ' '.join(n.xpath('.//text()'))

	class Epub(object):
	  """An epub archive whose footnote links are rewritten as epub3 popup notes.

	  Constructing an instance reads the archive, parses every spine document and
	  rewrites the footnote links in memory; writefile() saves the result.

	  Attributes:
	    zf: the input zipfile.ZipFile.
	    content_fn: archive path of the package document named by container.xml.
	    content: parsed tree of that package document.
	    docs: dict mapping archive path to the Document for each spine entry.
	    spine: list of the same Document tuples, in spine order.
	  """

	  def __init__(self, infile):
	    """Read `infile` (anything zipfile.ZipFile accepts) and convert footnotes.

	    Lookups that assume a well-formed epub (container.xml, its rootfile
	    entry, the items referenced by the spine, the unique-identifier element)
	    still raise if the archive lacks them.
	    """
	    self.zf = zf = zipfile.ZipFile(infile)
	    for info in zf.infolist():
	      print(info.filename)

	    container = ET.parse(zf.open('META-INF/container.xml'))

	    ET.register_namespace('epub', 'http://www.idpf.org/2007/ops')

	    self.content_fn = content_fn = container.find('//{*}rootfile').get('full-path')

	    self.content = content = ET.parse(zf.open(content_fn))

	    # Fixes issues with XHTML entities, like &rsquo;
	    p = ET.XMLParser(resolve_entities=True, load_dtd=True, ns_clean=True)

	    self.docs = {}
	    self.spine = []

	    # '{*}' is the any-namespace wildcard; the package document's elements
	    # are namespaced, so a bare '{}' would match nothing.
	    for ir in content.findall('//{*}spine/{*}itemref'):
	      ref = ir.get('idref')
	      item = content.find('//*[@id="%s"]' % ref)
	      href = item.get('href')
	      # hrefs are relative to the package document
	      fn = urljoin(content_fn, href)

	      doc = ET.parse(zf.open(fn), p)

	      d = Document(fn, doc, item, ref)
	      self.docs[fn] = d
	      self.spine.append(d)

	    idid = content.getroot().get('unique-identifier')
	    idel ,= content.xpath('//*[@id="%s"]' % idid)

	    # For now I'm tweaking the identifier so ibooks doesn't get confused.
	    # An empty identifier element has text None, so fall back to ''.
	    idel.text = (idel.text or '') + "+fn"

	    # REVIEW - this works for one document, check the rest.
	    for d in self.spine:
	      self._convert_footnotes(d)

	  def _convert_footnotes(self, d):
	    """Rewrite the footnote links found in spine document `d`.

	    A link is handled when it has an href and its first child element's tag
	    ends with 'sup'. The two elements swap roles so the link holds the text.
	    When the link points at an element with text in this or a later spine
	    document, the link becomes an epub:type=noteref and the target becomes an
	    aside with epub:type=footnote, moved to the end of this document's body.
	    Links without a fragment, or whose target is not found in a spine
	    document, are left as they are after the swap.
	    """
	    print('*** %s' % d.fn)
	    position = self.spine.index(d)

	    # This dance forces the epub namespace declaration onto the root element.
	    # NOTE: clear() also drops any attributes the root element already had.
	    root = d.doc.getroot()
	    root.set('{http://www.idpf.org/2007/ops}foo', ' ')
	    root.attrib.clear()

	    for a in d.doc.findall('//{*}a'):
	      if not a.get('href'):
	        continue
	      cc = a.getchildren()
	      if not (cc and cc[0].tag.endswith('sup')):
	        continue

	      # swap tags - iBooks footnote must have textual content
	      inner = cc[0]
	      inner.tag = '{http://www.w3.org/1999/xhtml}a'
	      a.tag = '{http://www.w3.org/1999/xhtml}sup'
	      inner.attrib.update(a.attrib)
	      a.attrib.clear()
	      a = inner

	      # partition() tolerates hrefs that carry no '#fragment' at all
	      target_fn, sep, anchor = urljoin(d.fn, a.get('href')).partition('#')
	      if not sep or target_fn not in self.docs:
	        # not a link to an anchor inside a spine document
	        continue

	      fdoc = self.docs[target_fn]
	      if position > self.spine.index(fdoc):
	        # target lives in an earlier document; leave it alone
	        continue

	      print('%s %s %s' % (target_fn, anchor, fdoc))
	      print('CC %s %s' % (a.get('href'), ET.tostring(a, with_tail=False)))
	      dest = fdoc.doc.find('//*[@id="%s"]' % anchor)

	      # skip dangling anchors and targets without any text
	      if dest is None or not text(dest).strip():
	        continue

	      # ok, we got one, process...
	      a.set('{http://www.idpf.org/2007/ops}type', 'noteref')
	      a.set('href', '#%s' % anchor)

	      # we might want this to be aside wrapped around 'dest', lifting the id
	      dest.set('{http://www.idpf.org/2007/ops}type', 'footnote')
	      dest.tag = '{http://www.w3.org/1999/xhtml}aside'

	      # appending moves the element out of its old place, into this document
	      d.doc.find('{*}body').append(dest)

	  def writefile(self, fn):
	    """Write the archive to `fn`, substituting the rewritten documents.

	    Directory entries are skipped. Spine documents and the package document
	    are serialized from their in-memory trees; every other member is copied
	    unchanged. Each member reuses its original ZipInfo.
	    """
	    out = zipfile.ZipFile(fn, 'w')
	    try:
	      for info in self.zf.infolist():
	        if info.filename.endswith('/'):
	          continue

	        print('%s %s' % (info.filename, info.compress_type))
	        if info.filename in self.docs:
	          data = ET.tostring(self.docs[info.filename].doc)
	        elif info.filename == self.content_fn:
	          data = ET.tostring(self.content)
	        else:
	          data = self.zf.read(info.filename)
	        out.writestr(info, data)
	    finally:
	      # close() writes the central directory; without it the zip is unreadable
	      out.close()



	# Command-line entry point: fixfootnotes.py INPUT.epub OUTPUT.epub
	if __name__ == '__main__':
	  import sys

	  # Both paths are required; exit with a usage hint instead of an IndexError.
	  if len(sys.argv) < 3:
	    sys.exit('usage: %s INPUT.epub OUTPUT.epub' % sys.argv[0])

	  ee = Epub(sys.argv[1])
	  ee.writefile(sys.argv[2])
No results found