Skip to content

Instantly share code, notes, and snippets.

@dunhamsteve
Created March 25, 2014 05:10
Show Gist options
  • Save dunhamsteve/9755584 to your computer and use it in GitHub Desktop.
Save dunhamsteve/9755584 to your computer and use it in GitHub Desktop.
This is a rough script to rewrite footnotes in an epub file to match what iBooks expects for pop-up footnotes. I'm posting it in case it is useful to someone.
#!/usr/bin/python
# Written and placed in the public domain by Steve Dunham
# Tries to find footnotes in an epub and transform them into iBooks/epub3 popup footnotes.
#
# This works with a couple of Terry Pratchett books; it will probably need tweaking for other books.
# This script tries to:
#
# - find the footnote links
# - ensure that there is nothing but text inside the A tag
# - add epub:type=noteref to the A tag
# - ensure that the footnote is in the same html file
# - make the footnote an ASIDE element with epub:type=footnote
#
# I also change the id of the document so iBooks doesn't get confused.
#
# TODO:
# - Remove backlinks
# - Figure out what to do about nested footnotes. (e.g. Colour of Magic)
import lxml.etree as ET
import zipfile
import os
from urlparse import urljoin
from collections import namedtuple
# One record per spine entry: archive path (fn), parsed tree (doc),
# the manifest <item> element (item) and the spine idref (ref).
Document = namedtuple('Document', ['fn', 'doc', 'item', 'ref'])
def text(n):
    """Return every text node found under ``n``, joined by single spaces.

    ``n`` must offer an ``xpath`` method (e.g. an lxml element); the result
    is not stripped or otherwise normalised.
    """
    fragments = n.xpath('.//text()')
    return ' '.join(fragments)
class Epub(object):
    """An epub whose footnote links are rewritten as iBooks/epub3 pop-up notes.

    Constructing the object opens the archive, parses the rootfile named in
    ``META-INF/container.xml`` and every document referenced from its spine,
    appends ``+fn`` to the unique identifier and rewrites the footnotes in
    memory.  ``writefile`` then saves the modified book as a new archive.

    Attributes:
        zf: the source ``zipfile.ZipFile``; kept open because ``writefile``
            copies the untouched members from it.
        content_fn: archive path of the rootfile listed in container.xml.
        content: parsed tree of that rootfile.
        docs: maps archive path -> ``Document`` for every spine entry.
        spine: the ``Document`` records in spine order.
    """

    _EPUB_NS = 'http://www.idpf.org/2007/ops'
    _XHTML_NS = 'http://www.w3.org/1999/xhtml'

    def __init__(self, infile):
        """Load ``infile`` (handed straight to ``zipfile.ZipFile``) and rewrite it.

        Raises:
            ValueError: if the ``unique-identifier`` of the rootfile does not
                match exactly one element.
        """
        self.zf = zf = zipfile.ZipFile(infile)
        for info in zf.infolist():
            print(info.filename)

        container = self._parse('META-INF/container.xml')
        ET.register_namespace('epub', self._EPUB_NS)
        # './/' is what lxml silently turns a leading '//' into on a tree
        # (while emitting a FutureWarning), so the matches are unchanged.
        rootfile = container.find('.//{*}rootfile')
        self.content_fn = content_fn = rootfile.get('full-path')
        self.content = content = self._parse(content_fn)

        # Load the DTD and resolve entities so that XHTML named entities
        # (e.g. &rsquo;) survive parsing.
        parser = ET.XMLParser(resolve_entities=True, load_dtd=True,
                              ns_clean=True)
        self.docs = {}
        self.spine = []
        for itemref in content.findall('.//{*}spine/{*}itemref'):
            ref = itemref.get('idref')
            item = content.find('.//*[@id="%s"]' % ref)
            fn = urljoin(content_fn, item.get('href'))
            d = Document(fn, self._parse(fn, parser), item, ref)
            self.docs[fn] = d
            self.spine.append(d)

        idid = content.getroot().get('unique-identifier')
        idel, = content.xpath('//*[@id="%s"]' % idid)
        # For now the identifier is tweaked so iBooks doesn't get confused.
        # An empty element has text None, hence the fallback.
        idel.text = (idel.text or '') + '+fn'

        # REVIEW - this works for one document, check the rest.
        self._rewrite_footnotes()

    def _parse(self, name, parser=None):
        """Parse archive member ``name`` and close its file handle afterwards."""
        member = self.zf.open(name)
        try:
            return ET.parse(member, parser)
        finally:
            member.close()

    def _rewrite_footnotes(self):
        """Convert the footnote links of every spine document in place."""
        marker = '{%s}foo' % self._EPUB_NS
        for here, d in enumerate(self.spine):
            print('*** %s' % d.fn)
            # This dance forces the epub namespace declaration onto the root
            # element.  Only the placeholder is removed again, so genuine
            # root attributes such as xml:lang are kept.
            root = d.doc.getroot()
            root.set(marker, ' ')
            del root.attrib[marker]

            body = d.doc.find('{*}body')
            # findall returns a list, so retagging elements below is safe.
            for a in d.doc.findall('.//{*}a'):
                if a.get('href'):
                    self._convert_link(d, here, body, a)

    def _convert_link(self, d, here, body, a):
        """Turn link ``a`` of document ``d`` into a pop-up note reference.

        ``here`` is the spine position of ``d`` and ``body`` its <body>
        element (or None).  Links that are not recognised as footnote
        references are left alone apart from the <a>/<sup> swap.
        """
        children = list(a)
        if children and children[0].tag.endswith('sup'):
            # swap tags - iBooks footnote must have textual content
            inner = children[0]
            inner.tag = '{%s}a' % self._XHTML_NS
            a.tag = '{%s}sup' % self._XHTML_NS
            inner.attrib.update(a.attrib)
            a.attrib.clear()
            a = inner

        # Links without a fragment, or pointing outside the spine (external
        # URLs, images, ...), cannot be footnote references.
        fn, _, frag = urljoin(d.fn, a.get('href')).partition('#')
        fdoc = self.docs.get(fn)
        if not frag or fdoc is None:
            return
        # Only links pointing at this or a later spine document qualify;
        # a link to an earlier document is skipped.
        if here > self.spine.index(fdoc):
            return

        print('%s %s %s' % (fn, frag, fdoc))
        print('CC %s %s' % (a.get('href'), ET.tostring(a, with_tail=False)))
        dest = fdoc.doc.find('.//*[@id="%s"]' % frag)
        # The target may be missing (dangling link, or already moved into
        # another document by an earlier reference) or carry no text.
        if dest is None or body is None or not text(dest).strip():
            return

        # ok, we got one, process...
        a.set('{%s}type' % self._EPUB_NS, 'noteref')
        a.set('href', '#%s' % frag)
        # we might want this to be aside wrapped around 'dest', lifting the id
        dest.set('{%s}type' % self._EPUB_NS, 'footnote')
        dest.tag = '{%s}aside' % self._XHTML_NS
        # Appending moves the note into the referencing document's body.
        body.append(dest)

    def writefile(self, fn):
        """Write the modified book to ``fn`` as a new zip archive.

        Members are emitted in the order of the source archive and with their
        original ``ZipInfo`` (so compression settings carry over); directory
        entries are dropped.  Spine documents and the rootfile are replaced
        by their re-serialised trees, everything else is copied verbatim.
        """
        # The context manager guarantees the central directory is written
        # and the file closed, even if serialisation fails midway.
        with zipfile.ZipFile(fn, 'w') as out:
            for info in self.zf.infolist():
                name = info.filename
                if name.endswith('/'):
                    continue
                print('%s %s' % (name, info.compress_type))
                if name in self.docs:
                    data = ET.tostring(self.docs[name].doc)
                elif name == self.content_fn:
                    data = ET.tostring(self.content)
                else:
                    data = self.zf.read(name)
                out.writestr(info, data)
if __name__ == '__main__':
    import sys

    # Usage: script INPUT.epub OUTPUT.epub
    # Exit with a usage hint rather than an IndexError traceback when an
    # argument is missing; extra arguments are ignored as before.
    if len(sys.argv) < 3:
        sys.exit('usage: %s INPUT.epub OUTPUT.epub' % sys.argv[0])
    ee = Epub(sys.argv[1])
    ee.writefile(sys.argv[2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment