Created
March 25, 2014 05:10
-
-
Save dunhamsteve/9755584 to your computer and use it in GitHub Desktop.
This is a rough script to rewrite footnotes in an epub file to match what iBooks expects for pop-up footnotes. I'm posting it in case it is useful to someone.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# Written and placed in the public domain by Steve Dunham
# Tries to find footnotes in an epub and transform them into iBooks/epub3 popup footnotes.
#
# This works with a couple of Terry Pratchett books; it will probably need tweaking for other books.
# This script tries to:
#
# - find the footnote links
# - ensure that there is nothing but text inside the A tag
# - add epub:type=noteref to the A tag
# - ensure that the footnote is in the same html file
# - make the footnote an ASIDE element with epub:type=footnote
#
# I also change the id of the document so iBooks doesn't get confused.
#
# TODO:
# - Remove backlinks
# - Figure out what to do about nested footnotes. (e.g. Colour of Magic)
import copy
import os
import zipfile
from collections import namedtuple

try:
    from urlparse import urljoin  # Python 2
except ImportError:
    from urllib.parse import urljoin  # Python 3

import lxml.etree as ET
# One spine entry of the epub:
#   fn   - path of the document inside the archive (package path joined with href)
#   doc  - the parsed lxml tree of that document
#   item - the manifest element that the spine itemref points at
#   ref  - the itemref's ``idref`` value
Document = namedtuple('Document', "fn doc item ref")
def text(n):
    """Return every text node found under *n*, joined by single spaces.

    Args:
        n: an element exposing an lxml-style ``xpath`` method, or ``None``.

    Returns:
        The joined text; an empty string when *n* is ``None`` or holds no
        text, so callers can pass the result of a failed ``find()`` directly.
    """
    if n is None:
        return ''
    return ' '.join(n.xpath('.//text()'))
class Epub(object):
    """An epub archive whose footnotes are rewritten as iBooks pop-up notes.

    Constructing an instance opens the archive, parses the package document
    and every spine document, and rewrites the footnote links in memory.
    Call ``writefile`` to save the result as a new epub.

    Attributes:
        zf: the open source ``zipfile.ZipFile``.
        content_fn: archive path of the package document named by
            ``META-INF/container.xml``.
        content: the parsed package document.
        docs: maps archive path -> ``Document`` for every spine item.
        spine: the ``Document`` entries in spine order.
    """

    XHTML_NS = 'http://www.w3.org/1999/xhtml'
    EPUB_NS = 'http://www.idpf.org/2007/ops'

    def __init__(self, infile):
        """Open *infile*, parse its spine and rewrite the footnotes.

        Args:
            infile: anything ``zipfile.ZipFile`` accepts (path or file object).
        """
        self.zf = zf = zipfile.ZipFile(infile)
        for info in zf.infolist():
            print(info.filename)

        container = ET.parse(zf.open('META-INF/container.xml'))
        ET.register_namespace('epub', self.EPUB_NS)
        self.content_fn = content_fn = (
            container.find('//{*}rootfile').get('full-path'))
        self.content = content = ET.parse(zf.open(content_fn))

        # Fixes issues with XHTML named entities (e.g. &rsquo;).
        # NOTE(review): load_dtd/resolve_entities make the parser expand
        # entities declared by the book itself -- only run this on epubs you
        # trust.
        parser = ET.XMLParser(resolve_entities=True, load_dtd=True,
                              ns_clean=True)

        self.docs = {}
        self.spine = []
        for itemref in content.findall('//{*}spine/{*}itemref'):
            ref = itemref.get('idref')
            item = content.find('//*[@id="%s"]' % ref)
            # Manifest hrefs are relative to the package document.
            fn = urljoin(content_fn, item.get('href'))
            doc = ET.parse(zf.open(fn), parser)
            d = Document(fn, doc, item, ref)
            self.docs[fn] = d
            self.spine.append(d)

        # For now I'm tweaking the identifier so ibooks doesn't get confused.
        idid = content.getroot().get('unique-identifier')
        idel, = content.xpath('//*[@id="%s"]' % idid)
        # An empty identifier element has text None; treat it as ''.
        idel.text = (idel.text or '') + '+fn'

        # REVIEW - this works for one document, check the rest.
        for d in self.spine:
            print('*** %s' % d.fn)
            self._rewrite_footnotes(d)

    def _rewrite_footnotes(self, d):
        """Convert the footnote links of spine document *d* to pop-up notes."""
        epub_type = '{%s}type' % self.EPUB_NS
        xhtml_a = '{%s}a' % self.XHTML_NS
        xhtml_sup = '{%s}sup' % self.XHTML_NS
        xhtml_aside = '{%s}aside' % self.XHTML_NS

        # This dance forces the epub namespace declaration onto the root
        # element: setting a namespaced attribute makes lxml declare the
        # prefix there.  Only the dummy attribute is removed afterwards so
        # the root's real attributes (xml:lang, class, ...) survive.
        root = d.doc.getroot()
        marker = '{%s}foo' % self.EPUB_NS
        root.set(marker, ' ')
        del root.attrib[marker]

        here = self.spine.index(d)
        for a in d.doc.findall('//{*}a'):
            if not a.get('href'):
                continue

            children = list(a)
            if children and children[0].tag.endswith('sup'):
                # swap tags - iBooks footnote must have textual content
                sup = children[0]
                sup.tag = xhtml_a
                a.tag = xhtml_sup
                sup.attrib.update(a.attrib)
                a.attrib.clear()
                a = sup

            fn, sep, frag = urljoin(d.fn, a.get('href')).partition('#')
            if not sep or fn not in self.docs:
                # No fragment, or the target is external / not a spine
                # document: an ordinary link, not a footnote reference.
                continue
            fdoc = self.docs[fn]
            if here > self.spine.index(fdoc):
                # Only links pointing at this or a later spine document are
                # treated as footnote references.
                continue

            print('%s %s %s' % (fn, frag, fdoc))
            print('CC %s %s' % (a.get('href'),
                                ET.tostring(a, with_tail=False)))
            dest = fdoc.doc.find('//*[@id="%s"]' % frag)
            if dest is None or not text(dest).strip():
                # Dangling or empty target: nothing to show in a pop-up.
                continue

            # ok, we got one, process...
            a.set(epub_type, 'noteref')
            a.set('href', '#%s' % frag)
            # we might want this to be aside wrapped around 'dest', lifting
            # the id
            dest.set(epub_type, 'footnote')
            dest.tag = xhtml_aside
            # The note must live in the same file as its reference, so it is
            # moved to the end of the referencing document's body.
            d.doc.find('{*}body').append(dest)

    def writefile(self, fn):
        """Write the (modified) book to *fn* as a new epub archive.

        Entries keep the order and compression type of the source archive;
        spine documents and the package document are re-serialised from the
        in-memory trees, everything else is copied through unchanged.

        Args:
            fn: destination path or file object accepted by
                ``zipfile.ZipFile``.
        """
        # The context manager guarantees the central directory is written
        # even if serialisation fails part-way through.
        with zipfile.ZipFile(fn, 'w') as out:
            for info in self.zf.infolist():
                if info.filename.endswith('/'):
                    continue
                print('%s %s' % (info.filename, info.compress_type))
                if info.filename in self.docs:
                    data = ET.tostring(self.docs[info.filename].doc)
                elif info.filename == self.content_fn:
                    data = ET.tostring(self.content)
                else:
                    data = self.zf.read(info.filename)
                # writestr() rewrites offsets and sizes on the ZipInfo it is
                # given; hand it a copy so the source archive stays readable
                # and writefile() can be called more than once.
                out.writestr(copy.copy(info), data)
if __name__ == '__main__':
    import sys

    # Usage: script INPUT.epub OUTPUT.epub -- report missing arguments
    # instead of failing with an IndexError.
    if len(sys.argv) < 3:
        sys.exit('usage: %s INPUT.epub OUTPUT.epub' % sys.argv[0])
    ee = Epub(sys.argv[1])
    ee.writefile(sys.argv[2])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment