Created
January 21, 2013 13:43
-
-
Save jamesoutterside/4586153 to your computer and use it in GitHub Desktop.
Experimental code to look at an epub file and return metadata and a list of chapters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import posixpath
import zipfile
def get_epub_info(fname):
    """Return metadata and a chapter list for the EPUB archive at *fname*.

    The OPF package file is located through ``META-INF/container.xml``.
    Chapters are taken from the NCX table of contents when it can be read;
    otherwise every XHTML item of the manifest is listed, with its href
    standing in for the title.

    NOTE(review): ``etree`` and ``NS`` are not defined in the visible part of
    this file. ``etree`` is presumably ``lxml.etree`` and ``NS`` a
    prefix-to-URI mapping supplying the ``n``, ``pkg``, ``dc`` and ``ncx``
    prefixes used in the XPath expressions below -- confirm.

    Args:
        fname: Path (or file-like object) accepted by ``zipfile.ZipFile``.

    Returns:
        A ``(metadata, chapters)`` tuple.  ``metadata`` is a dict holding
        whichever of title, language, creator, date, identifier, rights and
        publisher are present.  ``chapters`` is a list of ``[src, title]``
        pairs in reading order.

    Raises:
        IndexError: If the container has no rootfile entry, or the package
            has no metadata or manifest element.
        KeyError: If ``META-INF/container.xml`` or the OPF file is missing
            from the archive.
    """
    # The context manager closes the archive even when parsing fails.
    with zipfile.ZipFile(fname) as archive:
        # find the contents metafile
        container = etree.fromstring(archive.read('META-INF/container.xml'))
        cfname = container.xpath(
            'n:rootfiles/n:rootfile/@full-path', namespaces=NS)[0]

        # grab the metadata block from the contents metafile
        package = etree.fromstring(archive.read(cfname))
        metadata_p = package.xpath(
            '/pkg:package/pkg:metadata', namespaces=NS)[0]

        # grab the manifest; its items are the fallback chapter source
        manifest_p = package.xpath(
            '/pkg:package/pkg:manifest', namespaces=NS)[0]

        # try for ncx file, used to build chapter list with actual names
        try:
            chapters = _ncx_chapters(archive, package, cfname)
        except Exception:
            # Deliberate best-effort: any problem with the NCX (absent,
            # unreadable, malformed) falls back to the plain manifest list.
            items = manifest_p.xpath(
                '/pkg:package/pkg:manifest/pkg:item[@media-type="application/xhtml+xml"]',
                namespaces=NS)
            chapters = [[i.attrib['href'], i.attrib['href']] for i in items]

    # repackage the metadata, skipping fields the book does not declare
    res = {}
    for s in ['title', 'language', 'creator', 'date', 'identifier',
              'rights', 'publisher']:
        values = metadata_p.xpath('dc:%s/text()' % (s), namespaces=NS)
        if values:
            res[s] = values[0]
    return res, chapters


def _nav_point_entry(nav_point):
    """Return ``[src, title]`` for a single NCX ``navPoint`` element."""
    title = nav_point.xpath('ncx:navLabel/ncx:text', namespaces=NS)[0].text
    content = nav_point.xpath('ncx:content/@src', namespaces=NS)[0]
    return [content, title]


def _ncx_chapters(archive, package, opf_name):
    """Build the ``[src, title]`` chapter list from the NCX file."""
    ncx_href = package.xpath(
        '/pkg:package/pkg:manifest/pkg:item[@id ="ncx"]/@href',
        namespaces=NS)[0]
    # Manifest hrefs are relative to the OPF file, not to the archive root,
    # so resolve against the OPF's directory before reading from the zip.
    ncx_name = posixpath.normpath(
        posixpath.join(posixpath.dirname(opf_name), ncx_href))
    ncx_tree = etree.fromstring(archive.read(ncx_name))

    chapters = []
    for nav_point in ncx_tree.xpath('ncx:navMap/ncx:navPoint', namespaces=NS):
        chapters.append(_nav_point_entry(nav_point))
        # Direct children are flattened in right after their parent; deeper
        # nesting levels are not visited.
        for child in nav_point.xpath('ncx:navPoint', namespaces=NS):
            chapters.append(_nav_point_entry(child))
    return chapters
def get_epub_images(file_name_and_path, extract_to='extract from path'):
    """Extract every image stored in an EPUB archive to a directory.

    Archive members are selected by file extension (``.jpg``, ``.jpeg``,
    ``.png``, ``.gif``), compared case-insensitively so that names such as
    ``COVER.JPG`` are picked up as well.

    Args:
        file_name_and_path: Path (or file-like object) accepted by
            ``zipfile.ZipFile``.
        extract_to: Directory the images are extracted into; each member
            keeps its path inside the archive.  The default is the
            placeholder directory name the function has always used.

    Returns:
        The list of paths of the extracted files, in archive order.
    """
    extensions = {".jpg", ".jpeg", ".png", ".gif"}
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(file_name_and_path) as archive:
        images = [
            name for name in archive.namelist()
            if os.path.splitext(name)[1].lower() in extensions
        ]
        return [archive.extract(name, extract_to) for name in images]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment