Skip to content

Instantly share code, notes, and snippets.

@jrk
Created November 8, 2009 20:59
Show Gist options
  • Save jrk/229472 to your computer and use it in GitHub Desktop.
Convert CHM files to the LRF ebook format.
from __future__ import with_statement
''' CHM File decoding support '''
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
' and Alex Bramley <a.bramley at gmail.com>.'
import sys, os, re, shutil
from tempfile import mkdtemp
from mimetypes import guess_type as guess_mimetype
from htmlentitydefs import name2codepoint
from pprint import PrettyPrinter
from BeautifulSoup import BeautifulSoup
from chm.chm import CHMFile
from chm.chmlib import (
CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL,
chm_enumerate, chm_retrieve_object,
)
from calibre.ebooks.lrf import option_parser as lrf_parser
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator, Guide
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
def option_parser():
    '''Build the standard LRF command-line parser, extended with a
    CHM-specific output-directory option.'''
    p = lrf_parser('Usage: %prog [options] mybook.chm')
    p.add_option(
        '-d', '--output-dir', default='.',
        help=_('Output directory. Defaults to current directory.'))
    return p
class CHMError(Exception):
    '''Raised when a CHM archive cannot be opened, or a member of it
    cannot be located or read.'''
    pass
class CHMReader(CHMFile):
    '''Reads a CHM (Compiled HTML Help) archive, extracting its contents
    and producing OPF/NCX metadata files for conversion.'''

    def __init__(self, input):
        '''Open the CHM file at path *input*; raises CHMError when the
        archive cannot be loaded.'''
        CHMFile.__init__(self)
        if not self.LoadCHM(input):
            raise CHMError("Unable to open CHM file '%s'"%(input,))
        self._contents = None      # cached member-path list (see Contents())
        self._playorder = 0        # running NCX play-order counter
        self._metadata = False     # set True once GetMetadata() has run
        self._extracted = False    # set True once ExtractFiles() has run
        # we'll be creating two new files on top of the extracted stuff from
        # the CHM -- OPF metadata and NCX table of contents. Let's put them in
        # the same place as the '.hhc' file, which is the CHM TOC.
        self.root, ext = os.path.splitext(self.topics.lstrip('/'))
        self.opf_path = self.root + ".opf"
        self.ncx_path = self.root + ".ncx"
def GetMetadata(self, basedir=None):
    '''Gets meta-data from the CHM file into an OPFCreator object.

    Takes an optional 'basedir' argument, which is provided to the created
    meta-data objects so that they can work out relative paths. Defaults
    to the current working directory *at call time* (the old default of
    ``os.getcwdu()`` was evaluated once, at import time).'''
    if basedir is None:
        basedir = os.getcwdu()
    self.opf = OPFCreator(basedir, self.title)
    self.opf.title_sort = self._title_sort()
    # now, attempt to grab vaguely standard metadata from the "home" page.
    home = BeautifulSoup(self.GetFile(self.home))
    self._get_authors(home)
    self._get_publisher(home)
    self._get_isbn(home)
    self._get_comments(home)
    self._get_coverpath(home)
    self.opf.create_manifest(map(lambda x: (x, guess_mimetype(x)[0]), self.Contents()))
    tocsoup = BeautifulSoup(self.GetTopicsTree())
    self.toc = self._parse_toc(tocsoup.body.ul, basedir)
    # we are providing an ncx index too, so let's only put top-level
    # TOC stuff in the spine, for brevity's sake...
    self.opf.create_spine([item.href for item in self.toc if item.href])
    self.opf.set_toc(self.toc)
    self.opf.guide = self._create_guide(tocsoup, basedir)
    self._metadata = True
def _title_sort(self):
    '''Return the title with a leading article moved to the end for
    sorting, e.g. "The Book" -> "Book, The".'''
    sort_title = self.title
    for article in ('a ', 'the '):
        n = len(article)
        if sort_title[:n].lower() == article:
            sort_title = sort_title[n:] + ", " + sort_title[:n - 1]
    return sort_title
def _metadata_from_table(self, soup, searchfor):
    '''Find a table cell whose text matches the *searchfor* regex
    (case-insensitive) and return the associated metadata value as a
    cleaned string, or None when no matching cell exists.'''
    td = soup.find('td', text=re.compile(searchfor, flags=re.I))
    if td is None:
        return None
    td = td.parent
    # there appears to be multiple ways of structuring the metadata
    # on the home page. cue some nasty special-case hacks...
    if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(), flags=re.I):
        # label-only cell: the value lives in the next <td> over
        meta = self._detag(td.findNextSibling('td'))
        return re.sub('^:', '', meta).strip()
    else:
        # "label: value" in a single cell: strip everything up to the colon
        meta = self._detag(td)
        return re.sub(r'^[^:]+:', '', meta).strip()
def _metadata_from_span(self, soup, searchfor):
    '''Return the stripped contents of the first <span> whose class
    attribute matches the *searchfor* regex (case-insensitive), or None
    when no such span exists.'''
    pattern = re.compile(searchfor, flags=re.I)
    hit = soup.find('span', {'class': pattern})
    if hit is None:
        return None
    # this metadata might need some cleaning up still :/
    return hit.renderContents().strip()
def _get_authors(self, soup):
    '''Populate self.opf.authors and self.opf.author_sort from the home
    page, falling back to u'Unknown' when nothing can be found.'''
    aut = (self._metadata_from_span(soup, r'author')
           or self._metadata_from_table(soup, r'^\s*by\s*:?\s+'))
    if aut is None:
        self.opf.authors = [u'Unknown']
        self.opf.author_sort = u''
    else:
        # split on commas or the word "and"; the \b anchors fix the old
        # pattern, which (because \s* matches empty) split *inside* names
        # such as "Alexander" -> "Alex", "er".
        aut = re.split(r'\s*(?:,|\band\b)\s*',
                       re.sub(re.compile(r'^\s*by:?\s*', flags=re.I), '', aut))
        self.opf.authors = aut
        aut = aut[0].split()
        # assume sorting by first named author's surname
        # and further that surname == name.split()[-1]
        self.opf.author_sort = aut[-1] + ', ' + ' '.join(aut[0:-1])
def _get_publisher(self, soup):
    '''Set self.opf.publisher from an "imprint" span, falling back to a
    "publisher" table row (may end up None/empty when neither hits).'''
    value = self._metadata_from_span(soup, 'imprint')
    if not value:
        value = self._metadata_from_table(soup, 'publisher')
    self.opf.publisher = value
def _get_isbn(self, soup):
    '''Set self.opf.isbn from the home page, stripping a leading "ISBN:"
    label; leaves it None when no ISBN is present (the old code raised
    TypeError from re.sub(None) in that case).'''
    isbn = (self._metadata_from_span(soup, 'isbn')
            or self._metadata_from_table(soup, 'isbn'))
    if isbn is None:
        self.opf.isbn = None
        return
    self.opf.isbn = re.sub(re.compile(r'^\s*isbn\s*\:', flags=re.I), '', isbn).strip()
def _get_comments(self, soup):
    '''Build a short comments string ("Published <date>, <n> pages.")
    from the home page; empty string when either piece is missing.'''
    date = (self._metadata_from_span(soup, 'cwdate')
            or self._metadata_from_table(soup, 'pub date'))
    pages = (self._metadata_from_span(soup, 'pages')
             or self._metadata_from_table(soup, 'pages'))
    try:
        # date span can have copyright symbols in it...
        date = date.replace(u'\u00a9', '').strip()
        # and pages often comes as '(\d+ pages)'
        pages = re.search(r'\d+', pages).group(0)
        self.opf.comments = u'Published %s, %s pages.' % (date, pages)
    except (AttributeError, TypeError):
        # AttributeError: date is None, or pages contains no digits;
        # TypeError: pages is None (re.search on None) -- the old code
        # only caught AttributeError and crashed on that path.
        self.opf.comments = u''
def _get_coverpath(self, soup):
    '''Guess the cover image path on the home page and store it
    (relative to the CHM root) in self.opf.cover; None when no guess.'''
    self.opf.cover = None
    try:
        self.opf.cover = soup.find('img', alt=re.compile('cover', flags=re.I))['src']
    except TypeError:
        # meeehh, no handy alt-tag goodness, try some hackery
        # the basic idea behind this is that in general, the cover image
        # has a height:width ratio of ~1.25, whereas most of the nav
        # buttons are decidedly less than that.
        # what we do in this is work out that ratio, take 1.25 off it and
        # save the absolute value when we sort by this value, the smallest
        # one is most likely to be the cover image, hopefully.
        r = {}
        for img in soup('img'):
            try:
                r[abs(float(img['height'])/float(img['width'])-1.25)] = img['src']
            except KeyError:
                # interestingly, occasionally the only image without height
                # or width attrs is the cover... but it may also have no
                # src at all, which used to raise an uncaught KeyError here.
                try:
                    r[0] = img['src']
                except KeyError:
                    pass
        if r:
            # smallest deviation from the 1.25 ratio wins
            self.opf.cover = r[min(r.keys())]
        # else: the page has no usable images at all -- leave cover as
        # None (the old code raised IndexError on an empty list here).
    # this link comes from the internal html, which is in a subdir
    if self.opf.cover is not None:
        self.opf.cover = self.root + "/" + self.opf.cover
def _create_guide(self, soup, basedir=None):
    '''Build an OPF Guide from the TOC soup by searching for well-known
    section names (contents, copyright, preface, ...).

    *basedir* defaults to the current working directory at call time
    (the old default of ``os.getcwdu()`` was frozen at import time).'''
    if basedir is None:
        basedir = os.getcwdu()
    guide = Guide()
    guide.set_basedir(basedir)
    titlepage = Guide.Reference(self.home.lstrip('/'), basedir)
    titlepage.title = u'About this E-Book'
    titlepage.type = u'title-page'
    guide.append(titlepage)
    # let's try and get useful guide things from our toc soup.
    # maps the guide type attribute to display name and search regex
    # (renamed from `map`/`type` so the builtins are not shadowed)
    sections = {
        'toc': [u'Table of Contents', '(?:table of )?contents?'],
        'copyright-page': [u'Copyright', 'copyright'],
        'dedication': [u'Dedication', 'dedication'],
        'preface': [u'Preface', 'preface'],
        'foreword': [u'Foreword', 'foreword'],
        'acknowledgements': [u'Acknowledgements', 'acknowledgements'],
        'bibliography': [u'Bibliography', 'bibliography'],
        'index': [u'Index', 'index'],
        'glossary': [u'Glossary', 'glossary'],
        'colophon': [u'Colophon', 'colophon'],
        'text': [u'Start of Content', 'chapter 1'],
    }
    for ref_type, (title, pattern) in sections.items():
        obj = soup.find('param', {
            'name': 'Name',
            'value': re.compile(pattern, re.I)
        })
        if obj is None:
            continue
        href = obj.parent.find('param', {'name': 'Local'})['value']
        ref = Guide.Reference(href, basedir)
        ref.title = title
        ref.type = ref_type
        guide.append(ref)
    return guide
def _parse_toc(self, ul, basedir=None):
    '''Recursively convert a <ul> from the CHM .hhc soup into a TOC tree,
    assigning sequential NCX play orders as it goes.

    *basedir* defaults to the current working directory at call time.'''
    if basedir is None:
        basedir = os.getcwdu()
    toc = TOC(play_order=self._playorder, base_path=basedir)
    self._playorder += 1
    for li in ul('li', recursive=False):
        href = li.object('param', {'name': 'Local'})[0]['value']
        if href.count('#'):
            # split on the first '#' only; a stray extra '#' in the href
            # used to blow up the two-value unpack
            href, frag = href.split('#', 1)
        else:
            frag = None
        name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
        toc.add_item(href, frag, name, play_order=self._playorder)
        self._playorder += 1
        if li.ul:
            # pass basedir down -- the old code omitted it, so nested TOC
            # levels silently got a different (cwd-based) base path
            child = self._parse_toc(li.ul, basedir)
            child.parent = toc
            toc.append(child)
    return toc
def _detag(self, tag):
    '''Return the plain text of *tag*, recursively flattening child tags
    and decoding entities via _deentity().

    Uses a list + join instead of the old quadratic string concatenation,
    and no longer shadows the builtin ``str``.'''
    parts = []
    for elem in tag:
        if hasattr(elem, "contents"):
            # a nested Tag: flatten recursively
            parts.append(self._detag(elem))
        else:
            # a NavigableString: decode entities / collapse whitespace
            parts.append(self._deentity(elem))
    return "".join(parts)
def _deentity(self, elem):
    '''Decode HTML entity references (&amp;, &#160;, ...) in *elem* into
    unicode characters, then collapse all whitespace runs (including
    non-breaking spaces) to single plain spaces.'''
    def replace_entity(m):
        # numeric character reference, e.g. &#8212;
        if m.group(1)=='#':
            try:
                return unichr(int(m.group(2)))
            except ValueError:
                # not a parseable number -- leave the reference as-is
                return '&#%s;' % m.group(2)
        # named entity, e.g. &mdash;
        try:
            return unichr(name2codepoint[m.group(2)])
        except KeyError:
            # unknown entity name -- leave the reference as-is
            return '&%s;' % m.group(2)
    # rargh nbsp => \xa0, not a real space
    return re.sub(r'\s+', ' ', re.sub(r'&(#?)([^;]+);', replace_entity, elem).replace(u'\u00a0', ' '))
def GetFile(self, path):
    '''Return the raw bytes of the CHM member at *path*.

    Raises CHMError when the member cannot be resolved or is empty.'''
    # have to have abs paths for ResolveObject, but Contents() deliberately
    # makes them relative. So we don't have to worry, re-add the leading /.
    if path[0] != '/':
        path = '/' + path
    status, ui = self.ResolveObject(path)
    if status != CHM_RESOLVE_SUCCESS:
        raise CHMError("Unable to locate '%s' within CHM file '%s'"%(path, self.filename))
    length, data = self.RetrieveObject(ui)
    if length == 0:
        raise CHMError("'%s' is zero bytes in length!"%(path,))
    return data
def ExtractFiles(self, output_dir=None):
    '''Extract every member of the CHM under *output_dir*, running HTML
    files through _reformat() on the way out.

    *output_dir* defaults to the current working directory at call time
    (the old default of ``os.getcwdu()`` was frozen at import time).'''
    if output_dir is None:
        output_dir = os.getcwdu()
    for path in self.Contents():
        lpath = os.path.join(output_dir, path)
        self._ensure_dir(lpath)
        data = self.GetFile(path)
        if guess_mimetype(path)[0] == 'text/html':
            # strip scripts / nav tables before saving
            data = self._reformat(data)
        with open(lpath, 'wb') as f:
            f.write(data)
    self._extracted = True
def _reformat(self, data):
    '''Clean up one extracted HTML page: drop <script> tags, remove the
    top/bottom navigation tables, and normalise <img src> attributes.
    Returns prettified HTML, or the original bytes when parsing fails.'''
    try:
        html = BeautifulSoup(data)
    except UnicodeEncodeError:
        # hit some strange encoding problems...
        print "Unable to parse html for cleaning, leaving it :("
        return data
    # nuke javascript...
    [s.extract() for s in html('script')]
    # remove forward and back nav bars from the top/bottom of each page,
    # since they disrupt the flow of the text and waste space.
    # since we can't use [a,b] syntax to select arbitrary items from a list
    # we'll have to do this manually...
    t = html('table')
    if t:
        # a table within two siblings of the start/end of the document is
        # assumed to be a nav bar rather than page content
        if (t[0].previousSibling is None
            or t[0].previousSibling.previousSibling is None):
            t[0].extract()
        if (t[-1].nextSibling is None
            or t[-1].nextSibling.nextSibling is None):
            t[-1].extract()
    # for some very odd reason each page's content appears to be in a table
    # too. and this table has sub-tables for random asides... grr.
    # some images seem to be broken in some chm's :/
    for img in html('img'):
        try:
            # some are supposedly "relative"... lies.
            while img['src'].startswith('../'): img['src'] = img['src'][3:]
            # some have ";<junk>" at the end.
            img['src'] = img['src'].split(';')[0]
        except KeyError:
            # and some don't even have a src= ?!
            pass
    # now give back some pretty html.
    return html.prettify()
def Contents(self):
    '''Return (and cache) the list of file paths inside the CHM, with
    directory entries skipped and leading slashes stripped.'''
    if self._contents is None:
        found = []
        def collect(chm, ui, ctx):
            # directory entries end in '/'; store files relative to root
            if ui.path[-1] != '/':
                found.append(ui.path.lstrip('/'))
        chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, collect, None)
        self._contents = found
    return self._contents
def _ensure_dir(self, path):
    '''Create the parent directory of *path* (a file path) if it does
    not already exist. (Renamed the local so the builtin ``dir`` is no
    longer shadowed.)'''
    parent = os.path.dirname(path)
    if not os.path.isdir(parent):
        os.makedirs(parent)
def CreateMetafiles(self, output_dir=None):
    '''Write the OPF and NCX metadata files into *output_dir*, generating
    the metadata first if GetMetadata() has not yet run.

    *output_dir* defaults to the current working directory at call time
    (the old default of ``os.getcwdu()`` was frozen at import time).'''
    if output_dir is None:
        output_dir = os.getcwdu()
    if not self._metadata:
        self.GetMetadata(basedir=output_dir)
    # NOTE(review): this ensures output_dir's *parent* exists; extraction
    # normally creates the rest of the tree first -- confirm intent.
    self._ensure_dir(output_dir)
    # `with` guarantees both handles are closed even if render() raises
    # (the old code leaked them on exception)
    with open(os.path.join(output_dir, self.opf_path), 'wb') as opf_fd:
        with open(os.path.join(output_dir, self.ncx_path), 'wb') as ncx_fd:
            self.opf.render(opf_fd, ncx_fd, self.ncx_path)
def extract_content(self, output_dir=None):
    '''Convenience wrapper: extract all files, then write the OPF/NCX
    metadata, into *output_dir* (defaults to the current working
    directory at call time rather than at import time).'''
    if output_dir is None:
        output_dir = os.getcwdu()
    self.ExtractFiles(output_dir=output_dir)
    self.CreateMetafiles(output_dir=output_dir)
def process_file(f, options, logger):
tdir = mkdtemp(prefix='chm2oeb_')
f = os.path.abspath(os.path.expanduser(f))
if not options.output:
ext = '.lrs' if options.lrs else '.lrf'
options.output = os.path.splitext(f)[0] + ext
rdr = CHMReader(f)
print "Extracting CHM to ", tdir
rdr.extract_content(tdir)
options.opf = os.path.join(tdir, rdr.opf_path)
try:
html_process_file(os.path.join(tdir, rdr.home.lstrip('/')), options)
finally:
try:
shutil.rmtree(tdir)
except:
print "Failed to delete tempdir ", tdir
def main(args=sys.argv, logger=None):
parser = option_parser()
options, args = parser.parse_args(args)
if len(args) != 2:
parser.print_help()
print
print "FAIL: provide a CHM file as an argument!"
return 1
process_file(args[1], options, logger)
# tdir = mkdtemp(prefix='chm2oeb_', dir='.')
# rdr = CHMReader(args[1])
# rdr.extract_content(tdir)
return 0
# allow running this module directly as a script
if __name__ == '__main__':
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment