Convert CHM files to the LRF ebook format.
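A quick usage sketch (hedged: this assumes the script is saved as chm2lrf.py in a Python 2 environment where this era of calibre, python-chm and BeautifulSoup 3 are importable; the input filename is just an example):

    python chm2lrf.py mybook.chm

This extracts the CHM's contents plus generated OPF/NCX metadata into a temporary directory, then hands the book's home page to calibre's HTML-to-LRF converter, writing mybook.lrf next to the input unless an output path is supplied via the options the base LRF parser provides.
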
from __future__ import with_statement

''' CHM File decoding support '''

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
                ' and Alex Bramley <a.bramley at gmail.com>.'

import sys, os, re, shutil

from tempfile import mkdtemp
from mimetypes import guess_type as guess_mimetype
from htmlentitydefs import name2codepoint
from pprint import PrettyPrinter

from BeautifulSoup import BeautifulSoup
from chm.chm import CHMFile
from chm.chmlib import (
    CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL,
    chm_enumerate, chm_retrieve_object,
)

from calibre.ebooks.lrf import option_parser as lrf_parser
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator, Guide
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
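
# NB: this is Python 2 code throughout -- htmlentitydefs, unichr,
# os.getcwdu and print statements have no direct Python 3 equivalents --
# and it expects the BeautifulSoup 3 and python-chm (chmlib) bindings.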

def option_parser():
    parser = lrf_parser('Usage: %prog [options] mybook.chm')
    parser.add_option(
        '-d', '--output-dir', default='.',
        help=_('Output directory. Defaults to current directory.'))
    return parser
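
# the CHM-specific flag is the only addition here; the shared LRF options
# (options.output and options.lrs, which process_file below relies on)
# presumably come from the base lrf_parser.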

class CHMError(Exception):
    pass

class CHMReader(CHMFile):
    def __init__(self, input):
        CHMFile.__init__(self)
        if not self.LoadCHM(input):
            raise CHMError("Unable to open CHM file '%s'"%(input,))
        self._contents  = None
        self._playorder = 0
        self._metadata  = False
        self._extracted = False
        # we'll be creating two new files on top of the extracted stuff from
        # the CHM -- OPF metadata and NCX table of contents. Let's put them in
        # the same place as the '.hhc' file, which is the CHM TOC.
        self.root, ext = os.path.splitext(self.topics.lstrip('/'))
        self.opf_path = self.root + ".opf"
        self.ncx_path = self.root + ".ncx"

    def GetMetadata(self, basedir=os.getcwdu()):
        '''Gets meta-data from the CHM file into an OPFCreator object.
        Takes an optional 'basedir' argument, which is provided to the created
        meta-data objects so that they can work out relative paths.'''
        self.opf = OPFCreator(basedir, self.title)
        self.opf.title_sort = self._title_sort()
        # now, attempt to grab vaguely standard metadata from the "home" page.
        home = BeautifulSoup(self.GetFile(self.home))
        self._get_authors(home)
        self._get_publisher(home)
        self._get_isbn(home)
        self._get_comments(home)
        self._get_coverpath(home)
        self.opf.create_manifest(map(lambda x: (x, guess_mimetype(x)[0]), self.Contents()))
        tocsoup = BeautifulSoup(self.GetTopicsTree())
        self.toc = self._parse_toc(tocsoup.body.ul, basedir)
        # we are providing an ncx index too, so let's only put top-level
        # TOC stuff in the spine, for brevity's sake...
        self.opf.create_spine([item.href for item in self.toc if item.href])
        self.opf.set_toc(self.toc)
        self.opf.guide = self._create_guide(tocsoup, basedir)
        self._metadata = True
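
    # _title_sort moves a leading article to the end of the title, which is
    # the form most ebook software expects for sorting, e.g.
    #   "The Pragmatic Programmer" -> "Pragmatic Programmer, The"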
    def _title_sort(self):
        prefixes = ('a ', 'the ')
        ts = self.title
        for prefix in prefixes:
            if ts[0:len(prefix)].lower() == prefix:
                ts = ts[len(prefix):len(ts)]+", "+ts[0:len(prefix)-1]
        return ts

    def _metadata_from_table(self, soup, searchfor):
        td = soup.find('td', text=re.compile(searchfor, flags=re.I))
        if td is None:
            return None
        td = td.parent
        # there appear to be multiple ways of structuring the metadata
        # on the home page, so cue some nasty special-case hacks...
        if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(), flags=re.I):
            meta = self._detag(td.findNextSibling('td'))
            return re.sub('^:', '', meta).strip()
        else:
            meta = self._detag(td)
            return re.sub(r'^[^:]+:', '', meta).strip()

    def _metadata_from_span(self, soup, searchfor):
        span = soup.find('span', {'class': re.compile(searchfor, flags=re.I)})
        if span is None:
            return None
        # this metadata might need some cleaning up still :/
        return span.renderContents().strip()

    def _get_authors(self, soup):
        aut = (self._metadata_from_span(soup, r'author')
            or self._metadata_from_table(soup, r'^\s*by\s*:?\s+'))
        if aut is None:
            self.opf.authors = [u'Unknown']
            self.opf.author_sort = u''
        else:
            aut = re.split(r'\s*(?:,|and)\s*',
                re.sub(re.compile(r'^\s*by:?\s*', flags=re.I), '', aut))
            self.opf.authors = aut
            aut = aut[0].split()
            # assume sorting by first named author's surname
            # and further that surname == name.split()[-1]
            self.opf.author_sort = aut[-1] + ', ' + ' '.join(aut[0:-1])

    def _get_publisher(self, soup):
        self.opf.publisher = (self._metadata_from_span(soup, 'imprint')
            or self._metadata_from_table(soup, 'publisher'))

    def _get_isbn(self, soup):
        isbn = (self._metadata_from_span(soup, 'isbn')
            or self._metadata_from_table(soup, 'isbn'))
        if isbn is None:
            # no ISBN anywhere on the home page; don't blow up in re.sub
            self.opf.isbn = None
            return
        self.opf.isbn = re.sub(re.compile(r'^\s*isbn\s*\:', flags=re.I), '', isbn).strip()

    def _get_comments(self, soup):
        date = (self._metadata_from_span(soup, 'cwdate')
            or self._metadata_from_table(soup, 'pub date'))
        pages = (self._metadata_from_span(soup, 'pages')
            or self._metadata_from_table(soup, 'pages'))
        try:
            # date span can have copyright symbols in it...
            date = date.replace(u'\u00a9', '').strip()
            # and pages often comes as '(\d+ pages)'
            pages = re.search(r'\d+', pages).group(0)
            self.opf.comments = u'Published %s, %s pages.' % (date, pages)
        except AttributeError:
            # either date or pages was None, or pages had no digits in it
            self.opf.comments = u''

    def _get_coverpath(self, soup):
        self.opf.cover = None
        try:
            self.opf.cover = soup.find('img', alt=re.compile('cover', flags=re.I))['src']
        except TypeError:
            # meeehh, no handy alt-tag goodness, try some hackery:
            # the basic idea behind this is that in general, the cover image
            # has a height:width ratio of ~1.25, whereas most of the nav
            # buttons are decidedly less than that. So work out that ratio,
            # subtract 1.25, and store the absolute value; when we sort by
            # this value, the smallest one is most likely to be the cover
            # image, hopefully.
            r = {}
            for img in soup('img'):
                try:
                    r[abs(float(img['height'])/float(img['width'])-1.25)] = img['src']
                except KeyError:
                    # interestingly, occasionally the only image without height
                    # or width attrs is the cover...
                    r[0] = img['src']
            if r:
                l = r.keys()
                l.sort()
                self.opf.cover = r[l[0]]
        # this link comes from the internal html, which is in a subdir
        if self.opf.cover is not None:
            self.opf.cover = self.root + "/" + self.opf.cover

    def _create_guide(self, soup, basedir=os.getcwdu()):
        guide = Guide()
        guide.set_basedir(basedir)
        titlepage = Guide.Reference(self.home.lstrip('/'), basedir)
        titlepage.title = u'About this E-Book'
        titlepage.type = u'title-page'
        guide.append(titlepage)
        # let's try and get useful guide things from our toc soup;
        # map the guide type attribute to a name and a search regex
        guide_types = {
            'toc':              [u'Table of Contents', '(?:table of )?contents?'],
            'copyright-page':   [u'Copyright', 'copyright'],
            'dedication':       [u'Dedication', 'dedication'],
            'preface':          [u'Preface', 'preface'],
            'foreword':         [u'Foreword', 'foreword'],
            'acknowledgements': [u'Acknowledgements', 'acknowledgements'],
            'bibliography':     [u'Bibliography', 'bibliography'],
            'index':            [u'Index', 'index'],
            'glossary':         [u'Glossary', 'glossary'],
            'colophon':         [u'Colophon', 'colophon'],
            'text':             [u'Start of Content', 'chapter 1'],
        }
        for gtype, name in guide_types.items():
            obj = soup.find('param', {
                'name': 'Name',
                'value': re.compile(name[1], re.I)
            })
            if obj is None:
                continue
            href = obj.parent.find('param', {'name': 'Local'})['value']
            ref = Guide.Reference(href, basedir)
            ref.title = name[0]
            ref.type = gtype
            guide.append(ref)
        return guide
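
    # the '.hhc' TOC that GetTopicsTree() returns is HTML in which each
    # entry is an <object> holding <param> tags, roughly (a sketch only,
    # real files vary):
    #   <li><object type="text/sitemap">
    #     <param name="Name" value="Chapter 1">
    #     <param name="Local" value="ch01.html">
    #   </object></li>
    # _parse_toc walks these <ul>/<li> nests recursively, keeping a single
    # running play order across the whole tree.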
    def _parse_toc(self, ul, basedir=os.getcwdu()):
        toc = TOC(play_order=self._playorder, base_path=basedir)
        self._playorder += 1
        for li in ul('li', recursive=False):
            href = li.object('param', {'name': 'Local'})[0]['value']
            if href.count('#'):
                href, frag = href.split('#')
            else:
                frag = None
            name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
            toc.add_item(href, frag, name, play_order=self._playorder)
            self._playorder += 1
            if li.ul:
                # pass basedir down so nested entries resolve relative
                # paths against the same root as their parents
                child = self._parse_toc(li.ul, basedir)
                child.parent = toc
                toc.append(child)
        return toc

    def _detag(self, tag):
        text = ""
        for elem in tag:
            if hasattr(elem, "contents"):
                text += self._detag(elem)
            else:
                text += self._deentity(elem)
        return text

    def _deentity(self, elem):
        def replace_entity(m):
            if m.group(1) == '#':
                try:
                    return unichr(int(m.group(2)))
                except ValueError:
                    return '&#%s;' % m.group(2)
            try:
                return unichr(name2codepoint[m.group(2)])
            except KeyError:
                return '&%s;' % m.group(2)
        # rargh, &nbsp; => \xa0, not a real space
        return re.sub(r'\s+', ' ',
            re.sub(r'&(#?)([^;]+);', replace_entity, elem).replace(u'\u00a0', ' '))

    def GetFile(self, path):
        # ResolveObject needs absolute paths, but Contents() deliberately
        # makes them relative, so just re-add the leading '/' here.
        if path[0] != '/':
            path = '/' + path
        res, ui = self.ResolveObject(path)
        if res != CHM_RESOLVE_SUCCESS:
            raise CHMError("Unable to locate '%s' within CHM file '%s'"%(path, self.filename))
        size, data = self.RetrieveObject(ui)
        if size == 0:
            raise CHMError("'%s' is zero bytes in length!"%(path,))
        return data

    def ExtractFiles(self, output_dir=os.getcwdu()):
        for path in self.Contents():
            lpath = os.path.join(output_dir, path)
            self._ensure_dir(lpath)
            data = self.GetFile(path)
            with open(lpath, 'wb') as f:
                if guess_mimetype(path)[0] == 'text/html':
                    data = self._reformat(data)
                f.write(data)
        self._extracted = True

    def _reformat(self, data):
        try:
            html = BeautifulSoup(data)
        except UnicodeEncodeError:
            # hit some strange encoding problems...
            print "Unable to parse html for cleaning, leaving it :("
            return data
        # nuke javascript...
        [s.extract() for s in html('script')]
        # remove forward and back nav bars from the top/bottom of each page,
        # cos they really fuck with the flow of things and generally waste space.
        # since we can't use [a,b] syntax to select arbitrary items from a list,
        # we'll have to do this manually...
        t = html('table')
        if t:
            if (t[0].previousSibling is None
              or t[0].previousSibling.previousSibling is None):
                t[0].extract()
            if (t[-1].nextSibling is None
              or t[-1].nextSibling.nextSibling is None):
                t[-1].extract()
        # for some very odd reason each page's content appears to be in a table
        # too, and this table has sub-tables for random asides... grr.
        # some images also seem to be broken in some CHMs :/
        for img in html('img'):
            try:
                # some are supposedly "relative"... lies.
                while img['src'].startswith('../'):
                    img['src'] = img['src'][3:]
                # some have ";<junk>" at the end.
                img['src'] = img['src'].split(';')[0]
            except KeyError:
                # and some don't even have a src= ?!
                pass
        # now give back some pretty html.
        return html.prettify()
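
    # Contents() builds the file list by letting chmlib walk the archive:
    # chm_enumerate invokes the callback once per stored object, passing
    # the open file, that object's unit info (ui) and the context argument.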
    def Contents(self):
        if self._contents is not None:
            return self._contents
        paths = []
        def get_paths(chm, ui, ctx):
            # skip directories
            if ui.path[-1] != '/':
                # and make paths relative
                paths.append(ui.path.lstrip('/'))
        chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, get_paths, None)
        self._contents = paths
        return self._contents

    def _ensure_dir(self, path):
        dirname = os.path.dirname(path)
        if dirname and not os.path.isdir(dirname):
            os.makedirs(dirname)

    def CreateMetafiles(self, output_dir=os.getcwdu()):
        if not self._metadata:
            self.GetMetadata(basedir=output_dir)
        # opf_path/ncx_path can contain subdirectories (they live next to
        # the '.hhc'), so make sure the full target directory exists
        self._ensure_dir(os.path.join(output_dir, self.opf_path))
        with open(os.path.join(output_dir, self.opf_path), 'wb') as opf_fd:
            with open(os.path.join(output_dir, self.ncx_path), 'wb') as ncx_fd:
                self.opf.render(opf_fd, ncx_fd, self.ncx_path)

    def extract_content(self, output_dir=os.getcwdu()):
        self.ExtractFiles(output_dir=output_dir)
        self.CreateMetafiles(output_dir=output_dir)
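
# that's the reader; the driver functions below wire it into calibre's
# existing HTML -> LRF pipeline: extract everything into a temp dir, point
# options.opf at the generated metadata, then hand the CHM's "home" page to
# html_process_file as the conversion entry point.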
def process_file(f, options, logger):
    tdir = mkdtemp(prefix='chm2oeb_')
    f = os.path.abspath(os.path.expanduser(f))
    if not options.output:
        ext = '.lrs' if options.lrs else '.lrf'
        options.output = os.path.splitext(f)[0] + ext
    rdr = CHMReader(f)
    print "Extracting CHM to", tdir
    rdr.extract_content(tdir)
    options.opf = os.path.join(tdir, rdr.opf_path)
    try:
        html_process_file(os.path.join(tdir, rdr.home.lstrip('/')), options)
    finally:
        try:
            shutil.rmtree(tdir)
        except OSError:
            print "Failed to delete tempdir", tdir

def main(args=sys.argv, logger=None):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        print "FAIL: provide a CHM file as an argument!"
        return 1
    process_file(args[1], options, logger)
    # tdir = mkdtemp(prefix='chm2oeb_', dir='.')
    # rdr = CHMReader(args[1])
    # rdr.extract_content(tdir)
    return 0

if __name__ == '__main__':
    sys.exit(main())
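
The reader is also usable on its own, without the LRF step. A minimal sketch, assuming the file is saved as chm2lrf.py and noting that the target directory name here is arbitrary:

    from chm2lrf import CHMReader
    rdr = CHMReader('mybook.chm')
    rdr.extract_content('extracted')   # writes the HTML plus .opf/.ncx metadata

(This mirrors the commented-out lines left in main() above.)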