Convert CHM files to the LRF ebook format.
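A quick usage sketch (hedged: this assumes the script is saved as chm2lrf.py in a Python 2 environment where this era of calibre, python-chm and BeautifulSoup 3 are importable; the input filename is just an example):

    python chm2lrf.py mybook.chm

This extracts the CHM's contents plus generated OPF/NCX metadata into a temporary directory, then hands the book's home page to calibre's HTML-to-LRF converter, writing mybook.lrf next to the input unless an output path is supplied via the options the base LRF parser provides.
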
from __future__ import with_statement

''' CHM File decoding support '''

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
                ' and Alex Bramley <a.bramley at gmail.com>.'

import sys, os, re, shutil

from tempfile import mkdtemp
from mimetypes import guess_type as guess_mimetype
from htmlentitydefs import name2codepoint
from pprint import PrettyPrinter

from BeautifulSoup import BeautifulSoup
from chm.chm import CHMFile
from chm.chmlib import (
    CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL,
    chm_enumerate, chm_retrieve_object,
)

from calibre.ebooks.lrf import option_parser as lrf_parser
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf import OPFCreator, Guide
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file
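
# NB: this is Python 2 code throughout -- htmlentitydefs, unichr,
# os.getcwdu and print statements have no direct Python 3 equivalents --
# and it expects the BeautifulSoup 3 and python-chm (chmlib) bindings.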

def option_parser():
    parser = lrf_parser('Usage: %prog [options] mybook.chm')
    parser.add_option(
        '-d', '--output-dir', default='.',
        help=_('Output directory. Defaults to current directory.'))
    return parser
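
# the CHM-specific flag is the only addition here; the shared LRF options
# (options.output and options.lrs, which process_file below relies on)
# presumably come from the base lrf_parser.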

class CHMError(Exception):
    pass

class CHMReader(CHMFile):
    def __init__(self, input):
        CHMFile.__init__(self)
        if not self.LoadCHM(input):
            raise CHMError("Unable to open CHM file '%s'"%(input,))
        self._contents  = None
        self._playorder = 0
        self._metadata  = False
        self._extracted = False
        # we'll be creating two new files on top of the extracted stuff from
        # the CHM -- OPF metadata and NCX table of contents. Let's put them in
        # the same place as the '.hhc' file, which is the CHM TOC.
        self.root, ext = os.path.splitext(self.topics.lstrip('/'))
        self.opf_path = self.root + ".opf"
        self.ncx_path = self.root + ".ncx"

    def GetMetadata(self, basedir=os.getcwdu()):
        '''Gets meta-data from the CHM file into an OPFCreator object.
        Takes an optional 'basedir' argument, which is provided to the created
        meta-data objects so that they can work out relative paths.'''
        self.opf = OPFCreator(basedir, self.title)
        self.opf.title_sort = self._title_sort()
        # now, attempt to grab vaguely standard metadata from the "home" page.
        home = BeautifulSoup(self.GetFile(self.home))
        self._get_authors(home)
        self._get_publisher(home)
        self._get_isbn(home)
        self._get_comments(home)
        self._get_coverpath(home)
        self.opf.create_manifest(map(lambda x: (x, guess_mimetype(x)[0]), self.Contents()))
        tocsoup = BeautifulSoup(self.GetTopicsTree())
        self.toc = self._parse_toc(tocsoup.body.ul, basedir)
        # we are providing an ncx index too, so let's only put top-level
        # TOC stuff in the spine, for brevity's sake...
        self.opf.create_spine([item.href for item in self.toc if item.href])
        self.opf.set_toc(self.toc)
        self.opf.guide = self._create_guide(tocsoup, basedir)
        self._metadata = True
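
    # _title_sort moves a leading article to the end of the title, which is
    # the form most ebook software expects for sorting, e.g.
    #   "The Pragmatic Programmer" -> "Pragmatic Programmer, The"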
    def _title_sort(self):
        prefixes = ('a ', 'the ')
        ts = self.title
        for prefix in prefixes:
            if ts[0:len(prefix)].lower() == prefix:
                ts = ts[len(prefix):len(ts)]+", "+ts[0:len(prefix)-1]
        return ts

    def _metadata_from_table(self, soup, searchfor):
        td = soup.find('td', text=re.compile(searchfor, flags=re.I))
        if td is None:
            return None
        td = td.parent
        # there appear to be multiple ways of structuring the metadata
        # on the home page, so cue some nasty special-case hacks...
        if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(), flags=re.I):
            meta = self._detag(td.findNextSibling('td'))
            return re.sub('^:', '', meta).strip()
        else:
            meta = self._detag(td)
            return re.sub(r'^[^:]+:', '', meta).strip()

    def _metadata_from_span(self, soup, searchfor):
        span = soup.find('span', {'class': re.compile(searchfor, flags=re.I)})
        if span is None:
            return None
        # this metadata might need some cleaning up still :/
        return span.renderContents().strip()

    def _get_authors(self, soup):
        aut = (self._metadata_from_span(soup, r'author')
            or self._metadata_from_table(soup, r'^\s*by\s*:?\s+'))
        if aut is None:
            self.opf.authors = [u'Unknown']
            self.opf.author_sort = u''
        else:
            aut = re.split(r'\s*(?:,|and)\s*',
                re.sub(re.compile(r'^\s*by:?\s*', flags=re.I), '', aut))
            self.opf.authors = aut
            aut = aut[0].split()
            # assume sorting by first named author's surname
            # and further that surname == name.split()[-1]
            self.opf.author_sort = aut[-1] + ', ' + ' '.join(aut[0:-1])

    def _get_publisher(self, soup):
        self.opf.publisher = (self._metadata_from_span(soup, 'imprint')
            or self._metadata_from_table(soup, 'publisher'))

    def _get_isbn(self, soup):
        isbn = (self._metadata_from_span(soup, 'isbn')
            or self._metadata_from_table(soup, 'isbn'))
        if isbn is None:
            # no ISBN anywhere on the home page; don't blow up in re.sub
            self.opf.isbn = None
            return
        self.opf.isbn = re.sub(re.compile(r'^\s*isbn\s*\:', flags=re.I), '', isbn).strip()

    def _get_comments(self, soup):
        date = (self._metadata_from_span(soup, 'cwdate')
            or self._metadata_from_table(soup, 'pub date'))
        pages = (self._metadata_from_span(soup, 'pages')
            or self._metadata_from_table(soup, 'pages'))
        try:
            # date span can have copyright symbols in it...
            date = date.replace(u'\u00a9', '').strip()
            # and pages often comes as '(\d+ pages)'
            pages = re.search(r'\d+', pages).group(0)
            self.opf.comments = u'Published %s, %s pages.' % (date, pages)
        except AttributeError:
            # either date or pages was None, or pages had no digits in it
            self.opf.comments = u''

    def _get_coverpath(self, soup):
        self.opf.cover = None
        try:
            self.opf.cover = soup.find('img', alt=re.compile('cover', flags=re.I))['src']
        except TypeError:
            # meeehh, no handy alt-tag goodness, try some hackery:
            # the basic idea behind this is that in general, the cover image
            # has a height:width ratio of ~1.25, whereas most of the nav
            # buttons are decidedly less than that. So work out that ratio,
            # subtract 1.25, and store the absolute value; when we sort by
            # this value, the smallest one is most likely to be the cover
            # image, hopefully.
            r = {}
            for img in soup('img'):
                try:
                    r[abs(float(img['height'])/float(img['width'])-1.25)] = img['src']
                except KeyError:
                    # interestingly, occasionally the only image without height
                    # or width attrs is the cover...
                    r[0] = img['src']
            if r:
                l = r.keys()
                l.sort()
                self.opf.cover = r[l[0]]
        # this link comes from the internal html, which is in a subdir
        if self.opf.cover is not None:
            self.opf.cover = self.root + "/" + self.opf.cover

    def _create_guide(self, soup, basedir=os.getcwdu()):
        guide = Guide()
        guide.set_basedir(basedir)
        titlepage = Guide.Reference(self.home.lstrip('/'), basedir)
        titlepage.title = u'About this E-Book'
        titlepage.type = u'title-page'
        guide.append(titlepage)
        # let's try and get useful guide things from our toc soup;
        # map the guide type attribute to a name and a search regex
        guide_types = {
            'toc':              [u'Table of Contents', '(?:table of )?contents?'],
            'copyright-page':   [u'Copyright', 'copyright'],
            'dedication':       [u'Dedication', 'dedication'],
            'preface':          [u'Preface', 'preface'],
            'foreword':         [u'Foreword', 'foreword'],
            'acknowledgements': [u'Acknowledgements', 'acknowledgements'],
            'bibliography':     [u'Bibliography', 'bibliography'],
            'index':            [u'Index', 'index'],
            'glossary':         [u'Glossary', 'glossary'],
            'colophon':         [u'Colophon', 'colophon'],
            'text':             [u'Start of Content', 'chapter 1'],
        }
        for gtype, name in guide_types.items():
            obj = soup.find('param', {
                'name': 'Name',
                'value': re.compile(name[1], re.I)
            })
            if obj is None:
                continue
            href = obj.parent.find('param', {'name': 'Local'})['value']
            ref = Guide.Reference(href, basedir)
            ref.title = name[0]
            ref.type = gtype
            guide.append(ref)
        return guide
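
    # the '.hhc' TOC that GetTopicsTree() returns is HTML in which each
    # entry is an <object> holding <param> tags, roughly (a sketch only,
    # real files vary):
    #   <li><object type="text/sitemap">
    #     <param name="Name" value="Chapter 1">
    #     <param name="Local" value="ch01.html">
    #   </object></li>
    # _parse_toc walks these <ul>/<li> nests recursively, keeping a single
    # running play order across the whole tree.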
    def _parse_toc(self, ul, basedir=os.getcwdu()):
        toc = TOC(play_order=self._playorder, base_path=basedir)
        self._playorder += 1
        for li in ul('li', recursive=False):
            href = li.object('param', {'name': 'Local'})[0]['value']
            if href.count('#'):
                href, frag = href.split('#')
            else:
                frag = None
            name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
            toc.add_item(href, frag, name, play_order=self._playorder)
            self._playorder += 1
            if li.ul:
                # pass basedir down so nested entries resolve relative
                # paths against the same root as their parents
                child = self._parse_toc(li.ul, basedir)
                child.parent = toc
                toc.append(child)
        return toc

    def _detag(self, tag):
        text = ""
        for elem in tag:
            if hasattr(elem, "contents"):
                text += self._detag(elem)
            else:
                text += self._deentity(elem)
        return text

    def _deentity(self, elem):
        def replace_entity(m):
            if m.group(1) == '#':
                try:
                    return unichr(int(m.group(2)))
                except ValueError:
                    return '&#%s;' % m.group(2)
            try:
                return unichr(name2codepoint[m.group(2)])
            except KeyError:
                return '&%s;' % m.group(2)
        # rargh, &nbsp; => \xa0, not a real space
        return re.sub(r'\s+', ' ',
            re.sub(r'&(#?)([^;]+);', replace_entity, elem).replace(u'\u00a0', ' '))

    def GetFile(self, path):
        # ResolveObject needs absolute paths, but Contents() deliberately
        # makes them relative, so just re-add the leading '/' here.
        if path[0] != '/':
            path = '/' + path
        res, ui = self.ResolveObject(path)
        if res != CHM_RESOLVE_SUCCESS:
            raise CHMError("Unable to locate '%s' within CHM file '%s'"%(path, self.filename))
        size, data = self.RetrieveObject(ui)
        if size == 0:
            raise CHMError("'%s' is zero bytes in length!"%(path,))
        return data

    def ExtractFiles(self, output_dir=os.getcwdu()):
        for path in self.Contents():
            lpath = os.path.join(output_dir, path)
            self._ensure_dir(lpath)
            data = self.GetFile(path)
            with open(lpath, 'wb') as f:
                if guess_mimetype(path)[0] == 'text/html':
                    data = self._reformat(data)
                f.write(data)
        self._extracted = True

    def _reformat(self, data):
        try:
            html = BeautifulSoup(data)
        except UnicodeEncodeError:
            # hit some strange encoding problems...
            print "Unable to parse html for cleaning, leaving it :("
            return data
        # nuke javascript...
        [s.extract() for s in html('script')]
        # remove forward and back nav bars from the top/bottom of each page,
        # cos they really fuck with the flow of things and generally waste space.
        # since we can't use [a,b] syntax to select arbitrary items from a list,
        # we'll have to do this manually...
        t = html('table')
        if t:
            if (t[0].previousSibling is None
              or t[0].previousSibling.previousSibling is None):
                t[0].extract()
            if (t[-1].nextSibling is None
              or t[-1].nextSibling.nextSibling is None):
                t[-1].extract()
        # for some very odd reason each page's content appears to be in a table
        # too, and this table has sub-tables for random asides... grr.
        # some images also seem to be broken in some CHMs :/
        for img in html('img'):
            try:
                # some are supposedly "relative"... lies.
                while img['src'].startswith('../'):
                    img['src'] = img['src'][3:]
                # some have ";<junk>" at the end.
                img['src'] = img['src'].split(';')[0]
            except KeyError:
                # and some don't even have a src= ?!
                pass
        # now give back some pretty html.
        return html.prettify()
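
    # Contents() builds the file list by letting chmlib walk the archive:
    # chm_enumerate invokes the callback once per stored object, passing
    # the open file, that object's unit info (ui) and the context argument.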
    def Contents(self):
        if self._contents is not None:
            return self._contents
        paths = []
        def get_paths(chm, ui, ctx):
            # skip directories
            if ui.path[-1] != '/':
                # and make paths relative
                paths.append(ui.path.lstrip('/'))
        chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, get_paths, None)
        self._contents = paths
        return self._contents

    def _ensure_dir(self, path):
        dirname = os.path.dirname(path)
        if dirname and not os.path.isdir(dirname):
            os.makedirs(dirname)

    def CreateMetafiles(self, output_dir=os.getcwdu()):
        if not self._metadata:
            self.GetMetadata(basedir=output_dir)
        # opf_path/ncx_path can contain subdirectories (they live next to
        # the '.hhc'), so make sure the full target directory exists
        self._ensure_dir(os.path.join(output_dir, self.opf_path))
        with open(os.path.join(output_dir, self.opf_path), 'wb') as opf_fd:
            with open(os.path.join(output_dir, self.ncx_path), 'wb') as ncx_fd:
                self.opf.render(opf_fd, ncx_fd, self.ncx_path)

    def extract_content(self, output_dir=os.getcwdu()):
        self.ExtractFiles(output_dir=output_dir)
        self.CreateMetafiles(output_dir=output_dir)
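
# that's the reader; the driver functions below wire it into calibre's
# existing HTML -> LRF pipeline: extract everything into a temp dir, point
# options.opf at the generated metadata, then hand the CHM's "home" page to
# html_process_file as the conversion entry point.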
def process_file(f, options, logger):
    tdir = mkdtemp(prefix='chm2oeb_')
    f = os.path.abspath(os.path.expanduser(f))
    if not options.output:
        ext = '.lrs' if options.lrs else '.lrf'
        options.output = os.path.splitext(f)[0] + ext
    rdr = CHMReader(f)
    print "Extracting CHM to", tdir
    rdr.extract_content(tdir)
    options.opf = os.path.join(tdir, rdr.opf_path)
    try:
        html_process_file(os.path.join(tdir, rdr.home.lstrip('/')), options)
    finally:
        try:
            shutil.rmtree(tdir)
        except OSError:
            print "Failed to delete tempdir", tdir

def main(args=sys.argv, logger=None):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        print "FAIL: provide a CHM file as an argument!"
        return 1
    process_file(args[1], options, logger)
    # tdir = mkdtemp(prefix='chm2oeb_', dir='.')
    # rdr = CHMReader(args[1])
    # rdr.extract_content(tdir)
    return 0

if __name__ == '__main__':
    sys.exit(main())
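
The reader is also usable on its own, without the LRF step. A minimal sketch, assuming the file is saved as chm2lrf.py and noting that the target directory name here is arbitrary:

    from chm2lrf import CHMReader
    rdr = CHMReader('mybook.chm')
    rdr.extract_content('extracted')   # writes the HTML plus .opf/.ncx metadata

(This mirrors the commented-out lines left in main() above.)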