Skip to content

Instantly share code, notes, and snippets.

@Yasushi
Created July 11, 2018 04:59
Show Gist options
  • Save Yasushi/91ee2d1fa6aca8abbe22a7eea3e8e126 to your computer and use it in GitHub Desktop.
Save Yasushi/91ee2d1fa6aca8abbe22a7eea3e8e126 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8; -*-
import sys, os, glob, itertools, re, functools, pprint
from collections import OrderedDict
from lxml import etree
from ebooklib import epub
def xpath(query):
    """Compile *query* into a reusable ``etree.XPath`` evaluator.

    Two namespace prefixes are made available to the expression:
    ``h`` for XHTML elements and ``re`` for the EXSLT regular-expression
    functions (e.g. ``re:test``).
    """
    namespaces = {
        'h': 'http://www.w3.org/1999/xhtml',
        're': 'http://exslt.org/regular-expressions',
    }
    return etree.XPath(query, namespaces=namespaces)
def xpath_class(cls, name="*", append=""):
    """Build an XPath evaluator selecting descendants by CSS class.

    The class attribute is tested with the regular expression
    ``\\b<cls>\\b`` so that *cls* is found inside a multi-class attribute.

    cls    -- class name, interpolated into the regular expression as-is.
    name   -- element name to match (``*`` matches any element).
    append -- extra XPath text appended after the predicate,
              e.g. ``'//text()'``.
    """
    query = ".//%s[re:test(@class, '\\b%s\\b')]%s" % (name, cls, append)
    return xpath(query)
def xpath_id(id, name="*", append=""):
    """Return a callable that looks up a descendant by its ``id`` attribute.

    The returned function takes a node, evaluates the query beneath it and
    gives back the first result, or ``None`` when nothing matches.

    id     -- value the ``id`` attribute must equal.
    name   -- element name to match (``*`` matches any element).
    append -- extra XPath text appended after the predicate,
              e.g. ``'/text()'``.
    """
    query = ".//%s[@id='%s']%s" % (name, id, append)

    def first_match(node):
        # The expression is compiled on every call, as the original lambda did.
        for hit in xpath(query)(node):
            return hit
        return None

    return first_match
class NTOC:
    """Table of contents of one novel, parsed from its saved index page.

    Constructing an instance parses the index HTML at *path*, extracts the
    titles, author and synopsis element, and loads every page listed in the
    index as an ``Item``.

    Attributes set by ``__init__``:
      identifier       -- second-to-last path component of the handheld
                          ``<link rel="alternate">`` href.
      require_font     -- True only for the novel id ``n4955ee``.
      chapter_index    -- index into ``index_box`` of the selected chapter,
                          or None when the whole novel is selected.
      series_title, novel_title, novel_writername -- text of the elements
                          carrying the classes of the same names.
      novel_ex         -- the element with id ``novel_ex`` or None.
      index_box        -- list of ``(chapter_title, pages)`` in document
                          order; ``pages`` is a list of
                          ``(href, link_text, date)`` tuples.
      pages            -- every ``Item`` in index order.
      pagemap          -- ``Item.origpath`` -> ``Item``.
    """

    # Skeleton XHTML document used as the base of the cover page.
    TEMPLATE = u'''\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="ja">
<head>
<title>title</title>
<link rel="stylesheet" type="text/css" href="style.css" />
</head>
<body>
</body>
</html>
'''

    def __init__(self, path):
        """Parse the index page at *path* and load every page it lists.

        Raises IndexError when the page lacks the handheld alternate link
        or an element with class ``index_box``.
        """
        # Parse inside a ``with`` so the file handle is closed right away
        # instead of being left to the garbage collector.
        with open(path) as f:
            self.dom = etree.parse(f, etree.HTMLParser())

        handheld = self.dom.xpath('.//link[@rel="alternate"][@media="handheld"]/@href')
        self.identifier = handheld[0].split('/')[-2]
        # Only this one novel id gets a font embedded by create_book().
        self.require_font = self.identifier == 'n4955ee'
        self.chapter_index = None

        self.series_title = "".join(xpath_class('series_title', append='//text()')(self.dom))
        self.novel_title = "".join(xpath_class('novel_title', append='//text()')(self.dom))
        self.novel_writername = "".join(xpath_class('novel_writername', append='/a/text()')(self.dom))
        self.novel_ex = xpath_id('novel_ex')(self.dom)

        self.index_box = self._parse_index_box(xpath_class('index_box')(self.dom)[0])
        self.pages = [
            Item(href)
            for (_chapter, pages) in self.index_box
            for (href, _subtitle, _date) in pages
        ]
        self.pagemap = {page.origpath: page for page in self.pages}

    @staticmethod
    def _parse_index_box(index_box):
        """Group the children of the index element into chapters.

        A ``div`` child starts a new chapter titled by its text; a ``dl``
        child is one page entry added to the current chapter. Pages that
        appear before any ``div`` go into a chapter titled ``""``.
        Chapters without pages are dropped.

        Returns a list of ``(chapter_title, [(href, link_text, date), ...])``
        in document order.
        """
        chapters = [("", [])]
        for elem in index_box:
            if elem.tag == 'div':
                chapters.append((elem.text, []))
            elif elem.tag == 'dl':
                a = xpath('./dd/a')(elem)[0]
                # Strip one leading and one trailing slash from the link.
                href = re.sub(r'^/|/$', '', a.attrib['href'])
                date = "".join(s.strip() for s in xpath('./dt/text()')(elem))
                revdate = "".join(s.strip() for s in xpath('./dt/span/@title')(elem))
                if revdate:
                    date = "%s (%s)" % (date, revdate)
                chapters[-1][1].append((href, a.text, date))
        return [chapter for chapter in chapters if chapter[1]]

    def make_toc(self, chapter_index=None):
        """Select *chapter_index* (None = whole novel) and build ``self.toc``.

        Also sets ``self.htmltoc`` and ``self.metadata``.

        NOTE(review): ``TOC``, ``Metadata`` and ``self.generate_htmltoc`` are
        not defined in the visible part of this file; this method fails with
        NameError/AttributeError unless they are provided elsewhere.
        """
        self.chapter_index = chapter_index
        self.toc = TOC()
        for (i, (chapter, pages)) in enumerate(self.index_box):
            if self.chapter_index is not None and self.chapter_index != i:
                continue
            first = True
            c = self.toc
            for (href, subtitle, date) in pages:
                page = self.pagemap[href]
                if first and chapter:
                    # The first page of a titled chapter opens a sub-node
                    # that receives the rest of the chapter's pages.
                    first = False
                    c = self.toc.add(chapter, page.path)
                c.add(page.subtitle, page.path, 'novel_no')
        self.htmltoc = self.generate_htmltoc()
        t = TOC(u'目次', self.htmltoc[0])
        t.parent = self.toc
        self.toc.children.insert(0, t)
        self.metadata = Metadata(self.title(), authors=(self.novel_writername,))

    def coverpage(self):
        """Return the cover page as serialized XHTML (UTF-8 bytes).

        The page holds the novel title, the selected chapter title (when a
        chapter is selected) and the synopsis element ``novel_ex``.
        """
        dom = etree.fromstring(self.TEMPLATE)
        title = xpath('//h:title')(dom)[0]
        title.text = self.title()
        body = dom.find('{http://www.w3.org/1999/xhtml}body')
        h1 = etree.SubElement(body, 'h1')
        h1.text = self.novel_title
        if self.chapter_index is not None:
            h2 = etree.SubElement(body, 'h2')
            h2.text = self.selected_chapter_title()
        # xpath_id() yields None when the index page has no #novel_ex;
        # appending None would raise TypeError, so skip it in that case.
        # Note that append() moves the element out of self.dom.
        if self.novel_ex is not None:
            body.append(self.novel_ex)
        return etree.tostring(
            dom,
            encoding='utf-8',
            xml_declaration=True,
            pretty_print=False,
            doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">',
            with_tail=True)

    def has_chaps(self) -> bool:
        """Return True when the index has more than one chapter group."""
        return len(self.index_box) > 1

    def _chapter_selected(self) -> bool:
        """Return True when a single chapter of a multi-chapter novel is selected."""
        return self.has_chaps() and self.chapter_index is not None

    def title(self) -> str:
        """Return the book title: novel title, optional chapter, optional series.

        The `` - <series>`` suffix is omitted when the novel has no series
        title, so the result never ends in a dangling `` - ``.
        """
        if self._chapter_selected():
            base = "%s %s" % (self.novel_title, self.index_box[self.chapter_index][0])
        else:
            base = self.novel_title
        if not self.series_title:
            return base
        return "%s - %s" % (base, self.series_title)

    def selected_chapter_title(self) -> str:
        """Return the selected chapter's title, or ``''`` when none is selected."""
        if self._chapter_selected():
            return self.index_box[self.chapter_index][0]
        return ''

    def pagelist(self):
        """Return the ``Item`` list of the selected chapter, or of the whole novel."""
        if self._chapter_selected():
            selected = self.index_box[self.chapter_index][1]
            return [self.pagemap[href] for (href, _subtitle, _date) in selected]
        return self.pages

    def epubname(self, dir="."):
        """Return the output ``.epub`` path inside *dir*.

        With a chapter selected the name carries the 1-based chapter number
        and the chapter title; otherwise it is just the novel title.
        """
        if self._chapter_selected():
            filename = "%s %d %s.epub" % (
                self.novel_title,
                self.chapter_index + 1,
                self.index_box[self.chapter_index][0],
            )
        else:
            filename = self.novel_title + u'.epub'
        return os.path.join(dir, filename)
class Item:
    """One page of the novel, loaded from a saved HTML file and cleaned up.

    Attributes set by ``__init__``:
      origpath -- the path exactly as passed in.
      srcpath  -- the HTML file actually read (``<path>/index.html`` when
                  *path* is a directory).
      path     -- file name used inside the EPUB, with an ``.xhtml``
                  extension replacing ``.html`` or no extension.
      id       -- ``path`` with ``/`` replaced by ``_``.
      dom      -- cleaned ``<html>`` root element (see ``readHTML``).
      no       -- first text node of the element with id ``novel_no``,
                  or None.
      title    -- ``<title>`` text up to the first ``' - '``.
      chapter  -- text of ``<p>`` elements with class ``chapter_title``.
      subtitle -- text of elements with class ``novel_subtitle``.
      html     -- ``dom`` serialized as UTF-8 XHTML bytes.
    """

    def __init__(self, path):
        """Load the page at *path* (a file, or a directory holding index.html)."""
        self.origpath = path
        self.srcpath = path
        if os.path.isdir(self.srcpath):
            self.srcpath = os.path.join(self.srcpath, 'index.html')

        directory, filename = os.path.split(self.srcpath)
        basename, ext = os.path.splitext(filename)
        if basename == 'index':
            # ``dir/index.html`` is named after its directory instead.
            directory, basename = os.path.split(directory)
        if ext in ('', '.html'):
            ext = '.xhtml'
        self.path = os.path.join(directory, basename + ext)
        self.id = self.path.replace("/", "_")

        self.dom = self.readHTML()
        self.no = xpath_id('novel_no', append="/text()")(self.dom)
        self.title = xpath('//title')(self.dom)[0].text.split(' - ')[0]
        self.chapter = "".join(xpath_class('chapter_title', name='p', append="/text()")(self.dom))
        self.subtitle = "".join(xpath_class("novel_subtitle", append="/text()")(self.dom))
        self.html = etree.tostring(
            self.dom,
            encoding='utf-8',
            xml_declaration=True,
            pretty_print=False,
            doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">',
            with_tail=True)

    def readHTML(self):
        """Parse ``self.srcpath`` and strip everything not wanted in the EPUB.

        Returns the ``<html>`` root element after removing scripts, metas,
        links, comments, event-handler attributes, ``<rb>`` wrappers, and a
        fixed list of elements selected by class or id.
        """
        # Parse inside a ``with`` so the file handle is closed right away
        # instead of being left to the garbage collector.
        with open(self.srcpath) as f:
            h = etree.parse(f, etree.HTMLParser()).getroot()

        etree.strip_elements(h, 'script', 'meta', 'link', etree.Comment)
        etree.strip_attributes(h, 'onload', 'onclick')
        etree.strip_tags(h, 'rb')

        # Second child of <html>; assumed to be <body> — TODO confirm for
        # pages that lack a <head>.
        body = h[1]

        # Keep the text of links inside .contents1 but drop the <a> tags.
        for e in xpath_class('contents1')(body):
            etree.strip_tags(e, 'a')
        # Only the #container subtree is kept directly under the body.
        for e in body.xpath('./*[not(@id="container")]'):
            body.remove(e)
        for e in body.xpath('.//*[contains(@class,"koukoku")]'):
            e.getparent().remove(e)
        for sel in ['toaster', 'narou_modal', 'novel_bn', 'twitter-share-button']:
            for e in xpath_class(sel)(body):
                e.getparent().remove(e)
        for sel in ['novel_attention', 'novel_footer', 'novel_hyouka', 'impression', 'recommend', 'review', 'pageTop']:
            e = xpath_id(sel)(body)
            if e is not None:
                e.getparent().remove(e)
        return h
def create_book(ntoc):
    """Build an EPUB from *ntoc* and write it under the ``eb`` directory.

    The book contains a cover page (``intro.xhtml``), one XHTML document
    per entry of ``ntoc.pages``, NCX and Nav documents, and ``style.css``
    read from the current working directory. When ``ntoc.require_font`` is
    set, ``SourceHanSerif-Medium.otf`` is embedded and the stylesheet is
    rewritten to use it. The output path is printed after writing.

    ntoc -- an ``NTOC`` instance.
    """
    book = epub.EpubBook()

    # Metadata.
    book.set_identifier(ntoc.identifier)
    book.set_title(ntoc.title())
    book.set_language('ja')
    book.add_author(ntoc.novel_writername)

    intro = epub.EpubHtml(uid='intro', title=book.title, file_name="intro.xhtml", content=ntoc.coverpage())
    intro.add_link(href='style.css', rel='stylesheet', type='text/css')
    book.add_item(intro)

    def _create_chapter(item):
        """Wrap one ``Item`` in an ``EpubHtml`` document linked to the stylesheet."""
        c = epub.EpubHtml(uid=item.id, title=item.subtitle, file_name=item.path, content=item.html)
        # Chapter files live one directory below the book root.
        c.add_link(href='../style.css', rel='stylesheet', type='text/css')
        return c

    # Keyed by the original href so the TOC loop below can look chapters up.
    chaps = OrderedDict((i.origpath, _create_chapter(i)) for i in ntoc.pages)
    for c in chaps.values():
        book.add_item(c)

    # Table of contents: pages of an untitled group are appended to the
    # current list; each titled group opens a new [Section, children] pair.
    root = [intro]
    cur = root
    for (sec, children) in ntoc.index_box:
        # Truthiness rather than len() so a None title cannot raise.
        if sec:
            cur = []
            root.append([epub.Section(sec), cur])
        for (path, _subtitle, _date) in children:
            cur.append(chaps[path])
    book.toc = root

    # Default NCX and Nav documents.
    book.add_item(epub.EpubNcx())
    nav = epub.EpubNav()
    nav.add_link(href='style.css', rel='stylesheet', type='text/css')
    book.add_item(nav)

    # Stylesheet; files are read inside ``with`` so the handles are closed.
    with open("style.css") as f:
        css = epub.EpubItem(uid="style", file_name="style.css", media_type="text/css", content=f.read())
    if ntoc.require_font:
        with open("SourceHanSerif-Medium.otf", "rb") as f:
            font = epub.EpubItem(uid="font", file_name="SourceHanSerif-Medium.otf", media_type="application/x-font-otf", content=f.read())
        book.add_item(font)
        # Declare the embedded font and switch the serif rules over to it.
        content = """\
@font-face {
font-family: han;
src: url("SourceHanSerif-Medium.otf");
}
""" + css.get_content().replace('font-family: serif;', 'font-family: han;')
        css.set_content(content)
    book.add_item(css)

    # Reading order: cover, navigation, then every chapter.
    book.spine = [intro, 'nav', *chaps.values()]

    # Make sure the output directory exists before writing into it.
    os.makedirs('eb', exist_ok=True)
    outpath = ntoc.epubname(dir='eb')
    epub.write_epub(outpath, book, {})
    print(outpath)
# Script entry point: convert the index page named by the last command-line
# argument; do nothing when no argument was given.
if __name__ == '__main__' and len(sys.argv) > 1:
    ntoc = NTOC(sys.argv[-1])
    create_book(ntoc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment