Created
July 11, 2018 04:59
-
-
Save Yasushi/91ee2d1fa6aca8abbe22a7eea3e8e126 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8; -*- | |
import sys, os, glob, itertools, re, functools, pprint | |
from collections import OrderedDict | |
from lxml import etree | |
from ebooklib import epub | |
def xpath(query):
    """Compile *query* into a reusable ``etree.XPath`` callable.

    Two namespace prefixes are bound for use inside the expression:
    ``h`` for XHTML and ``re`` for the EXSLT regular-expression functions.
    """
    prefixes = {
        'h': 'http://www.w3.org/1999/xhtml',
        're': 'http://exslt.org/regular-expressions',
    }
    return etree.XPath(query, namespaces=prefixes)
def xpath_class(cls, name="*", append=""):
    """Build an XPath callable selecting descendants by CSS class.

    The ``class`` attribute is tested with the regular expression
    ``\\b<cls>\\b``.

    cls    -- class name, inserted verbatim into the regular expression.
    name   -- element name to match (default: any element).
    append -- extra XPath steps added after the predicate, e.g. "/text()".
    """
    expression = ".//%s[re:test(@class, '\\b%s\\b')]%s" % (name, cls, append)
    return xpath(expression)
def xpath_id(id, name="*", append=""):
    """Return a callable that finds the first descendant with the given id.

    The returned callable takes a node, evaluates
    ``.//<name>[@id='<id>']<append>`` against it and returns the first
    result, or None when there is no result.
    """
    def first_match(h):
        # The expression is compiled on every call, not once up front.
        results = xpath(".//%s[@id='%s']%s" % (name, id, append))(h)
        for result in results:
            return result
        return None

    return first_match
class NTOC:
    """Index (table of contents) page of a novel, parsed from a saved HTML file.

    Attributes set by ``__init__``:
      dom              -- parsed index page.
      identifier       -- second-to-last path segment of the handheld
                          ``<link rel="alternate">`` href.
      require_font     -- True only for identifier 'n4955ee'; create_book
                          then embeds a font file.
      chapter_index    -- index into ``index_box`` of the selected chapter,
                          or None when the whole novel is selected.
      series_title, novel_title, novel_writername -- text read from the
                          elements carrying those class names.
      novel_ex         -- element with id ``novel_ex`` or None.
      index_box        -- list of ``(chapter_title, [(href, subtitle, date)])``
                          in document order, without empty groups.
      pages            -- one ``Item`` per listed page, in order.
      pagemap          -- ``Item`` objects keyed by their original href.
    """

    TEMPLATE = u'''\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="ja">
<head>
<title>title</title>
<link rel="stylesheet" type="text/css" href="style.css" />
</head>
<body>
</body>
</html>
'''

    def __init__(self, path):
        """Parse the index page at *path* and load every page it lists.

        Raises IndexError when the page has no handheld alternate link or
        no element with class ``index_box``.
        """
        # Close the file as soon as parsing is done instead of leaking it.
        with open(path) as source:
            self.dom = etree.parse(source, etree.HTMLParser())
        handheld = self.dom.xpath('.//link[@rel="alternate"][@media="handheld"]/@href')
        self.identifier = handheld[0].split('/')[-2]
        self.require_font = self.identifier == 'n4955ee'
        self.chapter_index = None
        self.series_title = "".join(xpath_class('series_title', append='//text()')(self.dom))
        self.novel_title = "".join(xpath_class('novel_title', append='//text()')(self.dom))
        self.novel_writername = "".join(xpath_class('novel_writername', append='/a/text()')(self.dom))
        self.novel_ex = xpath_id('novel_ex')(self.dom)
        self.index_box = self._read_index(xpath_class('index_box')(self.dom)[0])
        self.pages = [
            Item(href)
            for (chapter, pages) in self.index_box
            for (href, subtitle, date) in pages
        ]
        self.pagemap = {page.origpath: page for page in self.pages}

    @staticmethod
    def _read_index(container):
        """Group the children of the index element into chapters.

        A ``div`` child starts a new group titled with its text; a ``dl``
        child adds one ``(href, link text, date)`` entry to the current
        group. Entries seen before the first ``div`` go into a group with
        an empty title. Groups without entries are dropped.
        """
        groups = [("", [])]
        for elem in container:
            if elem.tag == 'div':
                groups.append((elem.text, []))
            elif elem.tag == 'dl':
                a = xpath('./dd/a')(elem)[0]
                # Strip one leading and one trailing slash from the href.
                href = re.sub(r'^/|/$', '', a.attrib['href'])
                date = "".join([s.strip() for s in xpath('./dt/text()')(elem)])
                revdate = "".join([s.strip() for s in xpath('./dt/span/@title')(elem)])
                if revdate:
                    date = "%s (%s)" % (date, revdate)
                groups[-1][1].append((href, a.text, date))
        return [group for group in groups if group[1]]

    def _chapter_selected(self):
        """Return True when the novel has chapters and one is selected."""
        return self.has_chaps() and self.chapter_index is not None

    def make_toc(self, chapter_index=None):
        """Select *chapter_index* and build ``toc``, ``htmltoc`` and ``metadata``.

        NOTE(review): ``TOC``, ``Metadata`` and ``self.generate_htmltoc`` are
        not defined in the visible part of this file, and ``create_book``
        does not call this method -- confirm whether it is still in use.
        """
        self.chapter_index = chapter_index
        self.toc = TOC()
        for (i, (chapter, pages)) in enumerate(self.index_box):
            if self.chapter_index is not None and self.chapter_index != i:
                continue
            first = True
            c = self.toc
            for (href, subtitle, date) in pages:
                if first and chapter:
                    # The chapter node is created once, on its first page.
                    first = False
                    c = self.toc.add(chapter, self.pagemap[href].path)
                page = self.pagemap[href]
                c.add(page.subtitle, page.path, 'novel_no')
        self.htmltoc = self.generate_htmltoc()
        t = TOC(u'目次', self.htmltoc[0])
        t.parent = self.toc
        self.toc.children.insert(0, t)
        self.metadata = Metadata(self.title(), authors=(self.novel_writername,))

    def coverpage(self):
        """Return the cover page as serialized XHTML bytes.

        The page is built from ``TEMPLATE`` and holds the novel title, the
        selected chapter title (when a chapter index is set) and the
        ``novel_ex`` element, when the index page has one.
        """
        dom = etree.fromstring(self.TEMPLATE)
        title = xpath('//h:title')(dom)[0]
        title.text = self.title()
        body = dom.find('{http://www.w3.org/1999/xhtml}body')
        h1 = etree.SubElement(body, 'h1')
        h1.text = self.novel_title
        if self.chapter_index is not None:
            h2 = etree.SubElement(body, 'h2')
            h2.text = self.selected_chapter_title()
        # xpath_id yields None when the element is absent; appending None
        # would raise TypeError, so the cover is emitted without it.
        if self.novel_ex is not None:
            body.append(self.novel_ex)
        return etree.tostring(dom, encoding='utf-8',
                              xml_declaration=True,
                              pretty_print=False,
                              doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">',
                              with_tail=True)

    def has_chaps(self):
        """Return True when the index holds more than one chapter group."""
        return len(self.index_box) > 1

    def title(self):
        """Return the book title.

        The result is the novel title, followed by the selected chapter
        title when one is selected, followed by " - <series title>". The
        series suffix is left out when the series title is empty, so the
        title never ends in a dangling " - ".
        """
        if self._chapter_selected():
            base = "%s %s" % (self.novel_title, self.index_box[self.chapter_index][0])
        else:
            base = self.novel_title
        if not self.series_title:
            return base
        return "%s - %s" % (base, self.series_title)

    def selected_chapter_title(self):
        """Return the selected chapter's title, or '' when none is selected."""
        if self._chapter_selected():
            return self.index_box[self.chapter_index][0]
        return ''

    def pagelist(self):
        """Return the ``Item`` objects of the selected chapter, or all pages."""
        if self._chapter_selected():
            return [self.pagemap[href]
                    for (href, subtitle, date) in self.index_box[self.chapter_index][1]]
        return self.pages

    def epubname(self, dir="."):
        """Return the output file path inside *dir*.

        With a selected chapter the name is
        "<novel title> <1-based chapter number> <chapter title>.epub",
        otherwise "<novel title>.epub".
        """
        if self._chapter_selected():
            filename = "%s %d %s.epub" % (self.novel_title, self.chapter_index + 1,
                                          self.index_box[self.chapter_index][0])
        else:
            filename = self.novel_title + u'.epub'
        return os.path.join(dir, filename)
class Item:
    """One page of the novel, loaded from a saved HTML file and cleaned up.

    Attributes set by ``__init__``:
      origpath -- the path exactly as passed in.
      srcpath  -- file actually read (``index.html`` inside *path* when
                  *path* is a directory).
      path     -- file name used inside the EPUB, see ``_target_path``.
      id       -- ``path`` with "/" replaced by "_".
      dom      -- cleaned root element returned by ``readHTML``.
      no       -- first text node of the element with id ``novel_no``, or None.
      title    -- ``<title>`` text up to the first " - ".
      chapter, subtitle -- text of the ``chapter_title`` / ``novel_subtitle``
                  elements ('' when absent).
      html     -- ``dom`` serialized as UTF-8 XHTML bytes.
    """

    def __init__(self, path):
        """Load and clean the page stored at *path* (a file or a directory)."""
        self.origpath = path
        self.srcpath = path
        if os.path.isdir(self.srcpath):
            self.srcpath = os.path.join(self.srcpath, 'index.html')
        self.path = self._target_path(self.srcpath)
        self.id = self.path.replace("/", "_")
        self.dom = self.readHTML()
        self.no = xpath_id('novel_no', append="/text()")(self.dom)
        self.title = xpath('//title')(self.dom)[0].text.split(' - ')[0]
        self.chapter = "".join(xpath_class('chapter_title', name='p', append="/text()")(self.dom))
        self.subtitle = "".join(xpath_class("novel_subtitle", append="/text()")(self.dom))
        self.html = etree.tostring(self.dom, encoding='utf-8',
                                   xml_declaration=True,
                                   pretty_print=False,
                                   doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">',
                                   with_tail=True)

    @staticmethod
    def _target_path(srcpath):
        """Derive the in-book file name from the source file path.

        A file named ``index.*`` takes the name of its directory, and an
        empty or ``.html`` extension becomes ``.xhtml``.
        """
        (directory, filename) = os.path.split(srcpath)
        (basename, ext) = os.path.splitext(filename)
        if basename == 'index':
            (directory, basename) = os.path.split(directory)
        if ext in ('', '.html'):
            ext = '.xhtml'
        return os.path.join(directory, basename + ext)

    def readHTML(self):
        """Parse ``self.srcpath`` and return its root element, cleaned up.

        Scripts, meta/link elements, comments, ``onload``/``onclick``
        attributes and ``rb`` tags are stripped, and a fixed list of
        elements (selected by class or id) is removed from the body.
        """
        # Close the file as soon as parsing is done instead of leaking it.
        with open(self.srcpath) as source:
            h = etree.parse(source, etree.HTMLParser()).getroot()
        etree.strip_elements(h, 'script', 'meta', 'link', etree.Comment)
        etree.strip_attributes(h, 'onload', 'onclick')
        etree.strip_tags(h, 'rb')
        # The code assumes the second child of the root is <body>.
        body = h[1]
        for e in xpath_class('contents1')(body):
            # Keep the link text but drop the <a> wrappers.
            etree.strip_tags(e, 'a')
        for e in body.xpath('./*[not(@id="container")]'):
            body.remove(e)
        for e in body.xpath('.//*[contains(@class,"koukoku")]'):
            e.getparent().remove(e)
        for sel in ['toaster', 'narou_modal', 'novel_bn', 'twitter-share-button']:
            for e in xpath_class(sel)(body):
                e.getparent().remove(e)
        for sel in ['novel_attention', 'novel_footer', 'novel_hyouka', 'impression', 'recommend', 'review', 'pageTop']:
            e = xpath_id(sel)(body)
            if e is not None:
                e.getparent().remove(e)
        return h
def create_book(ntoc):
    """Build an EPUB from *ntoc* and write it into the ``eb`` directory.

    ntoc -- an ``NTOC`` instance; its identifier, title, author, cover page,
            pages and ``index_box`` are used.

    ``style.css`` is read from the current working directory, and so is
    ``SourceHanSerif-Medium.otf`` when ``ntoc.require_font`` is set. The
    path of the written file is printed to standard output.
    """
    book = epub.EpubBook()

    # Metadata.
    book.set_identifier(ntoc.identifier)
    book.set_title(ntoc.title())
    book.set_language('ja')
    book.add_author(ntoc.novel_writername)

    intro = epub.EpubHtml(uid='intro', title=book.title, file_name="intro.xhtml", content=ntoc.coverpage())
    intro.add_link(href='style.css', rel='stylesheet', type='text/css')
    book.add_item(intro)

    def _create_chapter(item):
        """Wrap one page Item in an EpubHtml that links the shared stylesheet."""
        c = epub.EpubHtml(uid=item.id, title=item.subtitle, file_name=item.path, content=item.html)
        c.add_link(href='../style.css', rel='stylesheet', type='text/css')
        return c

    chaps = OrderedDict((i.origpath, _create_chapter(i)) for i in ntoc.pages)
    for c in chaps.values():
        book.add_item(c)

    # Table of contents: the cover first, then each titled chapter group as
    # a [Section, [pages...]] pair. Pages of an untitled group are appended
    # to whichever list is current (the top level at the start).
    root = [intro]
    cur = root
    for (sec, children) in ntoc.index_box:
        if sec:
            cur = []
            root.append([epub.Section(sec), cur])
        for (path, subtitle, date) in children:
            cur.append(chaps[path])
    book.toc = root

    # Default NCX and Nav files.
    book.add_item(epub.EpubNcx())
    nav = epub.EpubNav()
    nav.add_link(href='style.css', rel='stylesheet', type='text/css')
    book.add_item(nav)

    # Stylesheet, optionally with an embedded font.
    with open("style.css") as css_file:
        stylesheet = css_file.read()
    if ntoc.require_font:
        with open("SourceHanSerif-Medium.otf", "rb") as font_file:
            font_data = font_file.read()
        font = epub.EpubItem(uid="font", file_name="SourceHanSerif-Medium.otf", media_type="application/x-font-otf", content=font_data)
        book.add_item(font)
        # Declare the embedded font and switch serif text over to it.
        stylesheet = """\
@font-face {
font-family: han;
src: url("SourceHanSerif-Medium.otf");
}
""" + stylesheet.replace('font-family: serif;', 'font-family: han;')
    css = epub.EpubItem(uid="style", file_name="style.css", media_type="text/css", content=stylesheet)
    book.add_item(css)

    # Reading order: cover, navigation page, then every page in index order.
    book.spine = [intro, 'nav', *chaps.values()]

    # Make sure the output directory exists before writing into it.
    out_dir = 'eb'
    os.makedirs(out_dir, exist_ok=True)
    target = ntoc.epubname(dir=out_dir)
    epub.write_epub(target, book, {})
    print(target)
if __name__ == '__main__':
    # Convert the index page named by the last command-line argument.
    # Without any argument the script does nothing.
    if sys.argv[1:]:
        ntoc = NTOC(sys.argv[-1])
        create_book(ntoc)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment