Last active
June 17, 2019 10:52
-
-
Save flying-sheep/7958790 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import re | |
from contextlib import closing | |
from epub import create_epub, DEFAULT_STYLESHEET | |
import requests | |
import requests_cache | |
from bs4 import BeautifulSoup | |
from smartypants import smartypants | |
# --- Book metadata passed to create_epub -------------------------------
author = 'Pusakuronu'
title = 'Dungeon Keeper Ami'
publisher = 'Anime Addventure'

# Base URL of the forum thread; further pages are addressed as
# '<url_base>/page-<n>'.
url_base = (
    'https://forums.sufficientvelocity.com/threads/'
    'dungeon-keeper-ami-sailor-moon-dungeon-keeper-story-only-thread.30066'
)

# Legacy settings kept for reference (currently unused):
#list_url = url_base + 'authors/{author}.html?tag={title}'.format(author=author, title=title.replace(' ', '+'))
#story_url_re = re.compile(r'.*/(\d+).html')
#header_re = re.compile(r'{}: (.+) \[Episode \d+\]'.format(title))

# Image downloaded and handed to create_epub as the title picture.
titlepic_url = (
    'http://fc00.deviantart.net/fs70/f/2011/063/b/e/'
    'dungeon_keeper_ami_by_paulobarrios-d3avri8.jpg'
)

# Extra CSS appended to the epub module's DEFAULT_STYLESHEET: renders <hr>
# as a thin rule with a centred "☿" ornament.
stylesheet = '''\
hr {
margin: 0 20% 55px; padding: 19px 0; line-height: 38px;
border: none; border-bottom: 1px solid black; text-align: center;
color: inherit; background-color: inherit;
}
hr:before {
content: "☿"; display: inline-block; float: left; position: relative; left: -8px;
margin: 0 50%; padding: 0 4px; background-color: inherit; cursor: default;
}
'''
def parse(url):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html5lib`` parser.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url)
    # Fail loudly on 4xx/5xx answers; otherwise an error page would be
    # parsed like a thread page and its posts would silently go missing.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html5lib')
def get_posts(pages):
    """Yield ``(title, body)`` for every message that carries a threadmark.

    Args:
        pages: Iterable of message elements offering ``find(class_=...)``.

    Yields:
        A tuple of the whitespace-stripped threadmark title and whatever
        ``find(class_='messageText')`` returns for that message. Messages
        without a ``threadmarker`` element are skipped.

    Raises:
        ValueError: If a threadmark label does not have exactly three
            children (the last one is taken as the title).
    """
    for message in pages:
        marker = message.find(class_='threadmarker')
        if not marker:
            continue
        label = marker.find(class_='label')
        # Only the third child of the label is used; the first two are
        # discarded.
        _first, _second, label_text = label.children
        yield label_text.strip(), message.find(class_='messageText')
# Install a requests_cache cache named 'dkami' so repeated runs can reuse
# earlier downloads.
requests_cache.install_cache('dkami')

# Fetch the first page, then every further page announced by its page
# navigation element.
thread_pages = [parse(url_base)]
nav = thread_pages[0].find(class_='PageNav')
if nav is not None:  # no navigation element found: only the first page exists
    # Page 1 is already in thread_pages; starting below 2 would fetch it
    # again and duplicate its posts.
    # NOTE(review): assumes '/page-1' is the same document as url_base - confirm.
    first_extra_page = max(2, int(nav['data-start']))
    last_page = int(nav['data-last'])
    thread_pages += [
        parse(f'{url_base}/page-{p}')
        for p in range(first_extra_page, last_page + 1)
    ]

# Flatten all pages into one list of message elements, then keep only the
# threadmarked ones as (title, body) pairs.
pages = [msg for tp in thread_pages for msg in tp.find_all('li', class_='message')]
posts = list(get_posts(pages))

# One chapter per post: (id, header, html) with the header prepended as <h1>
# and typographic punctuation applied by smartypants.
chapters = [
    (id_, header, smartypants(f'<h1>{header}</h1>\n{body.prettify()}'))
    for id_, (header, body) in enumerate(posts)
]

cover_response = requests.get(titlepic_url)
# Do not embed an HTTP error page as the cover image.
cover_response.raise_for_status()
titlepic = cover_response.content
# NOTE(review): titlepic_url points at a .jpg, while create_epub stores the
# picture under a .png name with media-type image/png - confirm readers cope.
create_epub(title, author, publisher, chapters, titlepic=titlepic, stylesheet=DEFAULT_STYLESHEET + stylesheet)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from zipfile import ZipFile, ZIP_DEFLATED | |
from bs4 import BeautifulSoup, Tag | |
# Where the cover image lives (relative to OEBPS/) and its manifest entry.
TITLEPIC_PATH = 'images/title.png'
TITLEPIC_ITEM = '\t\t<item id="imgl" href="' + TITLEPIC_PATH + '" media-type="image/png"/>'
# Manifest id (and file stem) of the generated title page.
TITLEPAGE_ID = '0-titlepage'
XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>'

# Stylesheet used when the caller does not supply one.
DEFAULT_STYLESHEET = '''\
img {
max-width: 100%;
max-height: 100%;
}
'''

# META-INF/container.xml: points at the package document.
CONTAINER = XML_HEADER + '''
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
'''

# OEBPS/content.opf template. ``content(...)`` expects the keywords title,
# author, publisher, uuid, titlepic_item, items and itemrefs.
_PACKAGE_TEMPLATE = XML_HEADER + '''
<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookID" version="2.0">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
<dc:title>{title}</dc:title>
<dc:creator opf:role="aut">{author}</dc:creator>
<dc:language>en-US</dc:language>
<dc:rights>Public Domain</dc:rights>
<dc:publisher>{publisher}</dc:publisher>
<dc:identifier id="BookID" opf:scheme="UUID">{uuid}</dc:identifier>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
<item id="style" href="stylesheet.css" media-type="text/css"/>
{titlepic_item}
{items}
</manifest>
<spine toc="ncx">
{itemrefs}
</spine>
</package>
'''
content = _PACKAGE_TEMPLATE.format

# Per-chapter manifest and spine entries; both take the keyword ``id``.
_ITEM_TEMPLATE = '\t\t<item id="{id}" href="{id}.xhtml" media-type="application/xhtml+xml"/>'
item = _ITEM_TEMPLATE.format
_ITEMREF_TEMPLATE = '\t\t<itemref idref="{id}"/>'
itemref = _ITEMREF_TEMPLATE.format

# OEBPS/toc.ncx template. ``toc(...)`` expects uuid, title and navpoints.
_NCX_TEMPLATE = XML_HEADER + '''
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="{uuid}"/>
<meta name="dtb:depth" content="1"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>{title}</text>
</docTitle>
<navMap>
{navpoints}
</navMap>
</ncx>
'''
toc = _NCX_TEMPLATE.format

# One table-of-contents entry; takes id, order and header.
_NAVPOINT_TEMPLATE = '''\
<navPoint id="{id}" playOrder="{order}">
<navLabel>
<text>{header}</text>
</navLabel>
<content src="{id}.xhtml"/>
</navPoint>'''
navpoint = _NAVPOINT_TEMPLATE.format

# Skeleton of the title page; takes title and content. The doubled braces
# are literal CSS braces escaped for str.format.
_TITLEPAGE_TEMPLATE = '''\
<html>
<head>
<title>{title}</title>
<style type="text/css">
@page {{ padding: 0; margin: 0 }}
body {{ text-align: center; padding: 0; margin: 0 }}
</style>
</head>
<body>
{content}
</body>
</html>
'''
titlepage = _TITLEPAGE_TEMPLATE.format
def xhtmlify(page):
    """Complete a parsed document in place so it serialises as XHTML.

    Missing ``<body>``, ``<html>`` and ``<head>`` elements are created, a
    ``<title>`` is derived from the first ``<h1>`` when the head has none,
    and the XHTML namespace and language attributes are set on ``<html>``.

    Args:
        page: A ``BeautifulSoup`` document object; it is modified in place.

    Returns:
        None.
    """
    from bs4 import Doctype  # only needed to keep the doctype at top level

    def move_children(source, target, skip_head=False):
        """Move the children of *source* into *target*, keeping their order."""
        for node in list(source.contents):
            if isinstance(node, Doctype):
                continue  # a doctype must stay outside the element tree
            if skip_head and getattr(node, 'name', None) == 'head':
                continue
            target.append(node.extract())

    html_tag = page.find('html')
    if page.find('body') is None:  # assume sequence of divs/paragraphs/…
        # The document root has no parent, so it cannot be wrapped with
        # ``wrap()``; build the body and move the existing content into it.
        container = page if html_tag is None else html_tag
        body_tag = page.new_tag('body')
        move_children(container, body_tag, skip_head=True)
        container.append(body_tag)
    if html_tag is None:  # assume plain body tag
        html_tag = page.new_tag('html')
        move_children(page, html_tag)
        page.append(html_tag)

    head_tag = page.find('head')
    if head_tag is None:
        head_tag = page.new_tag('head')
        html_tag.insert(0, head_tag)
    # Add the title whenever it is missing, not only when the head itself
    # had to be created.
    if head_tag.find('title') is None:
        h1 = page.find('h1')
        if h1:
            title_tag = page.new_tag('title')
            title_tag.string = h1.get_text()
            head_tag.insert(0, title_tag)

    html_tag['xmlns'] = 'http://www.w3.org/1999/xhtml'
    html_tag['xml:lang'] = 'en'
    page.is_xml = True  # emit xml header
def create_parts(title_page, chapters):
    """Yield manifest, spine and TOC entries plus the chapter tuple itself.

    The first yielded part describes the title page; the given chapters
    follow with play orders starting at 2.

    Args:
        title_page: Markup of the title page.
        chapters: Iterable of ``(id, header, page)`` tuples. ``header`` is
            plain text; it is XML-escaped before it goes into the TOC.

    Yields:
        ``(item, itemref, navpoint, chapter)`` tuples, where the first three
        are rendered XML snippets and ``chapter`` is the (unchanged) chapter
        tuple.
    """
    from xml.sax.saxutils import escape

    yield (
        item(id=TITLEPAGE_ID),
        itemref(id=TITLEPAGE_ID),
        navpoint(id=TITLEPAGE_ID, header='Title page', order=1),
        (TITLEPAGE_ID, 'Title page', title_page))
    for order, chapter in enumerate(chapters, 2):
        id_, header, _ = chapter
        yield (
            item(id=id_),
            itemref(id=id_),
            # A raw '&' or '<' in a chapter title would make toc.ncx
            # ill-formed XML.
            navpoint(id=id_, header=escape(header), order=order),
            chapter)
def create_epub(title, author, publisher, chapters, path=None, *, uuid=None, titlepic=None, stylesheet=DEFAULT_STYLESHEET):
    """Create and save an epub file.

    Args:
        title: Book title (plain text).
        author: Author name (plain text).
        publisher: Publisher name (plain text).
        chapters: Sequence of ``(id, title, page)`` tuples. The ids are used
            as filenames. ``page`` can be a string or BeautifulSoup; it may
            be a body tag, a series of content tags, or a whole (X)HTML
            document.
        path: Path to write to. Defaults to ``'<author> – <title>.epub'``.
        uuid: Book identifier. Defaults to a slug built from author and title.
        titlepic: Path to, or bytes of, a png file. Without it a textual
            title page is generated.
        stylesheet: CSS written to ``OEBPS/stylesheet.css``.

    Raises:
        OSError: If ``titlepic`` is a path that cannot be read or ``path``
            cannot be written.
    """
    from xml.sax.saxutils import escape
    from zipfile import ZIP_STORED

    if path is None:
        path = '{} – {}.epub'.format(author, title)
    if uuid is None:
        uuid = '{}-{}'.format(author, title).replace(' ', '_').lower()

    # Escaped copies for the XML/HTML templates; the raw values are only
    # used for the file name above.
    xml_title = escape(title)
    xml_author = escape(author)
    xml_publisher = escape(publisher)
    # The uuid also lands in an attribute value, so quotes are escaped too.
    xml_uuid = escape(uuid, {'"': '&quot;'})

    if titlepic is None:
        titlepic_item = ''
        title_page = titlepage(
            title=xml_title,
            content='<h1>{}</h1>\n<h2>{}</h2>\n<h3>{}</h3>'.format(xml_title, xml_author, xml_publisher))
    else:
        titlepic_item = TITLEPIC_ITEM
        if isinstance(titlepic, str):
            # A string is a path: read the image bytes from that file.
            with open(titlepic, 'rb') as image_file:
                titlepic = image_file.read()
        title_page = titlepage(title=xml_title, content='<img src="{}"/>'.format(TITLEPIC_PATH))

    items, itemrefs, navpoints, chapters = zip(*create_parts(title_page, chapters))

    with ZipFile(path, 'w', ZIP_DEFLATED) as epub:
        # The OCF container format requires 'mimetype' to be the first
        # entry and to be stored uncompressed.
        epub.writestr('mimetype', 'application/epub+zip', compress_type=ZIP_STORED)
        epub.writestr('META-INF/container.xml', CONTAINER)
        if titlepic:
            epub.writestr('OEBPS/' + TITLEPIC_PATH, titlepic)
        epub.writestr('OEBPS/stylesheet.css', stylesheet)
        epub.writestr('OEBPS/content.opf', content(
            title=xml_title, author=xml_author, publisher=xml_publisher, uuid=xml_uuid,
            titlepic_item=titlepic_item, items='\n'.join(items), itemrefs='\n'.join(itemrefs)))
        epub.writestr('OEBPS/toc.ncx', toc(title=xml_title, uuid=xml_uuid, navpoints='\n'.join(navpoints)))

        for id_, _, chapter in chapters:
            # BeautifulSoup subclasses Tag, so test for a whole document
            # first; otherwise a document would be treated as a fragment.
            if isinstance(chapter, BeautifulSoup):
                pass
            elif isinstance(chapter, Tag):
                fragment = chapter
                chapter = BeautifulSoup('<!doctype html><meta charset=utf-8>', 'html5lib')
                # Put the fragment where it belongs inside the html5lib
                # skeleton rather than after the <html> element.
                if fragment.name == 'html':
                    chapter.html.replace_with(fragment)
                elif fragment.name == 'body':
                    chapter.body.replace_with(fragment)
                else:
                    chapter.body.append(fragment)
            else:
                chapter = BeautifulSoup(chapter, 'html5lib')
            xhtmlify(chapter)
            chapter.find('head').append(chapter.new_tag('link', href='stylesheet.css', type='text/css', rel='stylesheet'))
            # enforce XHTML
            epub.writestr('OEBPS/{}.xhtml'.format(id_), chapter.prettify())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The script is leaving out chunks of chapters and isn't grabbing any chapters after "Frenzied Fortification".