flying-sheep · June 17, 2019 10:52 · raekuul · Mar 6, 2015
diff --git a/dkami.py b/dkami.py
 #!/usr/bin/env python3

 import re
 from contextlib import closing
 from epub import create_epub, DEFAULT_STYLESHEET

 import requests
 import requests_cache
 from bs4 import BeautifulSoup
 from smartypants import smartypants


 # Book metadata handed to create_epub() at the end of this script.
 author = 'Pusakuronu'
 title = 'Dungeon Keeper Ami'
 publisher = 'Anime Addventure'

 # Address of the thread's first page; later pages are requested as '<url_base>/page-<n>'.
 url_base = 'https://forums.sufficientvelocity.com/threads/dungeon-keeper-ami-sailor-moon-dungeon-keeper-story-only-thread.30066'
 # Disabled URL/regex patterns, not used anywhere in this script.
 # NOTE(review): presumably left over from a scraper for another site — confirm before removing.
 #list_url = url_base + 'authors/{author}.html?tag={title}'.format(author=author, title=title.replace(' ', '+'))
 #story_url_re = re.compile(r'.*/(\d+).html')
 #header_re = re.compile(r'{}: (.+) \[Episode \d+\]'.format(title))

 # Cover image; its bytes are downloaded and passed to create_epub() as titlepic.
 # NOTE(review): the URL ends in .jpg although create_epub() documents titlepic as a png file — confirm this is intended.
 titlepic_url = 'http://fc00.deviantart.net/fs70/f/2011/063/b/e/dungeon_keeper_ami_by_paulobarrios-d3avri8.jpg'

 # Additional CSS, appended to DEFAULT_STYLESHEET when the book is created:
 # draws <hr> as a divider line with a "☿" glyph.
 stylesheet = '''\
 hr {
 margin: 0 20% 55px; padding: 19px 0; line-height: 38px;
 border: none; border-bottom: 1px solid black; text-align: center;
 color: inherit; background-color: inherit;
 }
 hr:before {
 content: "☿"; display: inline-block; float: left; position: relative; left: -8px;
 margin: 0 50%; padding: 0 4px; background-color: inherit; cursor: default;
 }
 '''


 def parse(url):
 	"""Download *url* and return its body as a BeautifulSoup tree built with the html5lib parser."""
 	response_text = requests.get(url).text
 	return BeautifulSoup(response_text, 'html5lib')

 def get_posts(pages):
 	"""Yield a ``(title, body)`` pair for every message that carries a threadmark.

 	pages: iterable of parsed message elements (anything offering ``find(class_=...)``).
 	The title is the stripped third child of the threadmark's ``label`` element,
 	the body is the message's ``messageText`` element.
 	"""
 	for message in pages:
 		marker = message.find(class_='threadmarker')
 		if not marker:
 			continue  # ordinary post without a threadmark
 		# the label must consist of exactly three child nodes; only the last one is the title text
 		_, _, heading = marker.find(class_='label').children
 		yield heading.strip(), message.find(class_='messageText')


 # install the 'dkami' requests cache before any page is fetched
 requests_cache.install_cache('dkami')


 # fetch the first page, then every further page announced by its page navigation
 thread_pages = [parse(url_base)]
 nav = thread_pages[0].find(class_='PageNav')
 if nav is not None:  # no PageNav element found: only the first page is used
 	thread_pages += [parse(f'{url_base}/page-{p}') for p in range(int(nav['data-start']), int(nav['data-last'])+1)]

 # despite its name this holds the individual message elements of all thread pages
 pages = [msg for tp in thread_pages for msg in tp.find_all('li', class_='message')]

 posts = list(get_posts(pages))

 # one chapter per threadmarked post; the running index doubles as chapter id / file name
 chapters = []
 for id_, (header, html) in enumerate(posts):
 	html = smartypants(f'<h1>{header}</h1>\n{html.prettify()}')
 	chapters.append((id_, header, html))

 titlepic = requests.get(titlepic_url).content

 create_epub(title, author, publisher, chapters, titlepic=titlepic, stylesheet=DEFAULT_STYLESHEET + stylesheet)
diff --git a/epub.py b/epub.py
 import sys

 from zipfile import ZipFile, ZIP_DEFLATED

 from bs4 import BeautifulSoup, Tag

 # Location of the title picture, relative to the OEBPS directory of the archive.
 TITLEPIC_PATH = 'images/title.png'
 # Manifest entry for the title picture.
 TITLEPIC_ITEM = '\t\t<item id="imgl" href="{}" media-type="image/png"/>'.format(TITLEPIC_PATH)
 # Id (and file name stem) of the generated title page.
 TITLEPAGE_ID = '0-titlepage'

 # XML declaration prepended to the XML templates below.
 XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>'

 # Default value of create_epub()'s stylesheet parameter: limits images to the page size.
 DEFAULT_STYLESHEET = '''\
 img {
 max-width: 100%;
 max-height: 100%;
 }
 '''

 # Contents of META-INF/container.xml; names OEBPS/content.opf as the root file.
 CONTAINER = XML_HEADER + '''
 <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
 <rootfiles>
  <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
 </rootfiles>
 </container>
 '''

 # Formatter for OEBPS/content.opf (metadata, manifest and spine).
 # Fields: title, author, publisher, uuid, titlepic_item, items, itemrefs.
 content = (XML_HEADER + '''
 <package xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookID" version="2.0">
 <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
  <dc:title>{title}</dc:title>
  <dc:creator opf:role="aut">{author}</dc:creator>
  <dc:language>en-US</dc:language>
  <dc:rights>Public Domain</dc:rights>
  <dc:publisher>{publisher}</dc:publisher>
  <dc:identifier id="BookID" opf:scheme="UUID">{uuid}</dc:identifier>
 </metadata>
 <manifest>
  <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
  <item id="style" href="stylesheet.css" media-type="text/css"/>
 {titlepic_item}
 {items}
 </manifest>
 <spine toc="ncx">
 {itemrefs}
 </spine>
 </package>
 ''').format

 # Formatters for one manifest <item> and one spine <itemref>. Field: id.
 item = '\t\t<item id="{id}" href="{id}.xhtml" media-type="application/xhtml+xml"/>'.format
 itemref = '\t\t<itemref idref="{id}"/>'.format

 # Formatter for OEBPS/toc.ncx. Fields: uuid, title, navpoints.
 toc = (XML_HEADER + '''
 <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
 <head>
 <meta name="dtb:uid" content="{uuid}"/>
 <meta name="dtb:depth" content="1"/>
 <meta name="dtb:totalPageCount" content="0"/>
 <meta name="dtb:maxPageNumber" content="0"/>
 </head>

 <docTitle>
 <text>{title}</text>
 </docTitle>

 <navMap>
 {navpoints}
 </navMap>
 </ncx>
 ''').format

 # Formatter for a single NCX <navPoint>. Fields: id, order, header.
 navpoint = '''\
 <navPoint id="{id}" playOrder="{order}">
  <navLabel>
   <text>{header}</text>
  </navLabel>
  <content src="{id}.xhtml"/>
 </navPoint>'''.format

 # Formatter for the title page document. Fields: title, content.
 # The CSS braces are doubled so that str.format emits them literally.
 titlepage = '''\
 <html>
 <head>
  <title>{title}</title>
  <style type="text/css">
   @page {{ padding: 0; margin: 0 }}
   body {{ text-align: center; padding: 0; margin: 0 }}
  </style>
 </head>
 <body>
  {content}
 </body>
 </html>
 '''.format

 def xhtmlify(page):
 	"""Complete a parsed document so that it is a full XHTML document, in place.

 	page: BeautifulSoup object. A missing <body>, <html> or <head> element is added;
 		a newly created <head> receives a <title> copied from the first <h1>, if there is one.
 	The <html> element gets the XHTML namespace and xml:lang="en", and the
 	document is flagged as XML.
 	"""
 	def wrap_top_level(name):
 		# Tag.wrap() accepts a single argument and cannot wrap the document object itself,
 		# so move all top-level nodes into a newly created element instead.
 		wrapper = page.new_tag(name)
 		for child in list(page.contents):  # copy: append() detaches the child from page
 			wrapper.append(child)
 		page.append(wrapper)
 		return wrapper

 	if not page.find('body'):  # assume sequence of divs/paragraphs/…
 		wrap_top_level('body')

 	html_tag = page.find('html')
 	if not html_tag:  # assume plain body tag
 		html_tag = wrap_top_level('html')

 	if not page.find('head'):
 		head_tag = page.new_tag('head')
 		html_tag.insert(0, head_tag)
 		h1 = page.find('h1')
 		if h1:
 			title_tag = page.new_tag('title')
 			title_tag.string = h1.get_text()
 			head_tag.insert(0, title_tag)

 	html_tag['xmlns'] = 'http://www.w3.org/1999/xhtml'
 	html_tag['xml:lang'] = 'en'
 	page.is_xml = True  # emit xml header

 def create_parts(title_page, chapters):
 	"""yields toc and index entries, as well as chapter tuples with prepended title page

 	title_page: page content that is stored under TITLEPAGE_ID.
 	chapters: iterable of (id, header, page) tuples.
 	Every yielded value is a tuple (manifest item, spine itemref, toc navpoint, chapter tuple).
 	Headers are XML-escaped inside the navpoint; the chapter tuple itself is passed on unchanged.
 	"""
 	from xml.sax.saxutils import escape  # local import: only the toc labels need it

 	yield (
 		item(id=TITLEPAGE_ID),
 		itemref(id=TITLEPAGE_ID),
 		navpoint(id=TITLEPAGE_ID, header='Title page', order=1),
 		(TITLEPAGE_ID, 'Title page', title_page))

 	# the title page has play order 1, so chapters are numbered from 2
 	for order, chapter in enumerate(chapters, 2):
 		id_, header, _ = chapter
 		yield (
 			item(id=id_),
 			itemref(id=id_),
 			# a raw & or < in a chapter title would make toc.ncx ill-formed
 			navpoint(id=id_, header=escape(str(header)), order=order),
 			chapter)

 def create_epub(title, author, publisher, chapters, path=None, *, uuid=None, titlepic=None, stylesheet=DEFAULT_STYLESHEET):
 	"""Creates and saves an epub file.
 	title, author, publisher: metadata; XML-escaped before being written to content.opf and toc.ncx.
 	chapters: sequence of (id, title, page) tuples. The ids are used as filenames.
 		page can be a string or BeautifulSoup. it may be a body tag, a series of content tags, or a whole (X)HTML document.
 	path: path to write to. Defaults to '<author> – <title>.epub'.
 	uuid: book identifier. Defaults to the lower-cased '<author>-<title>' with spaces replaced by underscores.
 	titlepic: path to or bytes of png file. Without it a text-only title page is generated.
 	stylesheet: CSS text written to OEBPS/stylesheet.css and linked from every page.
 	"""
 	from xml.sax.saxutils import escape
 	from zipfile import ZIP_STORED

 	if path is None:
 		path = '{} – {}.epub'.format(author, title)

 	if uuid is None:
 		uuid = '{}-{}'.format(author, title).replace(' ', '_').lower()

 	if titlepic is None:
 		titlepic_item = ''
 		title_page = titlepage(title=title, content='<h1>{}</h1>\n<h2>{}</h2>\n<h3>{}</h3>'.format(title, author, publisher))
 	else:
 		titlepic_item = TITLEPIC_ITEM
 		if isinstance(titlepic, str):
 			# bind the opened file: the path string itself has no read()
 			with open(titlepic, 'rb') as titlepic_file:
 				titlepic = titlepic_file.read()
 		title_page = titlepage(title=title, content='<img src="{}"/>'.format(TITLEPIC_PATH))

 	items, itemrefs, navpoints, chapters = zip(*create_parts(title_page, chapters))

 	# values placed in XML element content or attribute values must not carry raw &, < or "
 	xml_title = escape(str(title))
 	xml_author = escape(str(author))
 	xml_publisher = escape(str(publisher))
 	xml_uuid = escape(str(uuid), {'"': '&quot;'})

 	with ZipFile(path, 'w', ZIP_DEFLATED) as epub:
 		# the mimetype entry has to be the first one and must be stored uncompressed
 		epub.writestr('mimetype', 'application/epub+zip', compress_type=ZIP_STORED)
 		epub.writestr('META-INF/container.xml', CONTAINER)

 		if titlepic is not None:  # same condition that adds the picture to the manifest
 			epub.writestr('OEBPS/' + TITLEPIC_PATH, titlepic)
 		epub.writestr('OEBPS/stylesheet.css', stylesheet)
 		epub.writestr('OEBPS/content.opf', content(title=xml_title, author=xml_author, publisher=xml_publisher, uuid=xml_uuid,
 			titlepic_item=titlepic_item, items='\n'.join(items), itemrefs='\n'.join(itemrefs)))
 		epub.writestr('OEBPS/toc.ncx', toc(title=xml_title, uuid=xml_uuid, navpoints='\n'.join(navpoints)))

 		for id_, _, chapter in chapters:
 			# BeautifulSoup is a subclass of Tag, so whole documents have to be recognised first
 			if not isinstance(chapter, BeautifulSoup):
 				if isinstance(chapter, Tag):
 					t, chapter = chapter, BeautifulSoup('<!doctype html><meta charset=utf-8>', 'html5lib')
 					chapter.append(t)
 				else:
 					chapter = BeautifulSoup(chapter, 'html5lib')

 			xhtmlify(chapter)

 			chapter.find('head').append(chapter.new_tag('link', href='stylesheet.css', type='text/css', rel='stylesheet'))

 			# enforce XHTML
 			epub.writestr('OEBPS/{}.xhtml'.format(id_), chapter.prettify())
	#!/usr/bin/env python3

	import re
	from contextlib import closing
	from epub import create_epub, DEFAULT_STYLESHEET

	import requests
	import requests_cache
	from bs4 import BeautifulSoup
	from smartypants import smartypants


	# Book metadata handed to create_epub() at the end of this script.
	author = 'Pusakuronu'
	title = 'Dungeon Keeper Ami'
	publisher = 'Anime Addventure'

	# Address of the thread's first page; later pages are requested as '<url_base>/page-<n>'.
	url_base = 'https://forums.sufficientvelocity.com/threads/dungeon-keeper-ami-sailor-moon-dungeon-keeper-story-only-thread.30066'
	# Disabled URL/regex patterns, not used anywhere in this script.
	# NOTE(review): presumably left over from a scraper for another site — confirm before removing.
	#list_url = url_base + 'authors/{author}.html?tag={title}'.format(author=author, title=title.replace(' ', '+'))
	#story_url_re = re.compile(r'.*/(\d+).html')
	#header_re = re.compile(r'{}: (.+) \[Episode \d+\]'.format(title))

	# Cover image; its bytes are downloaded and passed to create_epub() as titlepic.
	# NOTE(review): the URL ends in .jpg although create_epub() documents titlepic as a png file — confirm this is intended.
	titlepic_url = 'http://fc00.deviantart.net/fs70/f/2011/063/b/e/dungeon_keeper_ami_by_paulobarrios-d3avri8.jpg'

	# Additional CSS, appended to DEFAULT_STYLESHEET when the book is created:
	# draws <hr> as a divider line with a "☿" glyph.
	stylesheet = '''\
	hr {
	margin: 0 20% 55px; padding: 19px 0; line-height: 38px;
	border: none; border-bottom: 1px solid black; text-align: center;
	color: inherit; background-color: inherit;
	}
	hr:before {
	content: "☿"; display: inline-block; float: left; position: relative; left: -8px;
	margin: 0 50%; padding: 0 4px; background-color: inherit; cursor: default;
	}
	'''


	def parse(url):
		"""Download *url* and return its body as a BeautifulSoup tree built with the html5lib parser."""
		r = requests.get(url)
		return BeautifulSoup(r.text, 'html5lib')

	def get_posts(pages):
		"""Yield a ``(title, body)`` pair for every message that carries a threadmark.

		pages: iterable of parsed message elements (anything offering ``find(class_=...)``).
		The title is the stripped third child of the threadmark's ``label`` element,
		the body is the message's ``messageText`` element.
		"""
		for p in pages:
			marked = p.find(class_='threadmarker')
			if marked:
				# the label must consist of exactly three child nodes; only the last one is the title text
				_, _, title = marked.find(class_='label').children
				yield title.strip(), p.find(class_='messageText')


	# install the 'dkami' requests cache before any page is fetched
	requests_cache.install_cache('dkami')


	# fetch the first page, then every further page announced by its page navigation
	thread_pages = [parse(url_base)]
	nav = thread_pages[0].find(class_='PageNav')
	if nav is not None:  # no PageNav element found: only the first page is used
		thread_pages += [parse(f'{url_base}/page-{p}') for p in range(int(nav['data-start']), int(nav['data-last'])+1)]

	# despite its name this holds the individual message elements of all thread pages
	pages = [msg for tp in thread_pages for msg in tp.find_all('li', class_='message')]

	posts = list(get_posts(pages))

	# one chapter per threadmarked post; the running index doubles as chapter id / file name
	chapters = []
	for id_, (header, html) in enumerate(posts):
		html = smartypants(f'<h1>{header}</h1>\n{html.prettify()}')
		chapters.append((id_, header, html))

	titlepic = requests.get(titlepic_url).content

	create_epub(title, author, publisher, chapters, titlepic=titlepic, stylesheet=DEFAULT_STYLESHEET + stylesheet)
	import sys

	from zipfile import ZipFile, ZIP_DEFLATED

	from bs4 import BeautifulSoup, Tag

	# Location of the title picture, relative to the OEBPS directory of the archive.
	TITLEPIC_PATH = 'images/title.png'
	# Manifest entry for the title picture.
	TITLEPIC_ITEM = '\t\t<item id="imgl" href="{}" media-type="image/png"/>'.format(TITLEPIC_PATH)
	# Id (and file name stem) of the generated title page.
	TITLEPAGE_ID = '0-titlepage'

	# XML declaration prepended to the XML templates below.
	XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>'

	# Default value of create_epub()'s stylesheet parameter: limits images to the page size.
	DEFAULT_STYLESHEET = '''\
	img {
	max-width: 100%;
	max-height: 100%;
	}
	'''

	# Contents of META-INF/container.xml; names OEBPS/content.opf as the root file.
	CONTAINER = XML_HEADER + '''
	<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
	<rootfiles>
	<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
	</rootfiles>
	</container>
	'''

	# Formatter for OEBPS/content.opf (metadata, manifest and spine).
	# Fields: title, author, publisher, uuid, titlepic_item, items, itemrefs.
	content = (XML_HEADER + '''
	<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookID" version="2.0">
	<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
	<dc:title>{title}</dc:title>
	<dc:creator opf:role="aut">{author}</dc:creator>
	<dc:language>en-US</dc:language>
	<dc:rights>Public Domain</dc:rights>
	<dc:publisher>{publisher}</dc:publisher>
	<dc:identifier id="BookID" opf:scheme="UUID">{uuid}</dc:identifier>
	</metadata>
	<manifest>
	<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
	<item id="style" href="stylesheet.css" media-type="text/css"/>
	{titlepic_item}
	{items}
	</manifest>
	<spine toc="ncx">
	{itemrefs}
	</spine>
	</package>
	''').format

	# Formatters for one manifest <item> and one spine <itemref>. Field: id.
	item = '\t\t<item id="{id}" href="{id}.xhtml" media-type="application/xhtml+xml"/>'.format
	itemref = '\t\t<itemref idref="{id}"/>'.format

	# Formatter for OEBPS/toc.ncx. Fields: uuid, title, navpoints.
	toc = (XML_HEADER + '''
	<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
	<head>
	<meta name="dtb:uid" content="{uuid}"/>
	<meta name="dtb:depth" content="1"/>
	<meta name="dtb:totalPageCount" content="0"/>
	<meta name="dtb:maxPageNumber" content="0"/>
	</head>

	<docTitle>
	<text>{title}</text>
	</docTitle>

	<navMap>
	{navpoints}
	</navMap>
	</ncx>
	''').format

	# Formatter for a single NCX <navPoint>. Fields: id, order, header.
	navpoint = '''\
	<navPoint id="{id}" playOrder="{order}">
	<navLabel>
	<text>{header}</text>
	</navLabel>
	<content src="{id}.xhtml"/>
	</navPoint>'''.format

	# Formatter for the title page document. Fields: title, content.
	# The CSS braces are doubled so that str.format emits them literally.
	titlepage = '''\
	<html>
	<head>
	<title>{title}</title>
	<style type="text/css">
	@page {{ padding: 0; margin: 0 }}
	body {{ text-align: center; padding: 0; margin: 0 }}
	</style>
	</head>
	<body>
	{content}
	</body>
	</html>
	'''.format

	def xhtmlify(page):
		"""Complete a parsed document so that it is a full XHTML document, in place.

		page: BeautifulSoup object. A missing <body>, <html> or <head> element is added;
			a newly created <head> receives a <title> copied from the first <h1>, if there is one.
		The <html> element gets the XHTML namespace and xml:lang="en", and the
		document is flagged as XML.
		"""
		def wrap_top_level(name):
			# Tag.wrap() accepts a single argument and cannot wrap the document object itself,
			# so move all top-level nodes into a newly created element instead.
			wrapper = page.new_tag(name)
			for child in list(page.contents):  # copy: append() detaches the child from page
				wrapper.append(child)
			page.append(wrapper)
			return wrapper

		if not page.find('body'):  # assume sequence of divs/paragraphs/…
			wrap_top_level('body')

		html_tag = page.find('html')
		if not html_tag:  # assume plain body tag
			html_tag = wrap_top_level('html')

		if not page.find('head'):
			head_tag = page.new_tag('head')
			html_tag.insert(0, head_tag)
			h1 = page.find('h1')
			if h1:
				title_tag = page.new_tag('title')
				title_tag.string = h1.get_text()
				head_tag.insert(0, title_tag)

		html_tag['xmlns'] = 'http://www.w3.org/1999/xhtml'
		html_tag['xml:lang'] = 'en'
		page.is_xml = True  # emit xml header

	def create_parts(title_page, chapters):
		"""yields toc and index entries, as well as chapter tuples with prepended title page

		title_page: page content that is stored under TITLEPAGE_ID.
		chapters: iterable of (id, header, page) tuples.
		Every yielded value is a tuple (manifest item, spine itemref, toc navpoint, chapter tuple).
		Headers are XML-escaped inside the navpoint; the chapter tuple itself is passed on unchanged.
		"""
		from xml.sax.saxutils import escape  # local import: only the toc labels need it

		yield (
			item(id=TITLEPAGE_ID),
			itemref(id=TITLEPAGE_ID),
			navpoint(id=TITLEPAGE_ID, header='Title page', order=1),
			(TITLEPAGE_ID, 'Title page', title_page))

		# the title page has play order 1, so chapters are numbered from 2
		for order, chapter in enumerate(chapters, 2):
			id_, header, _ = chapter
			yield (
				item(id=id_),
				itemref(id=id_),
				# a raw & or < in a chapter title would make toc.ncx ill-formed
				navpoint(id=id_, header=escape(str(header)), order=order),
				chapter)

	def create_epub(title, author, publisher, chapters, path=None, *, uuid=None, titlepic=None, stylesheet=DEFAULT_STYLESHEET):
		"""Creates and saves an epub file.
		title, author, publisher: metadata; XML-escaped before being written to content.opf and toc.ncx.
		chapters: sequence of (id, title, page) tuples. The ids are used as filenames.
			page can be a string or BeautifulSoup. it may be a body tag, a series of content tags, or a whole (X)HTML document.
		path: path to write to. Defaults to '<author> – <title>.epub'.
		uuid: book identifier. Defaults to the lower-cased '<author>-<title>' with spaces replaced by underscores.
		titlepic: path to or bytes of png file. Without it a text-only title page is generated.
		stylesheet: CSS text written to OEBPS/stylesheet.css and linked from every page.
		"""
		from xml.sax.saxutils import escape
		from zipfile import ZIP_STORED

		if path is None:
			path = '{} – {}.epub'.format(author, title)

		if uuid is None:
			uuid = '{}-{}'.format(author, title).replace(' ', '_').lower()

		if titlepic is None:
			titlepic_item = ''
			title_page = titlepage(title=title, content='<h1>{}</h1>\n<h2>{}</h2>\n<h3>{}</h3>'.format(title, author, publisher))
		else:
			titlepic_item = TITLEPIC_ITEM
			if isinstance(titlepic, str):
				# bind the opened file: the path string itself has no read()
				with open(titlepic, 'rb') as titlepic_file:
					titlepic = titlepic_file.read()
			title_page = titlepage(title=title, content='<img src="{}"/>'.format(TITLEPIC_PATH))

		items, itemrefs, navpoints, chapters = zip(*create_parts(title_page, chapters))

		# values placed in XML element content or attribute values must not carry raw &, < or "
		xml_title = escape(str(title))
		xml_author = escape(str(author))
		xml_publisher = escape(str(publisher))
		xml_uuid = escape(str(uuid), {'"': '&quot;'})

		with ZipFile(path, 'w', ZIP_DEFLATED) as epub:
			# the mimetype entry has to be the first one and must be stored uncompressed
			epub.writestr('mimetype', 'application/epub+zip', compress_type=ZIP_STORED)
			epub.writestr('META-INF/container.xml', CONTAINER)

			if titlepic is not None:  # same condition that adds the picture to the manifest
				epub.writestr('OEBPS/' + TITLEPIC_PATH, titlepic)
			epub.writestr('OEBPS/stylesheet.css', stylesheet)
			epub.writestr('OEBPS/content.opf', content(title=xml_title, author=xml_author, publisher=xml_publisher, uuid=xml_uuid,
				titlepic_item=titlepic_item, items='\n'.join(items), itemrefs='\n'.join(itemrefs)))
			epub.writestr('OEBPS/toc.ncx', toc(title=xml_title, uuid=xml_uuid, navpoints='\n'.join(navpoints)))

			for id_, _, chapter in chapters:
				# BeautifulSoup is a subclass of Tag, so whole documents have to be recognised first
				if not isinstance(chapter, BeautifulSoup):
					if isinstance(chapter, Tag):
						t, chapter = chapter, BeautifulSoup('<!doctype html><meta charset=utf-8>', 'html5lib')
						chapter.append(t)
					else:
						chapter = BeautifulSoup(chapter, 'html5lib')

				xhtmlify(chapter)

				chapter.find('head').append(chapter.new_tag('link', href='stylesheet.css', type='text/css', rel='stylesheet'))

				# enforce XHTML
				epub.writestr('OEBPS/{}.xhtml'.format(id_), chapter.prettify())