# Source: GitHub Gist nelhage/8092586 by @nelhage, created December 23, 2013 06:35.
from lxml import etree
import os.path
import os
import sys
import re
import urllib2
import urlparse
import logging
class Chapter(object):
    """One generated chapter page, as listed in the table of contents.

    Attributes:
        title: Chapter heading text.
        slug: File name of the generated page; used as the link target.
    """

    def __init__(self, title, slug):
        self.title = title
        self.slug = slug

    def __repr__(self):
        return "Chapter(title=%r, slug=%r)" % (self.title, self.slug)
class Generator(object):
    """Crawl a web serial chapter by chapter and write local HTML pages.

    Starting at ``ENTRY``, every page is fetched through an on-disk cache
    rooted at ``TOPDIR``, its title and body are written to a file under
    ``OUTDIR``, and the "Next Chapter" link is followed until a page has
    none. A table of contents is then written to ``OUTDIR/index.html``.
    """

    # Root of the download cache: the directory containing this script.
    TOPDIR = os.path.abspath(os.path.dirname(__file__))
    # First page of the crawl.
    ENTRY = 'http://parahumans.wordpress.com/category/stories-arcs-1-10/arc-1-gestation/1-01/'
    # Directory (relative to the working directory) for generated pages.
    OUTDIR = 'out'

    def __init__(self):
        # Chapters written so far, in crawl order. Initialised here so that
        # addPage()/writeIndex() are usable without calling run() first.
        self.chapters = []

    @staticmethod
    def _ensure_dir(path):
        """Create directory *path* (with parents) unless it already exists.

        Raises:
            OSError: if the directory is missing and cannot be created.
        """
        try:
            os.makedirs(path)
        except OSError:
            # "Already exists" is the only benign failure; anything else
            # (permissions, a regular file in the way, ...) must surface.
            if not os.path.isdir(path):
                raise

    @staticmethod
    def _slug(url):
        """Return the output file name derived from the path of *url*.

        Leading/trailing slashes are dropped and every non-alphanumeric
        character becomes ``-``; ``.html`` is appended.
        """
        # strip('/') rather than [1:-1]: a URL without a trailing slash
        # must not lose the last character of its path.
        path = urlparse.urlparse(url).path.strip('/')
        return re.sub(r'[^a-zA-Z0-9]', '-', path) + ".html"

    def download(self, url):
        """Return an open binary file with the contents of *url*.

        The page is stored under ``TOPDIR/<host>/<path>`` (``index.html`` is
        appended when the path ends in a slash) and fetched from the network
        only when that file does not exist yet.

        The caller owns the returned file object and must close it.
        """
        parsed = urlparse.urlparse(url)
        disk = os.path.join(self.TOPDIR, parsed.netloc, parsed.path[1:])
        if disk.endswith("/"):
            disk = os.path.join(disk, 'index.html')
        if os.path.exists(disk):
            logging.debug("Using cached %s", disk)
        else:
            self._ensure_dir(os.path.dirname(disk))
            logging.debug("Downloading from %s to cached %s...", url, disk)
            response = urllib2.urlopen(url)
            try:
                data = response.read()
            finally:
                response.close()
            # The payload is raw bytes, so write (and later read) in binary
            # mode to avoid newline translation.
            with open(disk, 'wb') as f:
                f.write(data)
        return open(disk, 'rb')

    def parse(self, data):
        """Parse *data* as UTF-8 encoded HTML and return the lxml tree.

        *data* is handed straight to ``etree.parse`` (run() passes the file
        object returned by download()).
        """
        parser = etree.HTMLParser(encoding='utf-8')
        return etree.parse(data, parser)

    def addPage(self, url, x):
        """Write the chapter in parsed page *x* to ``OUTDIR`` and record it.

        Args:
            url: Address the page was fetched from; determines the file name.
            x: Parsed document, as returned by parse().

        Raises:
            IndexError: if the page has no ``h1.entry-title`` or no
                ``div.entry-content`` element.

        Note: this removes elements from *x* in place.
        """
        heading = x.xpath('//h1[@class="entry-title"]')[0]
        title = etree.tostring(heading,
                               method='text',
                               encoding='UTF-8').decode('UTF-8').strip()
        slug = self._slug(url)
        content = x.xpath('//div[@class="entry-content"]')[0]

        # Strip nav links and the sharing widget. The leading "." keeps the
        # search inside `content`; a bare "//" would scan the whole document.
        # A page without the widget is simply left as is.
        unwanted = (content.xpath(".//a[contains(text(), 'Chapter')]") +
                    content.xpath('.//div[@id="jp-post-flair"]'))
        for element in unwanted:
            # NOTE: lxml's remove() also drops the element's tail text.
            element.getparent().remove(element)

        self._ensure_dir(self.OUTDIR)
        with open(os.path.join(self.OUTDIR, slug), 'wb') as fh:
            fh.write(b"<html><head></head><body>\n")
            # Serialising a real element escapes markup characters in the
            # title instead of pasting them into the page verbatim.
            fh.write(etree.tostring(self.node('h1', title)) + b"\n")
            fh.write(etree.tostring(content))
            fh.write(b"\n</body></html>")
        self.chapters.append(Chapter(title, slug))

    def node(self, tag, text=None):
        """Return a new element named *tag* whose text is *text*."""
        e = etree.Element(tag)
        e.text = text
        return e

    def writeIndex(self):
        """Write ``OUTDIR/index.html`` linking to every recorded chapter."""
        h = etree.HTML("<div>")
        head = self.node('head')
        head.append(self.node('title', "Worm"))
        h.insert(0, head)
        div = h.xpath('//div')[0]
        div.append(self.node('h1', "Table of Contents"))
        # One list, one item per chapter: <ul><li><a href=...>title</a></li>
        ul = self.node('ul')
        div.append(ul)
        for ch in self.chapters:
            a = self.node('a', ch.title)
            a.attrib['href'] = ch.slug
            li = self.node('li')
            li.append(a)
            ul.append(li)
        self._ensure_dir(self.OUTDIR)
        with open(os.path.join(self.OUTDIR, 'index.html'), 'wb') as fh:
            fh.write(etree.tostring(h))

    def run(self):
        """Crawl from ``ENTRY`` along "Next Chapter" links, then write the index."""
        self.chapters = []
        visited = set()
        page = self.ENTRY
        # Stop on a page seen before so a link cycle cannot loop forever.
        while page not in visited:
            visited.add(page)
            with self.download(page) as fh:
                x = self.parse(fh)
            # Look up the next link BEFORE addPage(), which strips chapter
            # navigation links out of the tree.
            a = x.xpath("//a[contains(text(),'Next Chapter')]")
            self.addPage(page, x)
            if not a:
                break
            href = a[0].get('href')
            if not href:
                break
            # urljoin leaves absolute links untouched and resolves relative
            # ones against the current page.
            page = urlparse.urljoin(page, href.encode('utf-8'))
        self.writeIndex()
# Show DEBUG-level messages so cache hits and downloads are reported.
logging.basicConfig(level='DEBUG')

# Crawl the serial and write the chapter pages and index.
generator = Generator()
generator.run()

# To build the e-book afterwards:
#   ebook-convert out/index.html parahumans.mobi --max-toc-links=500
# End of gist.