nelhage · December 23, 2013 06:35
diff --git a/generate.py b/generate.py
 from lxml import etree
 import os.path
 import os
 import sys
 import re
 import urllib2
 import urlparse
 import logging

 class Chapter(object):
    """One scraped chapter: its display title and its output file name.

    title -- heading text shown for the chapter and used as its link text.
    slug  -- file name of the generated chapter page, used as the link target.
    """

    def __init__(self, title, slug):
        # Plain value holder; both arguments are stored unchanged.
        self.slug, self.title = slug, title

 class Generator(object):
    """Scrape a web serial chapter by chapter into HTML files under ``out/``.

    Starting at ``ENTRY``, every page is fetched through an on-disk cache
    rooted at ``TOPDIR``, its title and body are written to
    ``out/<slug>.html``, and the "Next Chapter" link is followed until a
    page has none.  ``out/index.html`` is then written as a table of
    contents linking every chapter in crawl order.
    """

    # Directory containing this script; cached downloads are stored below it.
    TOPDIR = os.path.join(os.path.abspath(os.path.dirname(__file__)))
    # First chapter; the crawl starts here.
    ENTRY = 'http://parahumans.wordpress.com/category/stories-arcs-1-10/arc-1-gestation/1-01/'

    def __init__(self):
        # Chapters recorded so far, in crawl order.  Created here so that
        # addPage()/writeIndex() are usable without going through run().
        self.chapters = []

    @staticmethod
    def _makedirs(path):
        """Create *path* with any missing parents; an existing directory is fine."""
        try:
            os.makedirs(path)
        except OSError:
            # Only "directory already exists" is expected here; permission
            # problems and the like must not be silently ignored.
            if not os.path.isdir(path):
                raise

    def download(self, url):
        """Return an open binary file holding the page at *url*.

        The page is read from the cache when present, otherwise it is
        fetched and cached first.  The cache location mirrors the URL:
        ``TOPDIR/<netloc>/<path>``, with ``index.html`` appended when that
        path ends in a slash.

        The caller owns the returned file object and must close it.
        Raises whatever ``urllib2.urlopen`` raises when the fetch fails.
        """
        parsed = urlparse.urlparse(url)
        disk = os.path.join(self.TOPDIR, parsed.netloc, parsed.path[1:])
        if disk.endswith("/"):
            disk = os.path.join(disk, 'index.html')
        if os.path.exists(disk):
            logging.debug("Using cached %s", disk)
        else:
            self._makedirs(os.path.dirname(disk))
            logging.debug("Downloading from %s to cached %s...", url, disk)
            response = urllib2.urlopen(url)
            try:
                data = response.read()
            finally:
                response.close()
            # Binary mode: store the bytes exactly as they were served.
            with open(disk, 'wb') as f:
                f.write(data)
        return open(disk, 'rb')

    def parse(self, data):
        """Parse *data* (a file object or path) as UTF-8 HTML; return the lxml tree."""
        parser = etree.HTMLParser(encoding='utf-8')
        return etree.parse(data, parser)

    def addPage(self, url, x):
        """Write the chapter found in parsed page *x* to ``out/`` and record it.

        url -- address the page came from; its path is turned into the slug.
        x   -- parsed lxml tree of the page.

        Appends a ``Chapter`` to ``self.chapters``.  Raises ``IndexError``
        when the page has no ``entry-title`` heading or ``entry-content`` div.
        """
        title = etree.tostring(x.xpath('//h1[@class="entry-title"]')[0],
                               method='text',
                               encoding='UTF-8').decode('UTF-8').strip()
        slug = re.sub(r'[^a-zA-Z0-9]', '-', urlparse.urlparse(url).path[1:-1]) + ".html"
        content = x.xpath('//div[@class="entry-content"]')[0]
        # Strip nav links.  The leading "." scopes the search to the content
        # div; a bare "//" is evaluated from the document root even when
        # called on an element, which would edit unrelated parts of the page.
        for a in content.xpath(".//a[contains(text(), 'Chapter')]"):
            a.getparent().remove(a)
        # Drop the sharing widget when the page has one.
        for share in content.xpath('.//div[@id="jp-post-flair"]'):
            share.getparent().remove(share)
        # Serialising the heading through lxml escapes "&" and "<" in the
        # title and emits non-ASCII characters as character references,
        # matching how the content itself is serialised below.
        heading = etree.tostring(self.node('h1', title))
        self._makedirs('out')
        with open(os.path.join('out', slug), 'wb') as fh:
            fh.write(b"<html><head></head><body>\n")
            fh.write(heading + b"\n")
            fh.write(etree.tostring(content))
            fh.write(b"\n</body></html>")

        self.chapters.append(Chapter(title, slug))

    def node(self, tag, text=None):
        """Return a new, unattached element named *tag* with optional *text*."""
        e = etree.Element(tag)
        e.text = text
        return e

    def writeIndex(self):
        """Write ``out/index.html``: a table of contents linking every chapter."""
        h = etree.HTML("<div>")
        head = self.node('head')
        head.append(self.node('title', "Worm"))
        # <head> goes first under the root element.
        h.insert(0, head)
        div = h.xpath('//div')[0]
        div.append(self.node('h1', "Table of Contents"))
        # A single <ul> with one <li><a> per chapter.
        ul = self.node('ul')
        div.append(ul)
        for ch in self.chapters:
            a = self.node('a', ch.title)
            a.attrib['href'] = ch.slug
            li = self.node('li')
            li.append(a)
            ul.append(li)
        self._makedirs('out')
        with open('out/index.html', 'wb') as fh:
            fh.write(etree.tostring(h))

    def run(self):
        """Crawl from ``ENTRY`` along "Next Chapter" links, then write the index."""
        self.chapters = []
        page = self.ENTRY
        while True:
            fh = self.download(page)
            try:
                x = self.parse(fh)
            finally:
                # The tree is fully built by now; release the cache file.
                fh.close()
            # Look the link up before addPage(), which removes the
            # navigation anchors from the tree.
            a = x.xpath("//a[contains(text(),'Next Chapter')]")
            self.addPage(page, x)
            if not a:
                break
            page = a[0].attrib['href'].encode('utf-8')
        self.writeIndex()

 # Script body: enable debug logging so every cache hit and download is
 # reported, then crawl the whole serial and write the pages under out/.
 logging.basicConfig(level='DEBUG')
 generator = Generator()
 generator.run()

 # The generated pages can then be turned into an e-book, e.g.:
 # ebook-convert out/index.html parahumans.mobi --max-toc-links=500
	from lxml import etree
	import os.path
	import os
	import sys
	import re
	import urllib2
	import urlparse
	import logging

	class Chapter(object):
	    """One scraped chapter: its display title and its output file name.

	    title -- heading text shown for the chapter and used as its link text.
	    slug  -- file name of the generated chapter page, used as the link target.
	    """

	    def __init__(self, title, slug):
	        self.title = title
	        self.slug = slug

	    def __repr__(self):
	        return "Chapter(%r, %r)" % (self.title, self.slug)

	class Generator(object):
	    """Scrape a web serial chapter by chapter into HTML files under ``out/``.

	    Starting at ``ENTRY``, every page is fetched through an on-disk cache
	    rooted at ``TOPDIR``, its title and body are written to
	    ``out/<slug>.html``, and the "Next Chapter" link is followed until a
	    page has none.  ``out/index.html`` is then written as a table of
	    contents linking every chapter in crawl order.
	    """

	    # Directory containing this script; cached downloads are stored below it.
	    TOPDIR = os.path.join(os.path.abspath(os.path.dirname(__file__)))
	    # First chapter; the crawl starts here.
	    ENTRY = 'http://parahumans.wordpress.com/category/stories-arcs-1-10/arc-1-gestation/1-01/'

	    def __init__(self):
	        # Chapters recorded so far, in crawl order.  Created here so that
	        # addPage()/writeIndex() are usable without going through run().
	        self.chapters = []

	    @staticmethod
	    def _makedirs(path):
	        """Create *path* with any missing parents; an existing directory is fine."""
	        try:
	            os.makedirs(path)
	        except OSError:
	            # Only "directory already exists" is expected here; permission
	            # problems and the like must not be silently ignored.
	            if not os.path.isdir(path):
	                raise

	    def download(self, url):
	        """Return an open binary file holding the page at *url*.

	        The page is read from the cache when present, otherwise it is
	        fetched and cached first.  The cache location mirrors the URL:
	        ``TOPDIR/<netloc>/<path>``, with ``index.html`` appended when that
	        path ends in a slash.

	        The caller owns the returned file object and must close it.
	        Raises whatever ``urllib2.urlopen`` raises when the fetch fails.
	        """
	        parsed = urlparse.urlparse(url)
	        disk = os.path.join(self.TOPDIR, parsed.netloc, parsed.path[1:])
	        if disk.endswith("/"):
	            disk = os.path.join(disk, 'index.html')
	        if os.path.exists(disk):
	            logging.debug("Using cached %s", disk)
	        else:
	            self._makedirs(os.path.dirname(disk))
	            logging.debug("Downloading from %s to cached %s...", url, disk)
	            response = urllib2.urlopen(url)
	            try:
	                data = response.read()
	            finally:
	                response.close()
	            # Binary mode: store the bytes exactly as they were served.
	            with open(disk, 'wb') as f:
	                f.write(data)
	        return open(disk, 'rb')

	    def parse(self, data):
	        """Parse *data* (a file object or path) as UTF-8 HTML; return the lxml tree."""
	        parser = etree.HTMLParser(encoding='utf-8')
	        return etree.parse(data, parser)

	    def addPage(self, url, x):
	        """Write the chapter found in parsed page *x* to ``out/`` and record it.

	        url -- address the page came from; its path is turned into the slug.
	        x   -- parsed lxml tree of the page.

	        Appends a ``Chapter`` to ``self.chapters``.  Raises ``IndexError``
	        when the page has no ``entry-title`` heading or ``entry-content`` div.
	        """
	        title = etree.tostring(x.xpath('//h1[@class="entry-title"]')[0],
	                               method='text',
	                               encoding='UTF-8').decode('UTF-8').strip()
	        slug = re.sub(r'[^a-zA-Z0-9]', '-', urlparse.urlparse(url).path[1:-1]) + ".html"
	        content = x.xpath('//div[@class="entry-content"]')[0]
	        # Strip nav links.  The leading "." scopes the search to the content
	        # div; a bare "//" is evaluated from the document root even when
	        # called on an element, which would edit unrelated parts of the page.
	        for a in content.xpath(".//a[contains(text(), 'Chapter')]"):
	            a.getparent().remove(a)
	        # Drop the sharing widget when the page has one.
	        for share in content.xpath('.//div[@id="jp-post-flair"]'):
	            share.getparent().remove(share)
	        # Serialising the heading through lxml escapes "&" and "<" in the
	        # title and emits non-ASCII characters as character references,
	        # matching how the content itself is serialised below.
	        heading = etree.tostring(self.node('h1', title))
	        self._makedirs('out')
	        with open(os.path.join('out', slug), 'wb') as fh:
	            fh.write(b"<html><head></head><body>\n")
	            fh.write(heading + b"\n")
	            fh.write(etree.tostring(content))
	            fh.write(b"\n</body></html>")

	        self.chapters.append(Chapter(title, slug))

	    def node(self, tag, text=None):
	        """Return a new, unattached element named *tag* with optional *text*."""
	        e = etree.Element(tag)
	        e.text = text
	        return e

	    def writeIndex(self):
	        """Write ``out/index.html``: a table of contents linking every chapter."""
	        h = etree.HTML("<div>")
	        head = self.node('head')
	        head.append(self.node('title', "Worm"))
	        # <head> goes first under the root element.
	        h.insert(0, head)
	        div = h.xpath('//div')[0]
	        div.append(self.node('h1', "Table of Contents"))
	        # A single <ul> with one <li><a> per chapter.
	        ul = self.node('ul')
	        div.append(ul)
	        for ch in self.chapters:
	            a = self.node('a', ch.title)
	            a.attrib['href'] = ch.slug
	            li = self.node('li')
	            li.append(a)
	            ul.append(li)
	        self._makedirs('out')
	        with open('out/index.html', 'wb') as fh:
	            fh.write(etree.tostring(h))

	    def run(self):
	        """Crawl from ``ENTRY`` along "Next Chapter" links, then write the index."""
	        self.chapters = []
	        page = self.ENTRY
	        while True:
	            fh = self.download(page)
	            try:
	                x = self.parse(fh)
	            finally:
	                # The tree is fully built by now; release the cache file.
	                fh.close()
	            # Look the link up before addPage(), which removes the
	            # navigation anchors from the tree.
	            a = x.xpath("//a[contains(text(),'Next Chapter')]")
	            self.addPage(page, x)
	            if not a:
	                break
	            page = a[0].attrib['href'].encode('utf-8')
	        self.writeIndex()

	# Script body: enable debug logging so every cache hit and download is
	# reported, then crawl the whole serial and write the pages under out/.
	logging.basicConfig(level='DEBUG')
	generator = Generator()
	generator.run()

	# The generated pages can then be turned into an e-book, e.g.:
	# ebook-convert out/index.html parahumans.mobi --max-toc-links=500