jeremyBanks · May 15, 2011 14:01
diff --git a/cover.jpg b/cover.jpg
diff --git a/ffn_audiobook_generator.py b/ffn_audiobook_generator.py
 #!/usr/bin/env python2.7
 import urllib2
 import subprocess

 import lxml.cssselect
 import lxml.etree

 import mutagen

 selector = lxml.cssselect.CSSSelector

 def get_document(url):
    """Retrieves a URL and parses the response as an HTML document.

    Bytes outside the ASCII range are dropped from the response before it is
    parsed. Returns the root element of the parsed document.
    """

    response = urllib2.urlopen(url)
    try:
        data = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()

    parser = lxml.etree.HTMLParser(encoding="UTF-8")
    return lxml.etree.fromstring(data.decode("ascii", "ignore"), parser)

 def get_chapters(story_id):
    """Yields (number, title) for each entry in a story's chapter menu."""

    print("Retrieving chapter index...")

    first_page_url = "http://www.fanfiction.net/s/{}/{}/".format(story_id, 1)
    index_page = get_document(first_page_url)

    # The chapter menu is read from the first chapter's page; only the first
    # matching <select> element is used.
    menus = selector("select[name*=chapter]")(index_page)
    chapter_menu = menus[0]

    # Each text node is expected to look like "<number>. <title>".
    for entry in lxml.etree.ElementTextIterator(chapter_menu):
        number, _, name = entry.partition(". ")
        yield (int(number), name)

 def get_chapter_text(story_id, chapter):
    """Downloads one chapter and returns its text as a single string.

    The text nodes found inside the #storytext element are joined with
    blank lines between them.
    """

    print("Retrieving text of chapter {}/{}".format(story_id, chapter))

    url = "http://www.fanfiction.net/s/{}/{}/".format(story_id, chapter)
    page = get_document(url)

    story_body = selector("#storytext")(page)[0]

    # Fold inline markup into the surrounding text so that emphasised words
    # and links do not become text nodes of their own.
    lxml.etree.strip_tags(story_body, "i", "b", "a")

    pieces = lxml.etree.ElementTextIterator(story_body)
    return "\n\n".join(pieces)

 def get_text_by_chapter(story_id, chapters):
    """Yields (number, title, body) for each chapter in a story.

    chapters is an iterable of (number, title) pairs. Each body starts with
    a "Chapter N: title" heading, followed by the downloaded chapter text.
    """

    heading_and_text = "Chapter {}: {}\n\n{}"

    for number, name in chapters:
        chapter_text = get_chapter_text(story_id, number)
        yield number, name, heading_and_text.format(number, name, chapter_text)

 def say(s, *a):
    """Pipes s into the say command, passing a as its command-line arguments.

    Blocks until the command has exited.
    """

    command = ["say"]
    command.extend(a)

    speaker = subprocess.Popen(command, stdin=subprocess.PIPE)
    speaker.communicate(s)

 import mutagen.m4a

 def dump_story(story_id, story_title="", story_author="", cover=None):
    """Generates a sequence of m4a files for a story.

    One file is written per chapter and then tagged with its track number
    and chapter title, plus the album (story_title), artist (story_author)
    and cover art (cover) when those are given.

    The story title and author are also spoken at the start of chapter 1.
    """

    chapters = list(get_chapters(story_id))
    chapter_count = len(chapters)

    for n, title, text in get_text_by_chapter(story_id, chapters):
        if n == 1:
            # Announce the author and the title before the first chapter.
            if story_author:
                text = "by " + story_author + "\n\n" + text

            if story_title:
                text = story_title + "\n\n" + text

        # Titles come from the website and may contain "/", which would be
        # read as a directory separator; replace it so the file is always
        # created in the current directory.
        filename = "{}-{:03d}-{}.m4a".format(story_title or story_id, n, title)
        filename = filename.replace("/", "-")

        print("Writing " + filename)
        say(text, "-o", filename)
        print("Writing meta info")

        info = mutagen.m4a.M4A(filename)
        info["trkn"] = (n, chapter_count)

        info["\xa9nam"] = "Chapter {}: {}".format(n, title)

        if story_title:
            info["\xa9alb"] = story_title

        if story_author:
            info["\xa9ART"] = story_author

        if cover:
            info["covr"] = cover

        info["\xa9cmt"] = "Generated using Mac OS X 10.6's Speech Synthesis by a script available at https://gist.github.com/973183"

        info.save()

 # what I'm doing
 # Load the cover art inside a with-block so the file is closed once read.
 with open("cover.jpg", "rb") as cover_file:
    cover = mutagen.m4a.M4ACover(cover_file.read(), mutagen.m4a.M4ACover.FORMAT_JPEG)
 dump_story(5782108, "Harry Potter and the Methods of Rationality", "Eliezer Yudkowsky", cover)
	#!/usr/bin/env python2.7
	import urllib2
	import subprocess

	import lxml.cssselect
	import lxml.etree

	import mutagen

	selector = lxml.cssselect.CSSSelector

	def get_document(url):
	    """Retrieves a URL and parses the response as an HTML document.

	    Bytes outside the ASCII range are dropped from the response before it
	    is parsed. Returns the root element of the parsed document.
	    """

	    response = urllib2.urlopen(url)
	    try:
	        data = response.read()
	    finally:
	        # Release the connection even when reading the body fails.
	        response.close()

	    parser = lxml.etree.HTMLParser(encoding="UTF-8")
	    return lxml.etree.fromstring(data.decode("ascii", "ignore"), parser)

	def get_chapters(story_id):
	    """Yields (number, title) for each entry in a story's chapter menu."""

	    print("Retrieving chapter index...")

	    first_page_url = "http://www.fanfiction.net/s/{}/{}/".format(story_id, 1)
	    index_page = get_document(first_page_url)

	    # The chapter menu is read from the first chapter's page; only the
	    # first matching <select> element is used.
	    menus = selector("select[name*=chapter]")(index_page)
	    chapter_menu = menus[0]

	    # Each text node is expected to look like "<number>. <title>".
	    for entry in lxml.etree.ElementTextIterator(chapter_menu):
	        number, _, name = entry.partition(". ")
	        yield (int(number), name)

	def get_chapter_text(story_id, chapter):
	    """Downloads one chapter and returns its text as a single string.

	    The text nodes found inside the #storytext element are joined with
	    blank lines between them.
	    """

	    print("Retrieving text of chapter {}/{}".format(story_id, chapter))

	    url = "http://www.fanfiction.net/s/{}/{}/".format(story_id, chapter)
	    page = get_document(url)

	    story_body = selector("#storytext")(page)[0]

	    # Fold inline markup into the surrounding text so that emphasised
	    # words and links do not become text nodes of their own.
	    lxml.etree.strip_tags(story_body, "i", "b", "a")

	    pieces = lxml.etree.ElementTextIterator(story_body)
	    return "\n\n".join(pieces)

	def get_text_by_chapter(story_id, chapters):
	    """Yields (number, title, body) for each chapter in a story.

	    chapters is an iterable of (number, title) pairs. Each body starts
	    with a "Chapter N: title" heading, followed by the downloaded chapter
	    text.
	    """

	    heading_and_text = "Chapter {}: {}\n\n{}"

	    for number, name in chapters:
	        chapter_text = get_chapter_text(story_id, number)
	        yield number, name, heading_and_text.format(number, name, chapter_text)

	def say(s, *a):
	    """Pipes s into the say command, passing a as its command-line arguments.

	    Blocks until the command has exited.
	    """

	    command = ["say"]
	    command.extend(a)

	    speaker = subprocess.Popen(command, stdin=subprocess.PIPE)
	    speaker.communicate(s)

	import mutagen.m4a

	def dump_story(story_id, story_title="", story_author="", cover=None):
	    """Generates a sequence of m4a files for a story.

	    One file is written per chapter and then tagged with its track number
	    and chapter title, plus the album (story_title), artist (story_author)
	    and cover art (cover) when those are given.

	    The story title and author are also spoken at the start of chapter 1.
	    """

	    chapters = list(get_chapters(story_id))
	    chapter_count = len(chapters)

	    for n, title, text in get_text_by_chapter(story_id, chapters):
	        if n == 1:
	            # Announce the author and the title before the first chapter.
	            if story_author:
	                text = "by " + story_author + "\n\n" + text

	            if story_title:
	                text = story_title + "\n\n" + text

	        # Titles come from the website and may contain "/", which would be
	        # read as a directory separator; replace it so the file is always
	        # created in the current directory.
	        filename = "{}-{:03d}-{}.m4a".format(story_title or story_id, n, title)
	        filename = filename.replace("/", "-")

	        print("Writing " + filename)
	        say(text, "-o", filename)
	        print("Writing meta info")

	        info = mutagen.m4a.M4A(filename)
	        info["trkn"] = (n, chapter_count)

	        info["\xa9nam"] = "Chapter {}: {}".format(n, title)

	        if story_title:
	            info["\xa9alb"] = story_title

	        if story_author:
	            info["\xa9ART"] = story_author

	        if cover:
	            info["covr"] = cover

	        info["\xa9cmt"] = "Generated using Mac OS X 10.6's Speech Synthesis by a script available at https://gist.github.com/973183"

	        info.save()

	# what I'm doing
	# Load the cover art inside a with-block so the file is closed once read.
	with open("cover.jpg", "rb") as cover_file:
	    cover = mutagen.m4a.M4ACover(cover_file.read(), mutagen.m4a.M4ACover.FORMAT_JPEG)
	dump_story(5782108, "Harry Potter and the Methods of Rationality", "Eliezer Yudkowsky", cover)