Created
May 15, 2011 14:01
-
-
Save jeremyBanks/973183 to your computer and use it in GitHub Desktop.
FanFiction.net Audiobook Generator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7 | |
import urllib2 | |
import subprocess | |
import lxml.cssselect | |
import lxml.etree | |
import mutagen | |
selector = lxml.cssselect.CSSSelector | |
def get_document(url):
    """Retrieve a URL and parse the response body as an HTML document.

    Args:
        url: Address passed to ``urllib2.urlopen``.

    Returns:
        The root element returned by ``lxml.etree.fromstring`` when parsing
        the body with an ``lxml.etree.HTMLParser``.
    """
    response = urllib2.urlopen(url)
    try:
        data = response.read()
    finally:
        # Release the connection even when reading fails part-way through.
        response.close()
    # Bytes outside ASCII are dropped by the "ignore" error handler, so the
    # text handed to the parser is ASCII-only.
    markup = data.decode("ascii", "ignore")
    return lxml.etree.fromstring(markup, lxml.etree.HTMLParser(encoding="UTF-8"))
def get_chapters(story_id):
    """Yield ``(number, title)`` for each chapter listed on a story's page.

    The chapter list is read from the first ``select`` element whose ``name``
    attribute contains "chapter" on the story's first chapter page. Each
    label is split at the first ". " into a number and a title.

    Args:
        story_id: Story identifier inserted into the fanfiction.net URL.

    Yields:
        Tuples of ``(int chapter number, title string)``.

    Raises:
        IndexError: If the page has no matching ``select`` element.
        ValueError: If a non-blank label does not start with an integer.
    """
    print("Retrieving chapter index...")
    document = get_document(
        "http://www.fanfiction.net/s/{}/{}/".format(story_id, 1))
    chapter_select = selector("select[name*=chapter]")(document)[0]
    for label in lxml.etree.ElementTextIterator(chapter_select):
        # The iterator also yields tail text; whitespace-only nodes between
        # options carry no chapter and would make int() raise.
        if not label.strip():
            continue
        n, _, title = label.partition(". ")
        yield (int(n), title)
def get_chapter_text(story_id, chapter):
    """Fetch one chapter page and return its story text as a single string.

    Args:
        story_id: Story identifier inserted into the fanfiction.net URL.
        chapter: Chapter number inserted into the fanfiction.net URL.

    Returns:
        The text nodes of the first ``#storytext`` element, joined with
        blank lines.
    """
    print("Retrieving text of chapter {}/{}".format(story_id, chapter))
    chapter_url = "http://www.fanfiction.net/s/{}/{}/".format(story_id, chapter)
    page = get_document(chapter_url)
    find_story_text = selector("#storytext")
    body = find_story_text(page)[0]
    # Merge inline markup into the surrounding text so that italic, bold and
    # link runs do not become separate text fragments.
    lxml.etree.strip_tags(body, "i", "b", "a")
    fragments = lxml.etree.ElementTextIterator(body)
    return "\n\n".join(fragments)
def get_text_by_chapter(story_id, chapters):
    """Yield ``(number, title, body)`` for every chapter in ``chapters``.

    Args:
        story_id: Story identifier forwarded to ``get_chapter_text``.
        chapters: Iterable of ``(number, title)`` pairs.

    Yields:
        The chapter number, its title, and the chapter text prefixed with a
        "Chapter N: Title" heading.
    """
    for number, chapter_title in chapters:
        chapter_body = get_chapter_text(story_id, number)
        narration = "Chapter {}: {}\n\n{}".format(
            number, chapter_title, chapter_body)
        yield number, chapter_title, narration
def say(s, *a):
    """Run the ``say`` command, feeding it ``s`` on standard input.

    Args:
        s: Data written to the process's stdin.
        *a: Extra command-line arguments appended after ``say``.
    """
    command = ["say"]
    command.extend(a)
    process = subprocess.Popen(command, stdin=subprocess.PIPE)
    # communicate() writes the input, closes stdin and waits for exit.
    process.communicate(s)
import mutagen.m4a | |
def _filename_part(value):
    """Return ``value`` as text that is safe to use inside one file name."""
    # A "/" would be read as a directory separator and send the output file
    # into a subdirectory that does not exist.
    return "{}".format(value).replace("/", "-")


def dump_story(story_id, story_title="", story_author="", cover=None):
    """Generate one tagged m4a file per chapter of a story.

    Each chapter is fetched, spoken to a file with ``say -o``, and then
    tagged through ``mutagen.m4a.M4A``.

    Args:
        story_id: Story identifier used to fetch the chapters; also used in
            file names when ``story_title`` is empty.
        story_title: Optional title, spoken before chapter 1, used as the
            file name prefix, and stored as the album tag.
        story_author: Optional author, spoken before chapter 1 and stored as
            the artist tag.
        cover: Optional value stored under the "covr" tag when truthy.
    """
    chapters = list(get_chapters(story_id))
    for n, title, text in get_text_by_chapter(story_id, chapters):
        if n == 1:
            # Author is prepended first so the title ends up ahead of it.
            if story_author:
                text = "by " + story_author + "\n\n" + text
            if story_title:
                text = story_title + "\n\n" + text
        filename = "{}-{:03d}-{}.m4a".format(
            _filename_part(story_title or story_id), n, _filename_part(title))
        print("Writing {}".format(filename))
        say(text, "-o", filename)
        print("Writing meta info")
        info = mutagen.m4a.M4A(filename)
        info["trkn"] = (n, len(chapters))
        info["\xa9nam"] = "Chapter {}: {}".format(n, title)
        if story_title:
            info["\xa9alb"] = story_title
        if story_author:
            info["\xa9ART"] = story_author
        if cover:
            info["covr"] = cover
        info["\xa9cmt"] = "Generated using Mac OS X 10.6's Speech Synthesis by a script available at https://gist.github.com/973183"
        info.save()
# Example run: build the audiobook for story 5782108, using cover.jpg from the
# current directory as the artwork for every chapter file.
with open("cover.jpg", "rb") as cover_file:
    # Read inside a with-block so the image file is closed straight away.
    cover = mutagen.m4a.M4ACover(cover_file.read(), mutagen.m4a.M4ACover.FORMAT_JPEG)
dump_story(5782108, "Harry Potter and the Methods of Rationality", "Eliezer Yudkowsky", cover)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment