Created
December 1, 2011 19:24
-
-
Save EntityReborn/1419167 to your computer and use it in GitHub Desktop.
Downloader for STaD
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib2, os, urlparse, re | |
from BeautifulSoup import BeautifulSoup | |
# User variables
DESTFOLDER = "STAD"     # local folder every downloaded file is stored in
OVERWRITEHTML = True    # re-download pages even if a local copy exists
OVERWRITEPICS = False   # re-download images even if a local copy exists
LASTCHAPTER = 81        # last chapter number to fetch (0 is the prologue)

# Shouldn't need to modify these constants unless downloading another story
BASEURL = "http://six.clubetchi.com/6times/"
FILEPATTERN = "std-part%02d.htm"

# Fonts, license and stylesheet the pages depend on; fetched once, as-is.
EXTRAFILES = [
    "CoveredByYourGrace.ttf",
    "DancingScript.ttf",
    "GiveYouGlory.ttf",
    "OvertheRainbow.ttf",
    "CoveredByYourGrace.eot",
    "DancingScript.eot",
    "GiveYouGlory.eot",
    "OvertheRainbow.eot",
    "GoogleOpenFontLicense.txt",
    "std-read.css",
]

# Non-chapter pages that are downloaded together with their images.
EXTRAPAGES = [
    "std-characters.htm",
    "std-backstory.htm",
    "std-artwork.htm",
    "6times.htm",
]


def _renamePartLink(href):
    """Map a chapter link such as "part-05" to its local file name.

    The matching pattern accepts an optional "./" prefix, so it is dropped
    here; otherwise the result would be "std-./part05.htm", which is not a
    file this script ever writes.
    """
    if href.startswith("./"):
        href = href[2:]
    return "std-{0}.htm".format(href.replace("-", ""))


# Maps a pattern matched against <a href="..."> values to the replacement
# href: either a fixed string or a callable that receives the original href.
LINKRENAMES = {
    re.compile(r"^(\./)?contents"): "6times.htm",
    re.compile(r"^(\./)?characters"): "std-characters.htm",
    re.compile(r"^(\./)?part-.*"): _renamePartLink,
}
def main():
    """Download the support files, the extra pages and every chapter.

    Everything is stored under DESTFOLDER. Support files that already
    exist locally are skipped; pages and their images are handled by
    dlWithPix.
    """
    # Make sure our destination folder is available.
    if not os.path.exists(DESTFOLDER):
        os.makedirs(DESTFOLDER)

    # Support files.
    for f in EXTRAFILES:
        fpath = localFile(f)
        if os.path.exists(fpath):
            continue

        print("Downloading %s..." % f)
        u = urllib2.urlopen(remoteFile(f))
        try:
            data = u.read()
        finally:
            # Release the connection even if the read fails.
            u.close()

        # Binary mode: the list contains fonts (.ttf/.eot), which would be
        # corrupted by newline translation in text mode.
        with open(fpath, "wb") as fl:
            fl.write(data)

    # Supporting pages.
    # Be aware that for these extrapages, some links might not work.
    for page in EXTRAPAGES:
        dlWithPix(localFile(page), remoteFile(page))

    # The actual story! Yay!
    for x in range(0, LASTCHAPTER + 1):  # #0 is the prologue/intro
        dlWithPix(localFile(FILEPATTERN % x), remoteFile(FILEPATTERN % x))
def localFile(fname):
    """Return the path of *fname* inside the DESTFOLDER download folder."""
    target = os.path.join(DESTFOLDER, fname)
    return target
def remoteFile(fname):
    """Return the URL obtained by resolving *fname* against BASEURL."""
    address = urlparse.urljoin(BASEURL, fname)
    return address
def dlHtml(dest, url, updatecallback=None):
    """Return the parsed page for *url*, downloading it when needed.

    When *dest* does not exist, or OVERWRITEHTML is set, the page is
    fetched, its links are rewritten according to LINKRENAMES, and the
    prettified result is written to *dest*. Otherwise the existing local
    copy at *dest* is parsed instead.

    dest           -- local path of the HTML file.
    url            -- remote address of the page.
    updatecallback -- optional callable that receives a progress message.

    Returns the BeautifulSoup object for the page.
    """
    if os.path.exists(dest) and not OVERWRITEHTML:
        # Reuse the local copy.
        with open(dest, "r") as f:
            return BeautifulSoup(f.read())

    if callable(updatecallback):
        # Progress reporting is best-effort; a failing callback must not
        # abort the download.
        try:
            updatecallback("Downloading %s" % dest)
        except Exception:
            pass

    u = urllib2.urlopen(url)
    try:
        soup = BeautifulSoup(u.read())
    finally:
        # Release the connection even if reading or parsing fails.
        u.close()

    for before, after in LINKRENAMES.items():
        for link in soup.findAll("a", href=before):
            # Allow for lambdas/callables.
            if callable(after):
                link["href"] = after(link["href"])
            else:
                link["href"] = after

    with open(dest, "w") as f:
        f.write(soup.prettify())

    return soup
def dlPic(dest, url, updatecallback=None):
    """Download the image at *url* to *dest*.

    Nothing happens when *dest* already exists and OVERWRITEPICS is not
    set. Any missing parent directories of *dest* are created first.

    dest           -- local path the image is written to.
    url            -- remote address of the image.
    updatecallback -- optional callable that receives a progress message.
    """
    if os.path.exists(dest) and not OVERWRITEPICS:
        return

    if callable(updatecallback):
        # Progress reporting is best-effort; a failing callback must not
        # abort the download.
        try:
            updatecallback("Downloading %s" % dest)
        except Exception:
            pass

    u = urllib2.urlopen(url)
    try:
        pic = u.read()
    finally:
        # Release the connection even if the read fails.
        u.close()

    # Create the whole parent path. The previous code passed a list to
    # os.path.join and then created only its first component, so images
    # in nested folders could not be written.
    parent = os.path.dirname(dest)
    if parent and not os.path.isdir(parent):
        try:
            os.makedirs(parent)
        except OSError:
            # Only tolerate the "already exists" case.
            if not os.path.isdir(parent):
                raise

    with open(dest, "wb") as f:
        f.write(pic)
def dlWithPix(dest, url):
    """Fetch the page at *url* into *dest*, then fetch each of its images.

    Progress messages from both steps are printed to standard output.
    """
    def report(msg):
        print(msg)

    page = dlHtml(dest, url, report)

    # Collect every src up front, before any image is requested.
    sources = [tag["src"] for tag in page.findAll("img")]
    for src in sources:
        dlPic(localFile(src), remoteFile(src), report)
if __name__ == "__main__":
    # Run the downloader; Ctrl-C prints a farewell instead of a traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("\nGoodbye.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment