Created
March 22, 2012 05:12
-
-
Save darvell/2156265 to your computer and use it in GitHub Desktop.
GH Downloader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# I am totally aware BeautifulSoup is slow.
# Please do not inform me, this is just for fun.
import urllib2 | |
import BeautifulSoup | |
import re | |
import os | |
import sys | |
from threading import Thread | |
def slash():
    """Return the path separator used by the current platform.

    Delegates to ``os.sep`` rather than special-casing ``os.name``: the
    result is a backslash on Windows ("nt") and a forward slash on POSIX
    systems, which is what the previous hand-written check produced there.
    """
    return os.sep
def replace_all(text, dic):
    """Return *text* with every key of *dic* replaced by its value.

    Replacements are applied one after another in the mapping's iteration
    order, so the output of an earlier substitution can be matched by a
    later one.

    Args:
        text: The string to transform.
        dic: Mapping of substring -> replacement string.

    Returns:
        The transformed string; *text* is returned unchanged when *dic*
        is empty or none of its keys occur.
    """
    # items() is available on Python 2 and Python 3 mappings alike,
    # whereas iteritems() exists on Python 2 only.
    for old, new in dic.items():
        text = text.replace(old, new)
    return text
class download(Thread):
    """Worker thread that saves one remote file into the soundtrack folder.

    The file is written to ``<cwd>/<SOUNDTRACK_NAME>/<last URL segment>``.
    ``SOUNDTRACK_NAME`` is a module-level global read when ``run`` executes,
    so it must be assigned before the thread is started.
    """

    def __init__(self, url):
        """Remember *url*; the transfer itself happens in ``run``."""
        Thread.__init__(self)
        self.url = url

    def run(self):
        """Stream ``self.url`` to disk in 16 KiB chunks, then report it."""
        chunk_size = 16 * 1024
        # The last path segment of the URL doubles as the local file name.
        file_name = self.url.split('/')[-1]
        target = os.path.join(os.getcwd(), SOUNDTRACK_NAME, file_name)
        response = urllib2.urlopen(self.url)
        try:
            with open(target, 'wb') as fp:
                while True:
                    chunk = response.read(chunk_size)
                    # An empty read marks the end of the stream.
                    if not chunk:
                        break
                    fp.write(chunk)
        finally:
            # Release the connection even if a read or write fails.
            response.close()
        # Single-argument parenthesised form: prints the same text under
        # Python 2 and is also valid Python 3 syntax.
        print('%s complete.' % file_name)
BASE_URL = "http://gh.ffshrine.org/"
SOUNDTRACK_ID = int(raw_input('Soundtrack ID (e.g. http://gh.ffshrine.org/soundtracks/NUMBER/): '))
# Percent-escapes expected in the embedded download URLs, mapped to the
# characters they stand for.
html_dict = {'%3A':':','%2F':'/','%27':"'",'%22':'"','%3B':';','%28':'(','%29':')'}

page_data = urllib2.urlopen(BASE_URL + "soundtracks/" + str(SOUNDTRACK_ID)).read()
soup = BeautifulSoup.BeautifulSoup(page_data)
# Title comes from the first <h1>. The slice assumes the tag renders as a
# 4-character opening, the name, then a fixed 19-character tail -- TODO
# confirm against the site's current markup.
SOUNDTRACK_NAME = str(soup.findAll('h1')[0])[4:-19]
url_list = []
print('Found soundtrack: %s' % SOUNDTRACK_NAME)
print('Parsing all pages for download links. This may take a while.')

# Every "/song/..." link on the soundtrack page leads to a per-track page
# whose inline script holds the real file URL.
for a in soup.findAll('a', attrs={'href': re.compile("^/song/")}):
    song_url = a['href']
    # Drop the leading "/" because BASE_URL already ends with one.
    download_page_data = urllib2.urlopen(BASE_URL + song_url[1:]).read()
    temp_soup = BeautifulSoup.BeautifulSoup(download_page_data)
    for script in temp_soup.findAll('script'):
        script_text = str(script)
        if 'var data' in script_text:
            # Actual stuff we want always is between chars 42 and
            # len(script) - 38 (fixed offsets; presumably tied to the
            # site's script template -- verify if parsing breaks).
            java = replace_all(script_text[42:-38], html_dict)
            # The URL ends at the closing quote of the JS string literal.
            url_list.append(java[0:java.index('";')])

print("List built, starting download.")

# The target directory may be left over from an earlier run; only that
# case is tolerated. Any other failure (e.g. permissions) is re-raised
# here instead of surfacing later inside every download thread.
try:
    os.mkdir(SOUNDTRACK_NAME)
except OSError:
    if not os.path.isdir(SOUNDTRACK_NAME):
        raise

# One worker thread per track, all started immediately.
threadlist = []
for track_url in url_list:
    cur_thread = download(track_url)
    threadlist.append(cur_thread)
    cur_thread.start()
    print('Started thread for: %s' % track_url.split('/')[-1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment