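# -*- coding: utf-8 -*-
"""Greenlet-based FLV downloader (Python 2).

Crawls paginated empflix category listings with pyquery, resolves the
FLV download URL from each detail page, and saves files to ``save_path``,
resuming partial downloads via HTTP Range requests.
"""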
import re
import urllib
from os import path
from urlparse import urlparse

import eventlet
from eventlet.green import urllib2
from pyquery import PyQuery as pq
from werkzeug import secure_filename

search_urls = [
    'http://www.empflix.com/browsecat.php?page=%s&chid=17&category=rd',
]
detail_urls = []

# Prefix each saved file with the video id taken from the detail-page URL.
id_mode = True
save_path = "/home/ma2/Public/empflix/"

# Limit concurrent downloads to two greenlets.
pool = eventlet.GreenPool(2)

# Matches the flash player config URL embedded in the page markup, i.e. a
# line of the form "so.addVariable('config', '<escaped url>');".
download_re = re.compile(
    r"\s*so\.addVariable\('config',\s*'([\w\d.:/%=_-]*)'\);", re.M)
def get_pagelist(url, page=1):
    """Return the detail-page URLs listed on one page of a category."""
    q = []
    conn = urllib2.urlopen(url % page)
    html = conn.read()
    d = pq(html)
    for span in d(".thumb"):
        detail_url = pq(span.find("a")).attr.href
        q.append(detail_url)
    return q
def _get_flv(page):
    """Fall back to the flash player config to find the FLV URL."""
    match = download_re.search(page)
    if not match:
        return None, None
    url = urllib.unquote(match.group(1))
    # The config URL returns XML; its <file> element holds the real FLV URL.
    conn = urllib2.urlopen(url)
    data = conn.read()
    d = pq(data)
    download_url = d("file").text()
    d = pq(page)
    file_name = secure_filename(d("h2:first").text() + ".flv")
    return download_url, file_name
def get_download_url(url):
    """Resolve a detail-page URL to (url, download_url, file_name)."""
    conn = urllib2.urlopen(url)
    page = conn.read()
    d = pq(page)
    download_url = d(".downloadButton").attr.href
    if download_url:
        parsed = urlparse(download_url)
        file_name = parsed.path.split("/")[-1]
    else:
        # No download button on the page; scrape the player config instead.
        download_url, file_name = _get_flv(page)
    return url, download_url, file_name
def download_flv(url, down_url, file_name):
    print "'%s' ---- Try Download ----" % url
    if not file_name:
        print "'%s' ** Not Found Link ** " % url
        return
    out_path = path.join(save_path, file_name)
    partial = False
    try:
        conn = urllib2.urlopen(down_url)
        length = int(conn.info()['Content-Length'])
        # Skip implausible sizes: under 50MB or over 900MB.
        if length < 1024 * 1024 * 50 or length > 1024 * 1024 * 900:
            print "*** '%s' size out of range! Skip!!! '%s' ***" % (url, length)
            return
        if path.exists(out_path):
            size = path.getsize(out_path)
            if size < length:
                # A partial file is on disk: resume with an HTTP Range request.
                req = urllib2.Request(down_url,
                                      headers={"Range": "bytes=%s-" % size})
                conn = urllib2.urlopen(req)
                print "'%s' == Resume!! '%s' ==" % (url, file_name)
                print "'%s' == File '%s' Size: %d/%d" % (url, file_name, size, length)
                partial = True
            else:
                print "'%s' == Downloaded '%s' ==" % (url, file_name)
                return
    except Exception:
        import traceback
        print traceback.format_exc()
        # Requeue the whole download on any network error.
        pool.spawn_n(download, url)
        return
    if partial:
        f = open(out_path, "rb+")
        f.seek(0, 2)  # append from the end of the partial file
    else:
        f = open(out_path, "wb")
    try:
        print "'%s' == Start '%s' ==" % (url, file_name)
        while True:
            data = conn.read(1024 * 512)
            if not data:
                break
            f.write(data)
    finally:
        f.close()
    print "'%s' == Finish '%s' ==" % (url, file_name)
def download(url):
    url, download_url, file_name = get_download_url(url)
    if not download_url:
        print "'%s' ** Not Found Link ** " % url
        return
    # The detail URL query looks like "id=12345"; strip the "id=" prefix.
    video_id = urlparse(url).query[3:]
    if id_mode:
        file_name = video_id + "_" + file_name
    if not download_url.startswith('#'):
        if file_name.lower().find('mosaic') == -1:
            download_flv(url, download_url, file_name)
q = []

def start(url, min_page=1, max_page=12):
    for i in xrange(min_page, max_page + 1):
        urls = get_pagelist(url, page=i)
        q.extend(urls)
    # Pop from the end, so reverse to process the earliest pages first.
    q.reverse()
    while q:
        url = q.pop()
        pool.spawn_n(download, url)
def read_detail_urls(file='empflix.txt'):
    # Load previously saved detail-page URLs, one per line.
    for href in open(file):
        href = href.strip()
        if href:
            detail_urls.append(href)
if __name__ == '__main__':
    for url in search_urls:
        start(url=url)
    pool.waitall()
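# Sketch of an alternate entry point (not wired up in the original flow):
# consume a saved URL list via read_detail_urls() instead of crawling the
# category pages. 'empflix.txt' is assumed to hold one detail URL per line.
#
#     read_detail_urls('empflix.txt')
#     for url in detail_urls:
#         pool.spawn_n(download, url)
#     pool.waitall()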