Last active
January 2, 2016 03:49
-
-
Save seven332/685d41fdccc035cd4410 to your computer and use it in GitHub Desktop.
Download comic zip archives from http://www.tuku.cc/ (requires a logged-in session cookie).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
import os | |
import re | |
import sys | |
import zipfile | |
import requests | |
# Cookie for an authenticated tuku.cc session. Replace the placeholder value
# with the 'Cookie' request header copied from your logged-in browser session;
# the download endpoints below reject anonymous requests.
headers = {'Cookie': 'your cookie'}
def ensure_dir(directory):
    """Create *directory* (including parents) unless it already exists."""
    if os.path.exists(directory):
        return
    os.makedirs(directory)
def print_same_line(string):
    """Rewrite the current terminal line in place with *string* (no newline)."""
    line = '\r%s' % string
    sys.stdout.write(line)
    sys.stdout.flush()
# http://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py
def download_file(url, local_file):
    """Stream *url* into *local_file* in 1 KiB chunks.

    When the server reports a Content-Length, a percentage progress line is
    rewritten in place via print_same_line; otherwise no progress is shown.
    """
    r = requests.get(url, stream=True)
    try:
        content_length = r.headers.get('Content-Length')
        # size == -1 means "unknown length": skip the progress display.
        if content_length:
            size = int(content_length)
        else:
            size = -1
        downloaded = 0
        if size != -1:
            print_same_line('0.00%')
        with open(local_file, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    # No per-chunk flush: the file object buffers, and the
                    # with-block flushes and closes on exit.
                    downloaded += len(chunk)
                    if size != -1:
                        print_same_line('%.2f%%' % (downloaded * 100 / float(size)))
        if size != -1:
            # Terminate the \r progress line so subsequent output starts
            # on a fresh line (the original's trailing `if` had lost its body).
            sys.stdout.write('\n')
    finally:
        # Always release the streamed connection back to the pool.
        r.close()
def get_download_url(part_url):
    """Resolve the final archive URL for a single downloadable part page."""
    part_page = requests.get(part_url, headers=headers)
    jsonp_match = re.search(r'var downurl = Tk\.jsonp_url\+"([^"]+)"', part_page.content)
    # The part page only yields a jsonp fragment; fetch it to get the real link.
    jsonp_page = requests.get('http://rtk.um5.cc/' + jsonp_match.group(1), headers=headers)
    unescaped = jsonp_page.content.replace('\/', '/')
    link_match = re.search(r'\("<a href=\\"([^"]+)\\"', unescaped)
    return link_match.group(1)
def get_download_url_list(comic_url):
    """Scrape *comic_url*: return (sanitized comic title, part URLs oldest-first)."""
    page = requests.get(comic_url)
    title_match = re.search(r'<li class="current"><h1>([^<]+)</h1></li>', page.content)
    # Re-encode the UTF-8 title to GBK (presumably for a GBK filesystem —
    # confirm) and strip characters Windows forbids in file names.
    forbidden = ('\\', '/', ':', '*', '?', '"', '<', '>', '|')
    raw_title = title_match.group(1).decode('utf-8').encode('gbk')
    name = ''.join(c for c in raw_title if c not in forbidden).rstrip()
    part_pattern = re.compile(r'<a href="([^"]+)" title=".+?">下载</a>')
    url_list = ['http://www.tuku.cc' + m.group(1) for m in part_pattern.finditer(page.content)]
    # The newest part is on the top, so reverse the list
    url_list.reverse()
    return name, url_list
# Sometimes download failed, the zip file is broken | |
def test_zip(zip_file_path): | |
try: | |
zipfile.ZipFile(zip_file_path) | |
return True | |
except (zipfile.BadZipfile): | |
print '*~*~*~*~*~*~*~*~ Warning Bad Zip File ~*~*~*~*~*~*~*~*' | |
os.remove(zip_file_path) | |
return False | |
def download_comic(comic_url): | |
download_dir, part_url_list = get_download_url_list(comic_url) | |
ensure_dir(download_dir) | |
error_times = 0 | |
i = 0 | |
for part_url in part_url_list: | |
i += 1 | |
print '### Part', i | |
print '* Part Url', part_url | |
file_name = '%04d.zip' % i | |
print '* File Name', file_name | |
file_path = os.path.join(download_dir, file_name) | |
print '* File Path', file_path | |
# Check exists | |
if os.path.exists(file_path) and test_zip(file_path): | |
print '* Exists', file_name | |
continue | |
file_url = get_download_url(part_url) | |
print '* File Url', file_url | |
download_file(file_url, file_path) | |
if not test_zip(file_path): | |
print '* Part %d Download Failed' % i | |
error_times += 1 | |
else: | |
print '* Part %d Download Complete' % i | |
return error_times | |
def download_comic_completely(comic_url): | |
turn = 1 | |
while True: | |
print '## Turn', turn | |
if not download_comic(comic_url): | |
break | |
else: | |
turn += 1 | |
def main(): | |
for url in sys.argv[1:]: | |
print '# Start Comic', url | |
download_comic_completely(url) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment