Download archives from http://www.tuku.cc/

# coding=utf-8
import os
import re
import sys
import zipfile

import requests

# Your cookie for www.tuku.cc; paste your own value here
headers = {'Cookie': 'your cookie'}


def ensure_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)


def print_same_line(string):
    # Overwrite the current console line to show download progress in place
    sys.stdout.write('\r%s' % string)
    sys.stdout.flush()


# http://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py
def download_file(url, local_file):
    r = requests.get(url, stream=True)
    content_length = r.headers.get('Content-Length')
    if content_length:
        size = int(content_length)
    else:
        size = -1
    downloaded = 0
    if size != -1:
        print_same_line('0.00%')
    # Stream the response to disk in 1 KB chunks, reporting progress when the size is known
    with open(local_file, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                f.flush()
                downloaded += len(chunk)
                if size != -1:
                    print_same_line('%.2f%%' % (downloaded * 100 / float(size)))
    if size != -1:
        print


def get_download_url(part_url):
    # The part page embeds a jsonp path; requesting it from rtk.um5.cc returns
    # the markup that carries the real download link
    r = requests.get(part_url, headers=headers)
    p = re.compile(r'var downurl = Tk\.jsonp_url\+"([^"]+)"')
    m = p.search(r.content)
    another_url = 'http://rtk.um5.cc/' + m.group(1)
    r = requests.get(another_url, headers=headers)
    p = re.compile(r'\("<a href=\\"([^"]+)\\"')
    m = p.search(r.content.replace('\\/', '/'))
    return m.group(1)


def get_download_url_list(comic_url):
    r = requests.get(comic_url)
    # The comic title becomes the download directory name; strip characters that
    # are not allowed in Windows file names (the GBK re-encode is presumably for
    # a Chinese-locale file system)
    p = re.compile(r'<li class="current"><h1>([^<]+)</h1></li>')
    m = p.search(r.content)
    name = ''.join(c for c in m.group(1).decode('utf-8').encode('gbk')
                   if c not in ('\\', '/', ':', '*', '?', '"', '<', '>', '|')).rstrip()
    # '下载' is the Chinese label for 'Download' on the part links
    p = re.compile(r'<a href="([^"]+)" title=".+?">下载</a>')
    l = []
    for m in p.finditer(r.content):
        l.append('http://www.tuku.cc' + m.group(1))
    # The newest part is at the top, so reverse the list into reading order
    l.reverse()
    return name, l


# Sometimes a download fails and leaves a broken zip file behind
def test_zip(zip_file_path):
    try:
        # Opening the archive raises BadZipfile if the central directory is missing or corrupt
        zipfile.ZipFile(zip_file_path)
        return True
    except zipfile.BadZipfile:
        print '*~*~*~*~*~*~*~*~ Warning Bad Zip File ~*~*~*~*~*~*~*~*'
        os.remove(zip_file_path)
        return False


def download_comic(comic_url):
    download_dir, part_url_list = get_download_url_list(comic_url)
    ensure_dir(download_dir)
    error_times = 0
    i = 0
    for part_url in part_url_list:
        i += 1
        print '### Part', i
        print '* Part Url', part_url
        file_name = '%04d.zip' % i
        print '* File Name', file_name
        file_path = os.path.join(download_dir, file_name)
        print '* File Path', file_path
        # Skip parts that already exist and pass the zip check
        if os.path.exists(file_path) and test_zip(file_path):
            print '* Exists', file_name
            continue
        file_url = get_download_url(part_url)
        print '* File Url', file_url
        download_file(file_url, file_path)
        if not test_zip(file_path):
            print '* Part %d Download Failed' % i
            error_times += 1
        else:
            print '* Part %d Download Complete' % i
    return error_times


def download_comic_completely(comic_url):
    # Keep re-running download_comic until a full pass finishes with no broken parts
    turn = 1
    while True:
        print '## Turn', turn
        if not download_comic(comic_url):
            break
        else:
            turn += 1


def main():
    for url in sys.argv[1:]:
        print '# Start Comic', url
        download_comic_completely(url)
        print


if __name__ == "__main__":
    main()
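
# Usage (a sketch: "tuku.py" is an assumed file name for this gist, and each
# argument should be the tuku.cc comic page that lists the parts' 下载 links):
#     python tuku.py <comic-page-url> [<comic-page-url> ...]
# Fill in the 'Cookie' header at the top first. The script creates a directory
# named after the comic and saves each part as 0001.zip, 0002.zip, ...,
# repeating the whole run until every archive passes the zip check.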