Skip to content

Instantly share code, notes, and snippets.

@cnbeining
Last active February 14, 2018 08:19
Show Gist options
  • Save cnbeining/17a9a58b4a3d76f72d50 to your computer and use it in GitHub Desktop.
Batch download pp.163.com | 批量下载网易摄影 pp.163.com 的照片
#!/usr/bin/env python
#coding:utf-8
# Author: Beining http://www.cnbeining.com/ cnbeining[at]gmail.com
# Purpose: Batch download pp.163.com
# Created: 03/04/2015
# License: GNU GPL 2.0 https://www.gnu.org/licenses/gpl-2.0.html
import os
import sys
import unittest
import urllib2
import logging
import re
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
import getopt
import subprocess
# Module-level configuration.
# NOTE(review): a `global` statement at module scope is a no-op — these
# names are module globals already; kept unchanged (comment-only update).
global DOWNLOAD_SOFTWARE, FAKE_HEADER, LOCATION_DIR, resolution
# Spoofed desktop-Chrome headers so pp.163.com serves the normal page.
FAKE_HEADER = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.16 Safari/537.36',
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache'}
# Directory the script was started from; album folders are created under it.
LOCATION_DIR = os.getcwd()
# External download tool invoked per image: 'wget', 'aria2c', 'curl' or 'axel'.
DOWNLOAD_SOFTWARE = 'wget'
#----------------------------------------------------------------------
def page_reader(url):
    """str->str
    Fetch *url* and return the raw response body.

    Sends the spoofed browser headers in FAKE_HEADER so pp.163.com
    serves the regular desktop page.
    """
    req = urllib2.Request(url, headers=FAKE_HEADER)
    return urllib2.urlopen(req).read()
#----------------------------------------------------------------------
def page_parser(webpage):
    """str->dict
    Parse an album page (e.g. http://pp.163.com/daowuzhe123/pp/13424132.html)
    into a {photoId: photo-info-dict} mapping.

    Side effects: creates the album folder under LOCATION_DIR and
    chdirs into it before returning.
    """
    logging.info('Retriving purl...')
    # Find the "purl" line: a .js resource that holds the photo list, e.g.
    # http://s1.ph.126.net/WwP8GD1A3ocjPfENOdgrdQ==/192414543510075.js
    # (no break — the last matching line wins)
    for i in webpage.split('\n'):
        if 'purl' in i:
            purl = 'http://' + i.strip()[6:-2]
    # Album title is on a "name:" line; the page is GBK-encoded
    # (Python 2 byte-string .decode).
    for i in webpage.split('\n'):
        if 'name:' in i:
            folder_name = i.decode('gbk').strip()[7:-2]
            print(folder_name)
            break
    try:
        os.mkdir(folder_name)
    except Exception:
        # Best-effort: folder probably exists from a previous run.
        pass
    os.chdir(LOCATION_DIR + '/' + folder_name)
    # The .js payload contains a JS array of photo objects: [{...},{...},...]
    purl_data = page_reader(purl)
    purl_processed = purl_data.split('[{')[1].split('}]')[0].split('},{')
    purl_processed_list = ['{' + i + '}' for i in purl_processed]
    # Quote the bare JS object keys so each record parses as a Python dict.
    pattern = r"([a-zA-Z_][a-zA-Z_0-9]*)\s*\:"
    repl = lambda match: '"{}":'.format(match.group(1))
    dict_big = {}
    for i in purl_processed_list:
        dict_this = {}
        # SECURITY(review): eval of remote page content — attacker-controlled
        # input could execute code here; a JSON parser would be safer.
        dict_this = eval(re.sub(pattern, repl, i))
        photoId = dict_this['photoId']
        dict_big[photoId] = dict_this
    return dict_big
#----------------------------------------------------------------------
def download_video_link(task):
    """Download one image with the configured external downloader.

    task: a (filename, download_software, img_url) tuple, as produced by
    parse_list() and handed over by the thread pool's map().

    Fixes vs. the original:
    - restores the '{filename}' placeholders in the command templates
      (they were lost, so every command dropped the output filename);
    - explicit tuple parameter instead of Python-2-only tuple unpacking
      in the signature (same call interface for pool.map);
    - raises ValueError for an unknown downloader instead of hitting a
      NameError on the undefined `cmd`.
    """
    filename, tool, img_url = task
    logging.info('Downloading {filename}...'.format(filename=filename))
    if tool == 'aria2c':
        cmd = 'aria2c -c -k1M --out "{filename}" "{img_url}"'
    elif tool == 'wget':
        cmd = 'wget -c -O "{filename}" "{img_url}"'
    elif tool == 'curl':
        # `-C -` asks curl to auto-resume from the existing file size.
        cmd = 'curl -L -C - -o "{filename}" "{img_url}"'
    elif tool == 'axel':
        cmd = 'axel -o "{filename}" "{img_url}"'
    else:
        raise ValueError('Unsupported downloader: ' + tool)
    cmd = cmd.format(filename=filename, img_url=img_url)
    logging.debug(cmd)
    execute_cmd(cmd)
#----------------------------------------------------------------------
def execute_cmd(cmd):
    """Run *cmd* through the shell, discarding its output; return the exit code.

    A non-zero exit is only logged as a warning — one failed download
    must not abort the whole batch.
    """
    return_code = subprocess.call(cmd, shell=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    if return_code != 0:
        logging.warning('ERROR')
    return return_code
#----------------------------------------------------------------------
def parse_list(img_dict, resolution):
    """dict->list
    Turn the parsed photo dict into (filename, downloader, url) tasks.

    *resolution* selects which size key of each photo record to fetch
    (murl/surl/lurl/turl/qurl).
    """
    tasks = []
    for photo in img_dict.values():
        path = photo[resolution]
        # First character of the stored path is the imgN shard host number.
        img_url = 'http://img' + path[0] + '.ph.126.net' + path[1:]
        name = '{0}.{1}'.format(photo['photoId'], path.split('.')[-1])
        tasks.append((name, DOWNLOAD_SOFTWARE, img_url))
    return tasks
#----------------------------------------------------------------------
def downloader(down_list, workers=5):
    """Fan the download tasks out to a small pool of worker threads.

    Threads (multiprocessing.dummy), not processes, are appropriate:
    the work is I/O-bound shelling out to the external download tool.
    """
    pool = ThreadPool(int(workers))
    pool.map(download_video_link, down_list)
    pool.close()
    pool.join()
#----------------------------------------------------------------------
def main(link, resolution):
    """Download every photo of one pp.163.com album at the given resolution."""
    album_html = page_reader(link)
    photos = page_parser(album_html)
    tasks = parse_list(photos, resolution)
    downloader(tasks, 5)
if __name__ == '__main__':
    # Usage: python 163pp.py <resolution> URL [URL ...]
    # <resolution> is one of the photo size keys: murl/surl/lurl/turl/qurl.
    resolution = sys.argv[1]
    argv_list = sys.argv[2:]
    for link in argv_list:
        # page_parser() chdirs into each album's folder, so reset to the
        # starting directory before every album.
        os.chdir(LOCATION_DIR)
        main(link, resolution)
    print('Done!')

Batch download pp.163.com

批量下载网易摄影 pp.163.com 的照片

使用方法:

先在脚本里把线程数和下载器改成你需要的，然后：

python 163pp.py [分辨率] URL1 URL2...

分辨率:
murl 中等
surl 小
lurl 很小（具体尺寸不确定）
turl 比小还小
qurl 正方形
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment