|
#!/usr/bin/env python3 |
|
|
|
'''
Created on Apr 4, 2015

Crawl every album photo of a given Sina Weibo user and save the images locally.
The ID of the user to crawl is read from the command line.

requirements : threadpool, rsa
'''
|
|
|
import re |
|
import base64 |
|
import rsa |
|
import binascii |
|
import math |
|
import json |
|
import sys |
|
import os |
|
import threading |
|
import threadpool |
|
import urllib.request |
|
from http.cookiejar import CookieJar |
|
from urllib.request import urlopen |
|
from urllib.parse import urlencode, quote |
|
|
|
# CookieJar object that collects the cookies for this session.
cookie = CookieJar()
# Custom opener bound to the CookieJar above.
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
# Default request headers. The User-Agent is assembled from adjacent string
# literals instead of backslash continuations inside a single literal, so the
# indentation of the continuation lines can never leak into the header value.
opener.addheaders = [
    ('User-Agent',
     'Mozilla/5.0 (X11; Linux x86_64) '
     'AppleWebKit/537.36 (KHTML, like Gecko) '
     'Ubuntu Chromium/40.0.2214.111 Chrome/40.0.2214.111 Safari/537.36'),
]
# Install the opener globally so every urlopen() call uses this configuration.
urllib.request.install_opener(opener)

uid = ''  # Global: ID of the user being crawled
photo_counter = 0  # Global: photo count
|
|
|
|
|
def get_data(url):
    """Fetch ``url`` with a GET request and return the body decoded as UTF-8.

    The response object is closed as soon as the body has been read, so the
    underlying connection is not left open.
    """
    with urlopen(url) as response:
        return response.read().decode('utf-8')
|
|
|
|
|
def post_data(url, param):
    """Send ``param`` form-encoded in a POST request and return the raw body.

    ``param`` is passed to ``urlencode``; the response is returned as undecoded
    bytes and is closed once it has been read.
    """
    body = urlencode(param).encode('utf-8')
    with urlopen(url, body) as response:
        return response.read()
|
|
|
|
|
def download_img(url, file):
    """Download ``url`` to the local path ``file`` unless ``file`` already exists.

    The request goes through ``urlopen`` rather than ``urlretrieve`` (the
    original note says the request must carry a 'User-Agent' header).

    The image is fetched completely before the target file is created. If the
    request fails, no empty file is left behind, so a later run will retry the
    download instead of treating the image as already present.
    """
    if os.path.exists(file):
        return

    print(url)
    with urlopen(url) as response:
        payload = response.read()
    with open(file, 'wb') as f:
        f.write(payload)
|
|
|
|
|
# NOTE(review): YOUR_USER_NAME and YOUR_PASSWD are not defined in the visible
# part of the file; presumably placeholders to be replaced with real
# credentials before running -- confirm.
def login_weibo(user=YOUR_USER_NAME, pwd=YOUR_PASSWD):
    """Log in to Weibo through the Sina SSO endpoints.

    Steps performed:
      1. GET the pre-login endpoint and extract ``servertime``, ``pubkey``,
         ``rsakv`` and ``nonce`` from the response text.
      2. Base64-encode the URL-quoted user name and RSA-encrypt
         ``servertime``, ``nonce`` and the password with the returned key.
      3. POST the login form and follow the ``location.replace(...)`` URL
         found in the (GBK-decoded) response.

    Args:
        user: account name.
        pwd: account password.

    Raises:
        IndexError: when one of the expected fields or the redirect URL
            cannot be found in a server response.
    """
    # Pre-login request: obtain the parameters needed for the real login.
    print('正在登录......')

    prelogin_url = (
        'http://login.sina.com.cn/sso/prelogin.php?'
        'entry=weibo&callback=sinaSSOController.preloginCallBack'
        '&su={:s}&rsakt=mod&client=ssologin.js(v1.4.18)&_=1428121516573'
        .format(user))
    preLogin = get_data(prelogin_url)

    # Pull the parameters out of the response with regular expressions.
    servertime = re.findall(r'"servertime":(.*?),', preLogin)[0]
    pubkey = re.findall(r'"pubkey":"(.*?)",', preLogin)[0]
    rsakv = re.findall(r'"rsakv":"(.*?)",', preLogin)[0]
    nonce = re.findall(r'"nonce":"(.*?)",', preLogin)[0]

    # User name: URL-quote, then base64.
    su = base64.b64encode(bytes(quote(user), 'utf-8')).decode("utf-8")

    # Password: RSA-encrypt "servertime<TAB>nonce<LF>password" with the
    # server-supplied modulus and public exponent 65537, then hex-encode.
    rsaPublickey = int(pubkey, 16)
    key = rsa.PublicKey(rsaPublickey, 65537)
    message = bytes('{:s}\t{:s}\n{:s}'.format(servertime, nonce, pwd), 'utf-8')
    sp = binascii.b2a_hex(rsa.encrypt(message, key)).decode("utf-8")

    # POST parameters.
    param = {
        'entry': 'weibo', 'gateway': 1, 'from': '',
        'savestate': 7, 'useticket': 1,
        'pagerefer': 'http://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D',
        'vsnf': 1, 'su': su, 'service': 'miniblog',
        'servertime': servertime, 'nonce': nonce,
        'pwencode': 'rsa2', 'rsakv': rsakv,
        'sp': sp, 'sr': '1680*1050',
        'encoding': 'UTF-8', 'prelt': 961,
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack'
    }

    # Log in.
    result = post_data(
        'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)', param).decode('gbk')

    # The login response only carries a redirect; it has to be fetched with a
    # GET as well, otherwise the login does not take effect.
    # Raw string: '\(' and '\)' are invalid escape sequences in a normal
    # string literal. The '.' is escaped so it matches a literal dot only.
    urll = re.findall(r"location\.replace\('(.*?)'\);", result)[0]
    get_data(urll)

    print('登录成功\n')
|
|
|
|
|
def decode_gallery_json(album_list, page):
    """Fetch one page of the album index and append its albums to ``album_list``.

    The album page is rendered by JavaScript, so the album data is read from
    the JSON endpoint the page itself requests. ``count=100`` is used to
    reduce the number of round trips to the server.

    Args:
        album_list: list that receives one dict per album with the keys
            ``album_id``, ``caption`` and ``type``; it is modified in place.
        page: page number to request.

    Returns:
        The ``total`` field of the response, which the caller uses to work
        out how many pages have to be fetched.
    """
    endpoint = 'http://photo.weibo.com/albums/get_all?uid={:s}&page={:d}&count=100'
    payload = json.loads(get_data(endpoint.format(uid, page)))['data']

    # Keep only the fields the crawler needs from every album entry.
    album_list.extend(
        {field: entry[field] for field in ('album_id', 'caption', 'type')}
        for entry in payload['album_list']
    )

    return payload['total']
|
|
|
|
|
def decode_album_json(album_id, photo_list, page, theType):
    """Fetch one page of an album's photo listing and collect the image URLs.

    Args:
        album_id: album identifier, inserted into the query string.
        photo_list: list that receives the image URLs; modified in place.
        page: page number to request.
        theType: value for the ``type`` query parameter (converted with
            ``str``).

    Returns:
        The ``total`` field of the response.
    """
    endpoint = 'http://photo.weibo.com/photos/get_all?uid={:s}&album_id={:s}&count=100&page={:d}&type={:s}'
    payload = json.loads(
        get_data(endpoint.format(uid, album_id, page, str(theType))))['data']

    # Store the finished image URL directly: <pic_host>/large/<pic_name>.
    photo_list.extend(
        '/large/'.join((photo['pic_host'], photo['pic_name']))
        for photo in payload['photo_list']
    )
    return payload['total']
|
|
|
|
|
def get_gallery():
    """Collect every album of the user together with its photo URLs.

    Page 1 of each listing is fetched first to learn the total; the remaining
    pages are then fetched from the last page down to page 2. The module-level
    ``photo_counter`` is increased by each album's reported total.

    Returns:
        dict mapping album caption to the list of image URLs; albums that
        share a caption are merged into one entry and empty albums are
        left out.
    """
    global photo_counter

    # Caption -> image URLs.
    gallery = {}
    print('正在获取相册信息......')

    # Collect the albums.
    albums = []
    total_albums = decode_gallery_json(albums, 1)
    last_album_page = int(math.ceil(total_albums / 100.0))
    for page in range(last_album_page, 1, -1):
        decode_gallery_json(albums, page)
    print('成功获取相册信息\n')

    # Collect the photos of every album.
    for album in albums:
        caption = album['caption']   # album title
        album_id = album['album_id']
        theType = album['type']      # the "type" URL parameter

        print('正在获取相册:', caption, '......')
        photos = []
        total_photos = decode_album_json(album_id, photos, 1, theType)
        last_photo_page = int(math.ceil(total_photos / 100.0))

        photo_counter += total_photos

        # Skip empty albums.
        if last_photo_page == 0:
            print('成功获取相册:相册为空\n')
            continue

        for page in range(last_photo_page, 1, -1):
            print(' --剩余{:d}{:s}'.format(page - 1, '00张......'))
            decode_album_json(album_id, photos, page, theType)

        # Albums with the same caption are merged into one entry.
        if caption in gallery:
            gallery[caption].extend(photos)
        else:
            gallery[caption] = photos

        print('成功获取相册:{:s}:{:d} photos\n'.format(caption, len(photos)))

    return gallery
|
|
|
|
|
class Thunder():
    """Downloader that stores the photos of a gallery under ``<uid>/<caption>/``.

    ``photo_gallery`` maps an album caption to a list of image URLs. Images
    are downloaded in batches of ``BATCH_SIZE``, either with one thread per
    batch or through a ``threadpool`` pool of ten workers.
    """

    # Number of images handled by one thread / one pool work request.
    BATCH_SIZE = 100

    def __init__(self, photo_gallery):
        """Remember the gallery and create the folder named after ``uid``."""
        self.photo_gallery = photo_gallery

        # exist_ok avoids both the listdir() scan and a crash on a rerun.
        os.makedirs(uid, exist_ok=True)

    def _album_dir(self, caption):
        """Create (if needed) the sub-folder for the album ``caption``."""
        os.makedirs(os.path.join(uid, caption), exist_ok=True)

    def _batch_count(self, photo_list):
        """Return how many batches are needed for ``photo_list``."""
        return int(math.ceil(len(photo_list) / float(self.BATCH_SIZE)))

    def _batch(self, photo_list, index):
        """Return batch number ``index`` (0-based) of ``photo_list``."""
        start = index * self.BATCH_SIZE
        return photo_list[start: start + self.BATCH_SIZE]

    def _report(self):
        """Print the closing summary shared by both download strategies."""
        print('\n下载完成,相片总数{:d}'.format(photo_counter))
        # uid is a string, so it must be formatted with {:s}; {:d} raises
        # ValueError for str arguments.
        print('所有相册存于同目录文件夹{:s}'.format(uid))

    def download(self, parent, img_list, flag=True):
        '''Worker routine: download every image of ``img_list``.

        `img_list` image URLs
        `parent`   album the images belong to, used as folder name
        `flag`     whether to print the current thread's name when done
        '''
        for img_url in img_list:
            try:
                # Name extraction is inside the try so that a URL without a
                # 'large/' part is skipped like any other failed download
                # instead of aborting the rest of the batch.
                img_name = re.findall('large/(.*?)$', img_url)[0]
                download_img(img_url, os.path.join(uid, parent, img_name))
            except Exception as e:
                # Deliberately best-effort: report the failure and continue
                # so one bad image does not terminate the worker.
                print(e)
        if flag:
            print('{:s} 成功下载\n'.format(threading.current_thread().name))

    def download_without_pool(self):
        '''Download without a pool: one thread per batch of images.'''
        # Keep every thread so all of them can be joined at the end.
        thread_list = []

        for caption, photo_list in self.photo_gallery.items():
            self._album_dir(caption)

            threads = self._batch_count(photo_list)
            for i in range(threads):
                th = threading.Thread(
                    target=self.download,
                    args=(caption, self._batch(photo_list, i)),
                    name='相册:{:s}:第{:d}部分(共{:d}部分)'.format(caption, i + 1, threads)
                )
                thread_list.append(th)
                th.start()

        # Wait for every thread to finish.
        for th in thread_list:
            th.join()

        self._report()

    def callback(self, request, result):
        """Completion callback for a pool work request; prints a message.

        Pool workers cannot be given a name, so the information needed for
        the message is attached to the request object instead.

        `request` WorkRequest object carrying `caption`, `part`, `threads`
        `result`  return value of the work function (unused)

        `caption` album name
        `part`    which batch of the album this request covered
        `threads` number of batches the album was split into
        """
        print('相册:{:s}:第{:d}部分(共{:d}部分) 成功下载\n'.format(request.caption, request.part, request.threads))

    def download_with_pool(self):
        """Download through a thread pool with ten worker threads.

        Every batch of images becomes one work request.
        """
        # The pool starts with ten idle workers and an empty work queue.
        pool = threadpool.ThreadPool(10)

        for caption, photo_list in self.photo_gallery.items():
            # Same tolerant folder creation as download_without_pool, so a
            # rerun over existing folders does not raise FileExistsError.
            self._album_dir(caption)

            threads = self._batch_count(photo_list)
            for i in range(threads):
                request = threadpool.WorkRequest(
                    self.download,
                    (caption, self._batch(photo_list, i), False),
                    callback=self.callback)
                # Attach the data the callback needs for its message.
                request.caption = caption
                request.part = i + 1
                request.threads = threads

                pool.putRequest(request)

        pool.wait()  # block until every request has been processed

        self._report()
|
|
|
|
|
def main():
    """Log in, collect the gallery and download it.

    The thread pool is used once there are at least 1000 photos or at least
    10 albums. Without the pool every album gets at least one thread, even
    when it holds fewer than 100 photos, so the album count has to be capped
    as well.
    """
    login_weibo()
    photo_gallery = get_gallery()
    thunder = Thunder(photo_gallery)

    use_pool = photo_counter >= 1000 or len(photo_gallery) >= 10
    start_download = (thunder.download_with_pool if use_pool
                      else thunder.download_without_pool)
    start_download()
|
|
|
|
|
# Script entry point: exactly one command-line argument, the user ID, is
# expected; it is stored in the module-level ``uid`` before crawling starts.
if __name__ == '__main__':
    if len(sys.argv) == 2:
        uid = sys.argv[1]
        main()
    else:
        print('请输入用户ID')