Skip to content

Instantly share code, notes, and snippets.

@YieldNull
Last active July 14, 2021 04:31
Show Gist options
  • Save YieldNull/96036689bc832d09bc3c to your computer and use it in GitHub Desktop.
Save YieldNull/96036689bc832d09bc3c to your computer and use it in GitHub Desktop.
爬取新浪微博中指定用户所有相册图片,并保存到本地

功能

给定用户的ID(微相册的数字ID,非昵称),下载用户相册中所有图片到本地

所需模块

rsa,threadpool

登录

请先更改login_weibo中的用户名及密码

用法

在Ubuntu 14.04平台下

$ virtualenv venv
$ source venv/bin/activate
$ pip install threadpool
$ pip install rsa
$ deactivate
$ venv/bin/python weitu.py <UserId>

说明

下载完成后,图片保存在当前目录下以uid为名的文件夹中

Update 2017.08.31

  • python2 to python3

Update 2015.11.13

  • 增加Readme
  • 修复上传代码时的错误
  • 下载图片时添加Header
#!/usr/bin/env python3
'''
Created on Apr 4, 2015
爬取新浪微博中指定用户所有相册图片,并保存到本地
从命令行中读取所要爬取的userID
requirements : threadpool, rsa
'''
import re
import base64
import rsa
import binascii
import math
import json
import sys
import os
import threading
import threadpool
import urllib.request
from http.cookiejar import CookieJar
from urllib.request import urlopen
from urllib.parse import urlencode, quote
# Cookie jar that stores the cookies received from the server
cookie = CookieJar()

# Custom opener bound to the cookie jar through an HTTPCookieProcessor
opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(cookie)
)

# Headers attached to the opener (a browser-like User-Agent)
opener.addheaders = [
    (
        'User-Agent',
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Ubuntu Chromium/40.0.2214.111 Chrome/40.0.2214.111 Safari/537.36',
    ),
]

# Install the opener globally so every urlopen() call uses this configuration
urllib.request.install_opener(opener)

uid = ''  # Global: ID of the user being crawled (set from the command line)
photo_counter = 0  # Global: running total of photos found
def get_data(url):
    """Send a GET request to ``url`` and return the body decoded as UTF-8.

    The response is used as a context manager so the underlying connection
    is closed as soon as the body has been read, instead of being left
    open until garbage collection.
    """
    with urlopen(url) as response:
        return response.read().decode('utf-8')
def post_data(url, param):
    """Send a POST request to ``url`` and return the raw response bytes.

    ``param`` is URL-encoded and sent as the UTF-8 encoded request body.
    The response is closed as soon as the body has been read.
    """
    payload = urlencode(param).encode('utf-8')
    with urlopen(url, payload) as response:
        return response.read()
def download_img(url, file):
    """Download ``url`` and save it at path ``file`` unless it already exists.

    ``urlopen`` is used rather than ``urlretrieve`` because the request has
    to carry the 'User-Agent' header.

    The body is fetched completely before anything is written, and is then
    written to a temporary ``<file>.part`` that is renamed into place. A
    failed download or write therefore never leaves an empty or truncated
    ``file`` behind, which the existence check would otherwise skip on every
    later run.

    Raises whatever ``urlopen`` or the file operations raise.
    """
    if os.path.exists(file):
        return

    print(url)
    with urlopen(url) as response:
        content = response.read()

    partial = os.fspath(file) + '.part'
    try:
        with open(partial, 'wb') as f:
            f.write(content)
        os.replace(partial, file)
    except OSError:
        # Best-effort cleanup of the temporary file, then report the failure.
        if os.path.exists(partial):
            os.remove(partial)
        raise
def login_weibo(user=YOUR_USER_NAME, pwd=YOUR_PASSWD):
    """Log in to Weibo so that later requests carry the session cookies.

    Steps:
      1. Pre-login request to obtain ``servertime``, ``nonce``, ``pubkey``
         and ``rsakv``.
      2. RSA-encrypt ``servertime``, ``nonce`` and the password with the
         returned public key.
      3. POST the login form.
      4. GET the redirect URL found in the login response (without this
         extra request the login does not take effect).

    ``user`` / ``pwd`` are the account name and password. The defaults
    ``YOUR_USER_NAME`` / ``YOUR_PASSWD`` are placeholders that are not
    defined in the visible source and must be replaced before running.

    Raises:
        RuntimeError: if an expected field is missing from a server
            response (for example because the login was rejected).
    """

    def first_match(pattern, text, what):
        # Return the first captured group, or fail with a readable message
        # instead of a bare IndexError.
        found = re.search(pattern, text)
        if found is None:
            raise RuntimeError(
                'Weibo login failed: could not find {} in the server response'
                .format(what))
        return found.group(1)

    # Pre-login request to obtain the encryption parameters
    print('正在登录......')
    prelogin_url = (
        'http://login.sina.com.cn/sso/prelogin.php?'
        'entry=weibo&callback=sinaSSOController.preloginCallBack'
        '&su={:s}&rsakt=mod&client=ssologin.js(v1.4.18)&_=1428121516573'
        .format(user))
    preLogin = get_data(prelogin_url)

    # Extract the parameters with regular expressions
    servertime = first_match(r'"servertime":(.*?),', preLogin, 'servertime')
    pubkey = first_match(r'"pubkey":"(.*?)",', preLogin, 'pubkey')
    rsakv = first_match(r'"rsakv":"(.*?)",', preLogin, 'rsakv')
    nonce = first_match(r'"nonce":"(.*?)",', preLogin, 'nonce')

    # Encode the user name and RSA-encrypt the password
    su = base64.b64encode(bytes(quote(user), 'utf-8')).decode("utf-8")
    rsaPublickey = int(pubkey, 16)
    key = rsa.PublicKey(rsaPublickey, 65537)
    message = bytes('{:s}\t{:s}\n{:s}'.format(servertime, nonce, pwd), 'utf-8')
    sp = binascii.b2a_hex(rsa.encrypt(message, key)).decode("utf-8")

    # POST parameters
    param = {
        'entry': 'weibo', 'gateway': 1, 'from': '',
        'savestate': 7, 'useticket': 1,
        'pagerefer': 'http://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl%3D',
        'vsnf': 1, 'su': su, 'service': 'miniblog',
        'servertime': servertime, 'nonce': nonce,
        'pwencode': 'rsa2', 'rsakv': rsakv,
        'sp': sp, 'sr': '1680*1050',
        'encoding': 'UTF-8', 'prelt': 961,
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack'
    }

    # Submit the login form
    result = post_data(
        'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)', param).decode('gbk')

    # The response contains a redirect that has to be followed with a GET,
    # otherwise the login does not take effect
    urll = first_match(r"location\.replace\('(.*?)'\);", result, 'the redirect URL')
    get_data(urll)
    print('登录成功\n')
def decode_gallery_json(album_list, page):
    """Fetch one page of the album listing and append its albums.

    The album page is populated by JavaScript from a JSON endpoint, so the
    JSON is requested directly. ``count=100`` is used to reduce the number
    of requests.

    Each album is appended to ``album_list`` as a dict with the keys
    ``album_id``, ``caption`` and ``type``.

    Returns the ``total`` field of the response (the overall number of
    albums), which the caller uses to work out how many pages to fetch.
    """
    endpoint = 'http://photo.weibo.com/albums/get_all?uid={:s}&page={:d}&count=100'
    payload = json.loads(get_data(endpoint.format(uid, page)))['data']

    # Keep only the fields needed later on
    album_list.extend(
        {
            'album_id': entry['album_id'],
            'caption': entry['caption'],
            'type': entry['type'],
        }
        for entry in payload['album_list']
    )
    return payload['total']
def decode_album_json(album_id, photo_list, page, theType):
    """Fetch one page of an album's photo listing and collect image URLs.

    The URL of every photo on the page (``pic_host`` + '/large/' +
    ``pic_name``) is appended to ``photo_list``.

    Returns the ``total`` field of the response (the number of photos in
    the album).
    """
    endpoint = ('http://photo.weibo.com/photos/get_all'
                '?uid={:s}&album_id={:s}&count=100&page={:d}&type={:s}')
    request_url = endpoint.format(uid, album_id, page, str(theType))
    payload = json.loads(get_data(request_url))['data']

    # Store the image URLs directly
    photo_list.extend(
        photo['pic_host'] + '/large/' + photo['pic_name']
        for photo in payload['photo_list']
    )
    return payload['total']
def get_gallery():
    """Collect every album of the user together with its photo URLs.

    Returns a dict mapping album caption -> list of image URLs. Albums that
    share a caption are merged into one entry; empty albums are left out.
    The global ``photo_counter`` is increased by each album's photo total.
    """
    global photo_counter

    def page_count(total):
        # Number of 100-item pages needed for ``total`` items
        return int(math.ceil(total / 100.0))

    gallery = {}  # caption -> image URLs

    # --- albums ---
    print('正在获取相册信息......')
    albums = []
    album_total = decode_gallery_json(albums, 1)
    # Page 1 is already fetched; walk the remaining pages from last to second
    for page in range(page_count(album_total), 1, -1):
        decode_gallery_json(albums, page)
    print('成功获取相册信息\n')

    # --- photos ---
    for album in albums:
        caption = album['caption']  # album title
        album_id = album['album_id']
        theType = album['type']  # value of the ``type`` URL parameter
        print('正在获取相册:', caption, '......')

        urls = []
        photo_total = decode_album_json(album_id, urls, 1, theType)
        pages = page_count(photo_total)
        photo_counter += photo_total

        if pages == 0:  # skip empty albums
            print('成功获取相册:相册为空\n')
            continue

        # Page 1 is already fetched; walk the remaining pages from last to second
        for page in range(pages, 1, -1):
            print(' --剩余{:d}{:s}'.format(page - 1, '00张......'))
            decode_album_json(album_id, urls, page, theType)

        # Merge albums that share the same caption
        existing = gallery.get(caption)
        if existing is None:
            gallery[caption] = urls
        else:
            existing.extend(urls)
        print('成功获取相册:{:s}:{:d} photos\n'.format(caption, len(urls)))

    return gallery
class Thunder():
    """Downloader that saves every photo of a gallery to the local disk.

    Photos are stored under ``<uid>/<album caption>/<image name>`` relative
    to the current directory, where ``uid`` is the module-level user ID.
    """

    def __init__(self, photo_gallery):
        """Keep the gallery (caption -> list of image URLs) and create the
        top-level folder named after ``uid``."""
        self.photo_gallery = photo_gallery
        # exist_ok makes re-runs work when the folder is already there
        os.makedirs(uid, exist_ok=True)

    def download(self, parent, img_list, flag=True):
        """Work done by one worker: download the images in ``img_list``.

        `img_list` list of image URLs
        `parent` album the images belong to, used as the folder name
        `flag` whether to print the current thread's name when finished
        """
        for img_url in img_list:
            img_name = re.findall(r'large/(.*?)$', img_url)[0]
            try:
                download_img(img_url, os.path.join(uid, parent, img_name))
            except Exception as e:
                # Deliberately best-effort: report the failure and go on
                # with the next image so the worker is not interrupted.
                print(e)
        if flag:
            print('{:s} 成功下载\n'.format(threading.current_thread().name))

    def download_without_pool(self):
        """Download without a thread pool: one thread per 100 images."""
        # Keep every thread so that all of them can be joined at the end
        thread_list = []
        for caption, photo_list in self.photo_gallery.items():
            # Sub-folder named after the album
            os.makedirs(os.path.join(uid, caption), exist_ok=True)
            threads = int(math.ceil(len(photo_list) / 100.0))
            for i in range(threads):
                th = threading.Thread(
                    target=self.download,
                    args=(caption, photo_list[i * 100: (i + 1) * 100]),
                    name='相册:{:s}:第{:d}部分(共{:d}部分)'.format(caption, i + 1, threads)
                )
                thread_list.append(th)
                th.start()
        # Wait for every thread to finish
        for th in thread_list:
            th.join()
        print('\n下载完成,相片总数{:d}'.format(photo_counter))
        print('所有相册存于同目录文件夹{:s}'.format(uid))

    def callback(self, request, result):
        """Callback run when a pool task finishes; prints a progress message.

        Needed because worker threads of the pool cannot be given names.

        `request` the WorkRequest object handed back by the pool
        `result` return value of the work function
        `caption`, `part`, `threads` are attached to the request object:
        `caption` album name
        `part` which 100-image slice of the album this task covered
        `threads` how many slices the album has in total
        """
        print('相册:{:s}:第{:d}部分(共{:d}部分) 成功下载\n'.format(request.caption, request.part, request.threads))

    def download_with_pool(self):
        """Download with a thread pool of ten workers.

        Every slice of 100 images is one work request.
        """
        # The pool starts with an empty queue; requests are added below
        pool = threadpool.ThreadPool(10)
        for caption, photo_list in self.photo_gallery.items():
            # Sub-folder named after the album; exist_ok avoids a
            # FileExistsError when the script is run again
            os.makedirs(os.path.join(uid, caption), exist_ok=True)
            threads = int(math.ceil(len(photo_list) / 100.0))
            for i in range(threads):
                # One request per 100 images
                request = threadpool.WorkRequest(self.download,
                                                 (caption,
                                                  photo_list[i * 100: (i + 1) * 100], False),
                                                 callback=self.callback)
                # Attach the information the callback prints
                request.caption = caption
                request.part = i + 1
                request.threads = threads
                pool.putRequest(request)
        pool.wait()  # wait until all work is done
        print('\n下载完成,相片总数{:d}'.format(photo_counter))
        # uid is a string, so it has to be formatted with {:s}, not {:d}
        print('所有相册存于同目录文件夹{:s}'.format(uid))
def main():
    """Log in, collect the gallery and download it.

    The thread pool is used once there are at least 1000 photos or at least
    10 albums. Without the pool every album gets at least one thread even
    when it holds fewer than 100 photos, so the number of albums has to be
    limited as well.
    """
    login_weibo()
    gallery = get_gallery()
    downloader = Thunder(gallery)

    use_pool = photo_counter >= 1000 or len(gallery) >= 10
    start = downloader.download_with_pool if use_pool else downloader.download_without_pool
    start()
if __name__ == '__main__':
    # Exactly one command-line argument is expected: the user ID to crawl
    if len(sys.argv) == 2:
        uid = sys.argv[1]
        main()
    else:
        print('请输入用户ID')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment