Created
November 2, 2014 13:09
-
-
Save dragonly/7513e9c007052625d242 to your computer and use it in GitHub Desktop.
weibo crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#coding=utf8
'''
Created on Mar 18, 2013
@author: yoyzhou
'''
'''
Updated on April 16, 2014
@author: wanghaisheng
'''
try: | |
import os | |
import sys | |
import urllib | |
import urllib2 | |
import cookielib | |
import base64 | |
import re | |
import hashlib | |
import json | |
import rsa | |
import binascii | |
import getpass | |
except ImportError: | |
print >> sys.stderr, """\ | |
There was a problem importing one of the Python modules required. | |
The error leading to this problem was: | |
%s | |
Please install a package which provides this module, or | |
verify that the module is installed correctly. | |
It's possible that the above module doesn't match the current version of Python, | |
which is: | |
%s | |
""" % (sys.exc_info(), sys.version) | |
sys.exit(1) | |
# Program metadata: name, author's site, author's Weibo handle, script version.
__prog__ = "weibo_login"
__site__ = "http://yoyzhou.github.com"
__weibo__ = "@pigdata"
__version__ = "0.1"
def get_prelogin_status(username):
    """
    Perform prelogin action, get prelogin status, including servertime, nonce, rsakv, etc.

    Requests the Sina SSO prelogin URL for ``username`` and parses the JSONP
    style reply, i.e. the JSON object found between the outermost parentheses.

    @param username: login user name; it is encoded with get_user() before
        being placed in the query string
    @return: a ``(servertime, nonce, rsakv)`` tuple (servertime converted with
        str()), or None when the reply cannot be parsed
    @raise: errors raised by urllib2.urlopen() itself are not caught here
    """
    prelogin_url = (
        'http://login.sina.com.cn/sso/prelogin.php?entry=weibo'
        '&callback=sinaSSOController.preloginCallBack&su=' + get_user(username) +
        '&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.18)'
    )

    # Close the HTTP response explicitly instead of leaving it to the GC.
    response = urllib2.urlopen(prelogin_url)
    try:
        data = response.read()
    finally:
        response.close()

    # The payload looks like callback({...}); keep only what is inside (...).
    match = re.search(r'\((.*)\)', data)
    if match is None:
        print('Getting prelogin status met error!')
        return None

    try:
        status = json.loads(match.group(1))
        return str(status['servertime']), status['nonce'], status['rsakv']
    except (ValueError, KeyError, TypeError):
        # ValueError: payload is not valid JSON; KeyError/TypeError: the JSON
        # does not have the expected fields. Anything else (KeyboardInterrupt,
        # SystemExit, ...) is deliberately allowed to propagate.
        print('Getting prelogin status met error!')
        return None
def login(username, pwd, cookie_file):
    """
    Login with user name, password and cookies.

    (1) If the cookie file exists then try to load cookies from it and install
        them as the default urllib2 opener;
    (2) If there is no cookie file, or it cannot be loaded, then do login.

    Note that the cookies are loaded with ignore_expires=True and no request
    is made here to check that they are still accepted by the server.

    @param username: login user name
    @param pwd: login password
    @param cookie_file: file name the cookies are loaded from / saved to
    @return: 1 when cookies were loaded from cookie_file, otherwise whatever
        do_login() returns
    """
    if os.path.exists(cookie_file):
        cookie_jar = cookielib.LWPCookieJar(cookie_file)
        try:
            cookie_jar.load(ignore_discard=True, ignore_expires=True)
        except (cookielib.LoadError, IOError):
            # LoadError: the file is not a valid LWP cookie file.
            # IOError: the file exists but cannot be read.
            # Either way fall through to a fresh login below.
            print('Loading cookies error')
        else:
            # install loaded cookies for urllib2
            cookie_support = urllib2.HTTPCookieProcessor(cookie_jar)
            opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
            urllib2.install_opener(opener)
            print('Loading cookies success')
            return 1

    # No usable cookies found
    return do_login(username, pwd, cookie_file)
def do_login(username, pwd, cookie_file):
    """
    Perform login action with user name, password and saving cookies.

    Installs a fresh cookie-aware urllib2 opener, fetches the prelogin values,
    POSTs the login form, follows the redirect URL found in the reply and
    checks the feedback JSON. Failures after the form has been built are
    reported through the return value rather than raised.

    @param username: login user name
    @param pwd: login password
    @param cookie_file: file name where to save cookies when login succeeded
    @return: 1 when the feedback reports a true 'result' (cookies are saved to
        cookie_file in that case), otherwise 0
    """
    # POST data per LOGIN WEIBO, these fields can be captured using httpfox extension in Firefox
    login_data = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'userticket': '1',
        'pagerefer': '',
        'vsnf': '1',
        'su': '',
        'service': 'miniblog',
        'servertime': '',
        'nonce': '',
        'pwencode': 'rsa2',
        'rsakv': '',
        'sp': '',
        'encoding': 'UTF-8',
        'prelt': '45',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META',
    }

    # Start from an empty cookie jar and make it the default urllib2 opener so
    # every request below (including the prelogin one) shares the same cookies.
    cookie_jar2 = cookielib.LWPCookieJar()
    cookie_support2 = urllib2.HTTPCookieProcessor(cookie_jar2)
    opener2 = urllib2.build_opener(cookie_support2, urllib2.HTTPHandler)
    urllib2.install_opener(opener2)
    login_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'

    try:
        prelogin = get_prelogin_status(username)
    except Exception:
        # e.g. the prelogin request itself failed
        return 0
    if prelogin is None:
        # the prelogin reply could not be parsed
        return 0
    servertime, nonce, rsakv = prelogin

    # Fill POST data
    print('starting to set login_data')
    login_data['servertime'] = servertime
    login_data['nonce'] = nonce
    login_data['su'] = get_user(username)
    login_data['sp'] = get_pwd_rsa(pwd, servertime, nonce)
    login_data['rsakv'] = rsakv
    post_body = urllib.urlencode(login_data)

    http_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0'}
    req_login = urllib2.Request(
        url=login_url,
        data=post_body,
        headers=http_headers
    )

    try:
        text = urllib2.urlopen(req_login).read()

        # Search login redirection URL.
        # The reply (as observed with httpfox) reads location.replace('http://weibo.com...
        # with SINGLE quotes around the URL; an earlier version of this pattern
        # matched double quotes and therefore never found the redirect URL.
        redirect_url = re.search(r"location\.replace\('(.*?)'\)", text).group(1)
        data = urllib2.urlopen(redirect_url).read()

        # Verify login feedback, check whether result is TRUE
        feedback = re.search(r'feedBackUrlCallBack\((.*)\)', data, re.MULTILINE).group(1)
        feedback_json = json.loads(feedback)
        if feedback_json['result']:
            cookie_jar2.save(cookie_file, ignore_discard=True, ignore_expires=True)
            return 1
        return 0
    except Exception:
        # Best effort: any network, parsing or cookie-saving problem counts as
        # a failed login. Unlike a bare "except:", this does not swallow
        # KeyboardInterrupt or SystemExit.
        return 0
def get_pwd_wsse(pwd, servertime, nonce):
    """
    Get wsse encrypted password.

    Computes sha1(sha1(sha1(pwd)) + servertime + nonce), where every
    intermediate digest is the 40 character hexadecimal string.

    @param pwd: login password (text or byte string)
    @param servertime: server time string from the prelogin status
    @param nonce: nonce string from the prelogin status
    @return: hexadecimal SHA-1 digest string
    """
    def _sha1_hex(value):
        # hashlib hashes bytes. Encode text explicitly as UTF-8 instead of
        # relying on an implicit ASCII conversion, which fails for
        # non-ASCII passwords.
        if not isinstance(value, bytes):
            value = value.encode('utf-8')
        return hashlib.sha1(value).hexdigest()

    return _sha1_hex(_sha1_hex(_sha1_hex(pwd)) + servertime + nonce)
def get_pwd_rsa(pwd, servertime, nonce, rsa_n=None, rsa_e=65537):
    """
    Get rsa2 encrypted password, using RSA module from https://pypi.python.org/pypi/rsa/3.1.1, documents can be accessed at
    http://stuvel.eu/files/python-rsa-doc/index.html

    The encrypted message is servertime + TAB + nonce + NEWLINE + pwd.

    @param pwd: login password
    @param servertime: server time from the prelogin status
    @param nonce: nonce from the prelogin status
    @param rsa_n: n parameter (modulus) of the RSA public key as a hex string;
        when omitted, the key published by WEIBO.COM at the time of writing
        is used
    @param rsa_e: exponent parameter of the RSA public key; WEIBO uses
        0x10001, which is 65537 in decimal
    @return: the encrypted message as a hex string
    """
    if rsa_n is None:
        # Hard-coded default; per the original author the value can also be
        # found in what the prelogin status returns, so callers may pass it in.
        rsa_n = 'EB2A38568661887FA180BDDB5CABD5F21C7BFD59C090CB2D245A87AC253062882729293E5506350508E7F9AA3BB77F4333231490F915F6D63C55FE2F08A49B353F444AD3993CACC02DB784ABBB8E42A9B1BBFFFB38BE18D78E87A0E41B9B8F73A928EE0CCEE1F6739884B9777E4FE9E88A1BBE495927AC4A799B3181D6442443'

    message = str(servertime) + '\t' + str(nonce) + '\n' + str(pwd)
    if not isinstance(message, bytes):
        # rsa.encrypt works on bytes; a no-op where str is already bytes
        message = message.encode('utf-8')

    # construct WEIBO RSA public key using n and e above, note that n is a hex string
    key = rsa.PublicKey(int(rsa_n, 16), rsa_e)
    # get encrypted password
    encrypted_pwd = rsa.encrypt(message, key)
    # turn encrypted password binaries back into a hex string
    return binascii.b2a_hex(encrypted_pwd)
def get_user(username):
    """
    Encode the user name for the login requests: URL-quote it, then base64
    encode the quoted text.

    @param username: login user name
    @return: base64 string without any line breaks
    """
    # Local import so the function works whether quote() lives in urllib
    # (Python 2) or urllib.parse (Python 3).
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    quoted = quote(username)
    if not isinstance(quoted, bytes):
        # quote() output is plain ASCII
        quoted = quoted.encode('ascii')

    # b64encode never inserts line breaks. encodestring()[:-1] only removed
    # the final newline and left the ones added every 76 characters, which
    # corrupted the value for long user names.
    encoded = base64.b64encode(quoted)
    if not isinstance(encoded, str):
        encoded = encoded.decode('ascii')
    return encoded
if __name__ == '__main__':
    # Script entry point: log in, fetch one "follow" page and print the fields
    # of every follow item found on it.

    # The account name may be given as the first command line argument; without
    # one the original hard-coded account is used.
    username = sys.argv[1] if len(sys.argv) > 1 else '18817583755'
    pwd = getpass.getpass()
    cookie_file = 'weibo_login_cookies.dat'

    if login(username, pwd, cookie_file):
        print('Login WEIBO succeeded')
        html = urllib2.urlopen('http://weibo.com/p/1035051708942053/follow?page=5').read()

        # Verbose (re.X) pattern: unescaped whitespace and "# ..." comments
        # inside it are ignored by the regex engine.
        pFollowItem = r"""
            <li\ class=\\"follow_item[\s\S]*? # beginning of a fan
            uid=(?P<uid>\d+)& # uid
            fnick=(?P<nickname>[^&]+)& # nickname
            sex=(?P<gender>[^\\]+)[\s\S]*? # gender
            (?:
            (?P<approved>微博个人认证) # approved person
            |
            (?P<approved_co>微博机构认证) # approved company
            )?[\s\S]*?
            关注\ <em[^>]+?><a[^>]+?>(?P<follwing>\d+)[\s\S]*? # following
            粉丝<em[^>]+?><a[^>]+?>(?P<fans>\d+)[\s\S]*? # fans number
            微博<em[^>]+?><a[^>]+?>(?P<weibo>\d+)[\s\S]*? # weibo number
            地址<\\/em><span>(?P<address>[^<]+)[\s\S]*? # weibo number
            info_intro\\"><span>(?P<introduction>[^<]+)[\s\S]*?
            <\\/li> # end of a fan
            """
        follow_item_re = re.compile(pFollowItem, re.X)

        for match in follow_item_re.finditer(html):
            # One "name :  value" line per named group; optional groups that
            # did not take part in the match are printed as None.
            for key, value in match.groupdict().items():
                print('%s :  %s' % (key, value))
            print('-' * 10)
    else:
        print('Login WEIBO failed')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment