Skip to content

Instantly share code, notes, and snippets.

@dongweiming
Last active April 15, 2017 07:26
Show Gist options
  • Save dongweiming/92b1d9f4a89f6b4498254c4427b501a1 to your computer and use it in GitHub Desktop.
Save dongweiming/92b1d9f4a89f6b4498254c4427b501a1 to your computer and use it in GitHub Desktop.
知乎API爬虫
# coding=utf-8
import os
import json
import time
import requests
from requests.auth import AuthBase
from settings import LOGIN_URL, CAPTCHA_URL
from config import (
API_VERSION, APP_VERSION, APP_BUILD, UUID, UA, APP_ZA, CLIENT_ID,
TOKEN_FILE)
from utils import gen_login_signature
from exception import LoginException
# Base form fields for the sign-in request; ZhihuClient.login() copies this
# dict and adds the username, password, timestamp and signature to the copy.
LOGIN_DATA = dict(
    grant_type='password',
    source='com.zhihu.ios',
    client_id=CLIENT_ID,
)
class ZhihuOAuth(AuthBase):
    """requests auth hook that adds the Zhihu client headers to a request.

    Without a token the ``Authorization`` header is ``oauth <CLIENT_ID>``
    (used while logging in); with a token it is
    ``<Token_type> <access_token>``.
    """

    def __init__(self, token=None):
        """Store the token (a ZhihuToken, or None before login)."""
        self._token = token

    def __call__(self, r):
        """Set the client and Authorization headers on ``r`` and return it."""
        r.headers['X-API-Version'] = API_VERSION
        # NOTE(review): this header name uses an underscore while its
        # siblings use hyphens -- confirm this is what the API expects.
        r.headers['X-APP_VERSION'] = APP_VERSION
        r.headers['X-APP-Build'] = APP_BUILD
        r.headers['x-app-za'] = APP_ZA
        r.headers['X-UDID'] = UUID
        r.headers['User-Agent'] = UA
        if self._token is None:
            auth_str = 'oauth {client_id}'.format(client_id=CLIENT_ID)
        else:
            # ZhihuToken stores these values as ``token_type`` and
            # ``access_token``; the previous ``.type`` / ``.token`` lookups
            # raised AttributeError.
            auth_str = '{type} {token}'.format(
                type=str(self._token.token_type.capitalize()),
                token=str(self._token.access_token),
            )
        r.headers['Authorization'] = auth_str
        return r
class ZhihuToken:
    """Value object holding the fields of a Zhihu sign-in response.

    The constructor parameters mirror the keys of the JSON dict that
    ``from_dict`` / ``from_file`` expand with ``cls(**json_dict)``.
    """

    def __init__(self, user_id, uid, access_token, expires_in, token_type,
                 refresh_token, cookie, lock_in=None, unlock_ticket=None):
        """Record the token fields and compute the expiry timestamp.

        ``expires_at`` is the creation time (``time.time()``) plus
        ``expires_in``.
        """
        self.create_at = time.time()
        # NOTE(review): ``user_id`` and ``uid`` are stored crossed over, as in
        # the original code; confirm against the API whether this is intended.
        self.user_id = uid
        self.uid = user_id
        self.access_token = access_token
        self.expires_in = expires_in
        # Fixed: this used to read the non-existent ``_create_at`` and
        # ``_expires_in`` attributes and raised AttributeError on every
        # construction.
        self.expires_at = self.create_at + self.expires_in
        self.token_type = token_type
        self.refresh_token = refresh_token
        self.cookie = cookie
        # Not used
        self._lock_in = lock_in
        self._unlock_ticket = unlock_ticket

    @property
    def type(self):
        """Alias of ``token_type`` for callers that read ``.type``."""
        return self.token_type

    @property
    def token(self):
        """Alias of ``access_token`` for callers that read ``.token``."""
        return self.access_token

    @classmethod
    def from_file(cls, filename):
        """Load a token from a JSON file written by ``save_file``."""
        with open(filename) as f:
            return cls.from_dict(json.load(f))

    @staticmethod
    def save_file(filename, data):
        """Dump ``data`` (the raw token dict) to ``filename`` as JSON."""
        with open(filename, 'w') as f:
            json.dump(data, f)

    @classmethod
    def from_dict(cls, json_dict):
        """Build a token from a dict of constructor keyword arguments.

        Raises:
            ValueError: if the dict's keys do not match the constructor
                signature (missing or unexpected fields).
        """
        try:
            return cls(**json_dict)
        except TypeError:
            raise ValueError(
                '"{json_dict}" is NOT a valid zhihu token json.'.format(
                    json_dict=json_dict
                ))
class ZhihuClient:
    """Minimal Zhihu API client that logs in and keeps an authed session.

    On construction it loads a saved token from ``token_file`` if that file
    exists; otherwise it logs in with the given credentials and saves the
    returned JSON to ``token_file``.
    """

    # Scratch file the captcha image is written to. ``login`` referenced a
    # global ``CAPTCHA_FILE`` that is neither imported nor defined in this
    # file (NameError), so the name is defined here on the class.
    CAPTCHA_FILE = 'captcha.gif'

    def __init__(self, username=None, passwd=None, token_file=TOKEN_FILE):
        """Create the session and obtain a token (from file or by login).

        Raises:
            LoginException: if a login is needed and it fails.
        """
        self._session = requests.session()
        # NOTE(review): TLS certificate verification is disabled for every
        # request on this session, including the one that sends the
        # password. Consider removing this line.
        self._session.verify = False
        self.username = username
        self.passwd = passwd
        if os.path.exists(token_file):
            self._token = ZhihuToken.from_file(token_file)
        else:
            self._login_auth = ZhihuOAuth()
            json_dict = self.login()
            ZhihuToken.save_file(token_file, json_dict)
        self._session.auth = ZhihuOAuth(self._token)

    def login(self):
        """Sign in with username/password and return the response dict.

        Sets ``self._token`` as a side effect.

        Raises:
            LoginException: if the server reports an error, the response is
                not valid JSON, or it is not a valid token dict.
        """
        data = LOGIN_DATA.copy()
        data['username'] = self.username
        data['password'] = self.passwd
        gen_login_signature(data)
        if self.need_captcha():
            self._solve_captcha()
        res = self._session.post(LOGIN_URL, auth=self._login_auth, data=data)
        try:
            json_dict = res.json()
            if 'error' in json_dict:
                raise LoginException(json_dict['error']['message'])
            self._token = ZhihuToken.from_dict(json_dict)
            return json_dict
        except (ValueError, KeyError) as e:
            raise LoginException(str(e))

    def _solve_captcha(self):
        """Save the captcha image, ask the user for its text and submit it."""
        captcha_path = os.path.abspath(self.CAPTCHA_FILE)
        # NOTE(review): ``get_captcha`` is not defined anywhere in the visible
        # source; it must be provided (returning the image bytes) for this
        # path to work.
        captcha_image = self.get_captcha()
        with open(captcha_path, 'wb') as f:
            f.write(captcha_image)
        try:
            print('Please open {0} for captcha'.format(captcha_path))
            captcha = input('captcha: ')
        finally:
            # Remove the image even if the prompt is interrupted.
            os.remove(captcha_path)
        res = self._session.post(
            CAPTCHA_URL,
            auth=self._login_auth,
            data={'input_text': captcha}
        )
        try:
            json_dict = res.json()
            if 'error' in json_dict:
                raise LoginException(json_dict['error']['message'])
        except (ValueError, KeyError):
            raise LoginException('Maybe input wrong captcha value')

    def need_captcha(self):
        """Ask the captcha endpoint whether a captcha is required.

        Returns the ``show_captcha`` value of the JSON response.

        Raises:
            LoginException: if the response is not JSON or lacks the key.
        """
        res = self._session.get(CAPTCHA_URL, auth=self._login_auth)
        try:
            j = res.json()
            return j['show_captcha']
        except (ValueError, KeyError):
            # ValueError covers a response body that is not valid JSON.
            raise LoginException('Show captcha fail!')
if __name__ == '__main__':
    # Demo entry point: constructing the client loads a saved token or
    # performs a login with the placeholder credentials below.
    client = ZhihuClient(username='YOUR_USERNAME', passwd='YOUR_PASSWORD')
# Client identification values; ZhihuOAuth sends each one as a request header.
API_VERSION = '3.0.42'  # sent as X-API-Version
APP_VERSION = '3.28.0'  # sent as X-APP_VERSION
APP_BUILD = 'release'  # sent as X-APP-Build
UUID = 'AJDA7XkI9glLBWc85sk-nJ_6F0jqALu4AlY='  # sent as X-UDID
UA = 'osee2unifiedRelease/3.28.0 (iPhone; iOS 10.2; Scale/2.00)'  # User-Agent
APP_ZA = 'OS=iOS&Release=10.2&Model=iPhone8,1&VersionName=3.28.0&VersionCode=558&Width=750&Height='  # sent as x-app-za
# OAuth client id: used in LOGIN_DATA and in the pre-login Authorization header.
CLIENT_ID = '8d5227e0aaaa4797a763ac64e0c3b8'
# HMAC key used by gen_login_signature to sign the login form.
# NOTE(review): credentials are hard-coded in source -- presumably extracted
# from the iOS app; confirm they are safe to publish.
APP_SECRET = b'ecbefbf6b17e47ecb9035107866380'
# Default path where ZhihuClient stores/loads the token JSON.
TOKEN_FILE = 'token.json'
class LoginException(Exception):
    """Raised when signing in to Zhihu fails.

    ``error`` keeps whatever reason was passed in; both ``str()`` and
    ``repr()`` render it as ``Login Fail: <error>``.
    """

    def __init__(self, error):
        self.error = error

    def __str__(self):
        return 'Login Fail: {0}'.format(self.error)

    # repr() and str() intentionally give the same text.
    __repr__ = __str__
# Base URL of the Zhihu API; the endpoint URLs below are built from it.
ZHIHU_API_ROOT = 'https://api.zhihu.com'
# Endpoint that ZhihuClient.login() posts the signed login form to.
LOGIN_URL = '{0}/sign_in'.format(ZHIHU_API_ROOT)
# Endpoint used both to ask whether a captcha is needed and to submit it.
CAPTCHA_URL = '{0}/captcha'.format(ZHIHU_API_ROOT)
import hashlib
import hmac
import time
from config import APP_SECRET
def gen_login_signature(data):
    """Add ``timestamp`` and ``signature`` entries to the login form in place.

    The timestamp is the current Unix time in whole seconds, as a string.
    The signature is the hex HMAC-SHA1, keyed with APP_SECRET, of the
    ``grant_type``, ``client_id``, ``source`` and ``timestamp`` values
    concatenated in that order. ``data`` must already contain the first
    three keys. Returns None.
    """
    data['timestamp'] = str(int(time.time()))
    signed_fields = ('grant_type', 'client_id', 'source', 'timestamp')
    message = ''.join(data[field] for field in signed_fields)
    digest = hmac.new(APP_SECRET, message.encode('utf-8'), hashlib.sha1)
    data['signature'] = digest.hexdigest()
@7sDream
Copy link

7sDream commented Jan 13, 2017

Hi,我是 zhihu-oauth 的作者~

今天在群里通过朋友分享看到了你公众号上的那篇文章,很感谢你帮着宣传了一波我的项目~

说起来你可能不信,我以前在专栏里写过一点项目介绍,然后被删了 =。= 所以说知乎是不是大度的公司还不一定呢……(参见这里

另外开源软件重要的就是合作,虽然这个项目目前几乎都是我一个人开发的,但还是希望更多的人能够参与进来~

如果你对代码风格,项目组织,实现设计或者其他方面有什么建议的话不妨开个 Issue 一起讨论下 OvO,当然能协助下开发就更好啦。

毕竟作为一个还没毕业的学生,对代码和项目的理解都不太深,如果有人能帮忙指点一二也是很感激的。

以上。

再次感谢。

@dongweiming
Copy link
Author

@7sDream, 被你发现了 ( *・ω・)✄╰ひ╯

看来Pythonista还是爬豆瓣比较安全...

平心而论,你写的非常好,无论当时甚至现在的我也未必能写出来这种带感的项目。Python开发者应该是各语言中最烦人的一类,成天在纠结代码规范、Pythonic或者Idiomatic的感觉,尤其是写多了看多了之后渐渐地就会有自己的品味。提个Issue倒不至于,至多算是一些地方的实现和处理方式和我不同,这是非常主观的感觉,无所谓对错 - 编程的乐趣之一就是有无数种方法去完成一个目标。

说具体了, 就是感觉由于炫技增加了很多不环保的代码,也大幅地提高了项目的阅读复杂度。我比较喜欢简单粗暴,因为平时工作中别人用了复杂的表达方法会增加我维护的成本,本着己所不欲勿施于人的原则。btw. 我以前也喜欢炫技,以别人看不懂,只能我来改为傲。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment