Created
March 4, 2015 15:31
-
-
Save brickgao/026ab0fc2c8cf6534b9e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- encoding: utf-8 -*- | |
import requests | |
import os | |
def get_page(lower_bound=1, upper_bound=1000000, timeout=10):
    """Download Baidu Zhidao question pages and save them as UTF-8 HTML.

    For every question number in ``range(lower_bound, upper_bound)`` the page
    ``http://zhidao.baidu.com/question/<n>.html`` is requested. A response
    with status 200 is decoded from GBK and written, re-encoded as UTF-8, to
    ``./pages/<n>.html``. A 404 status, or a body containing the site's
    "this question may no longer be valid" notice, is reported as not found;
    any other status is reported as a request error.

    Args:
        lower_bound: First question number to fetch (inclusive).
        upper_bound: Question number to stop at (exclusive).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Progress is printed to stdout.

    Raises:
        OSError: If ``./pages`` cannot be created and is not already a
            directory, or a page file cannot be written.
    """
    # Create the output directory up front. Attempting the mkdir and
    # tolerating "already exists" avoids the check-then-create race.
    try:
        os.mkdir('./pages')
    except OSError:
        if not os.path.isdir('./pages'):
            raise

    base_url = 'http://zhidao.baidu.com/question/'
    for question_num in range(lower_bound, upper_bound):
        url = base_url + '%d.html' % question_num
        try:
            # Without a timeout a single stalled connection blocks the crawl.
            r = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # A network failure on one page must not abort the whole run.
            print('Request Error, %s' % exc)
            continue

        # Pages are served as GBK; replace undecodable bytes rather than
        # raising so one malformed page cannot stop the loop.
        _text = r.content.decode('gbk', 'replace')
        if r.status_code == 404 or u'该问题可能已经失效' in _text:
            print('Question No. %d not found' % question_num)
        elif r.status_code == 200:
            # Binary mode: we are writing already-encoded UTF-8 bytes.
            with open('./pages/%d.html' % question_num, 'wb') as file_saved:
                file_saved.write(_text.encode('utf-8'))
            print('Save Question No. %d as ./pages/%d.html'
                  % (question_num, question_num))
        else:
            print('Request Error, code %d' % r.status_code)
# Script entry point: fetch question numbers 1 through 99.
if __name__ == '__main__':
    get_page(lower_bound=1, upper_bound=100)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment