Created
October 21, 2015 08:57
-
-
Save swshan/4b7562d09a64da59e1a3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding:utf-8 -*- | |
import sys | |
import re | |
import requests | |
import gevent | |
from bs4 import BeautifulSoup | |
import urlparse | |
import time | |
import json | |
# Python 2 hack: reload() restores the setdefaultencoding attribute that
# site.py deletes at startup, so implicit str<->unicode conversions use
# UTF-8 instead of ASCII.
# NOTE(review): sys.setdefaultencoding is discouraged and Python-2-only;
# kept because the rest of this script relies on it.
reload(sys)
sys.setdefaultencoding('utf-8')

# Browser-like User-Agent so the target site serves the normal page.
# (The original `global header_info` statement was a no-op: `global` has
# no effect at module scope, so it is removed.)
header_info = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36'
}
# root_url = 'http://wap.douban.com' | |
def crawler(list_url): | |
''' ''' | |
lele_url = "http://m.leleketang.com/lib/%s.shtml" % list_url | |
r = requests.get(lele_url, timeout = 0.2, headers=header_info) | |
print lele_url | |
print "http status_code is " + str(r.status_code) | |
''' soup extract html ''' | |
soup = BeautifulSoup(r.text, 'html.parser',from_encoding="utf-8" ) | |
question = soup.findAll('div', attrs={'class':"uc_q"}) | |
print "beautifulsoup " + str(soup.original_encoding) | |
raw_answer = soup.findAll('li', attrs={'class':" ucqo_g_solution"}) | |
''' dict str convert ''' | |
question = str(question) | |
raw_answer = str(raw_answer) | |
#print extract_2 | |
""" to dict """ | |
newdict = {"Question": question, | |
"Answer": raw_answer,} | |
print type(newdict) | |
print newdict | |
''' | |
repr(extract_2) | |
json_data_2 = json.dumps(extract_2) | |
print json_data_2 | |
''' | |
try: | |
with open("%s.json" % list_url, "wb") as f: | |
json.dump(newdict, f, indent=4, encoding="UTF-8", ensure_ascii=False) | |
except IOError: | |
print("Oops, file error...") | |
def do(counts):
    """Greenlet entry point: crawl a single hard-coded page.

    NOTE(review): the `counts` argument is currently ignored — the caller
    spawns this with the literal string "counts" — so only page 493174 is
    crawled.  The commented-out code in the original showed the intent to
    walk range(493174, 493183); wiring that up would require the caller
    to pass a real iterable.  The dead local `counts = 493174` (which
    shadowed the unused parameter) has been removed.
    """
    crawler(493174)
# Kick off the crawl in a gevent greenlet.  Guarded so that importing
# this module does not start network traffic — only direct execution
# does.  (A single greenlet gains nothing over a plain call, but the
# gevent structure is kept for when `do` fans out over many pages.)
if __name__ == "__main__":
    gevent.joinall([
        gevent.spawn(do, "counts"),
    ])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment