Modified on 11.11, for the purpose of Coupling
# -*- coding: utf-8 -*-
"""
Date 15-11-11, latest progress
"""
import urllib2
from urllib2 import URLError
from BeautifulSoup import BeautifulSoup
import codecs
import json
# Default request headers; the User-Agent mimics desktop Chrome.
header_info = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36'
}
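# Fetch a URL with the headers above; returns the raw urllib2 response
# object, or None when the request fails.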
def get_url_response(url):
    try:
        request = urllib2.Request(url, headers=header_info)
        resp = urllib2.urlopen(request)
    except URLError as e:
        print e.reason
        return None
    return resp
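# Extract the question block from the page. findAll never raises when
# nothing matches, so the fallback is an empty-list check, not an except.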
def parse_question(html):
    """ func work """
    soup = BeautifulSoup(html)
    question = soup.findAll('div', {'class': "uc_q"})
    if not question:
        # fill-in questions use a different class name
        question = soup.findAll('div', {'class': "uc_q_caption"})
    print "parse_question done"
    #print question
    return question
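# Extract the answer markup; depending on the question layout the solution
# sits in either an <li> or a <span>.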
def parse_question_answer(html):
    soup = BeautifulSoup(html)
    # note the leading space in the class name; it matches the site's markup
    find_answer = soup.findAll('li', {'class': " ucqo_g_solution"})
    if not find_answer:
        find_answer = soup.findAll('span', {'class': "ucqo_g_solution"})
    print "FIND ANSWER ", find_answer
    return find_answer
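# Extract the question's "source" field from the span with class "ellipsis".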
def parse_question_otherinfo(r):
    soup = BeautifulSoup(r)
    source = soup.findAll('span', {'class': "ellipsis"})
    print "parse_question_otherinfo done!"
    #print source
    return source
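# Extract the explanation ("dianpin") block: three "clearfix" divs, each
# holding a colored label div plus a "uc_q_object" span with the text.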
# inline style that marks the label div inside each explanation row
ANSWER_TAG_STYLE = ("float:left;width:45px;background: #4cd1d4;color: #fff;"
                    "text-align:center;margin-right:10px;border-radius: 3px;")

def parse_question_explained(html):
    soup = BeautifulSoup(html)
    htmlfind = soup.findAll('div', {'class': "question_part_content"})
    # print "HTML question explained \n", htmlfind
    html_section = htmlfind[1]  # must be [1]: the second block holds the explanation
    question_info = {}
    tag_div_list = html_section.findAll('div', attrs={"class": "clearfix"})
    print len(tag_div_list)
    # 1 (a traceback was once observed here)
    tags = tag_div_list[0].findAll('div', attrs={'style': ANSWER_TAG_STYLE})
    question_info['answer'] = tags[0].text
    question_info['content'] = tag_div_list[0].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    # 2
    tags = tag_div_list[1].findAll('div', attrs={'style': ANSWER_TAG_STYLE})
    question_info['s1'] = tags[0].text
    question_info['c2'] = tag_div_list[1].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    # 3
    tags = tag_div_list[2].findAll('div', attrs={'style': ANSWER_TAG_STYLE})
    question_info['d1'] = tags[0].text
    question_info['f2'] = tag_div_list[2].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    return question_info
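# Crawl one question page: fetch it, run every parser above, and assemble
# the pieces into a dict ready for JSON serialization.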
def crawler_one_page(lele_url):
    response = get_url_response(lele_url)
    if response is None:
        return None
    print "HTTP code is ", response.code
    if response.code != 200:
        return "END: %s" % lele_url
    page_content = response.read()
    question = parse_question(page_content)
    answer = parse_question_answer(page_content)
    print "html answer", answer
    source = parse_question_otherinfo(page_content)
    question_info = parse_question_explained(page_content)
    #print question_info
    # pull plain text out of the parsed tag lists
    question_text = question[0].text
    answer_text = answer[0].text
    source_text = source[0].text
    # assemble the variables into one dict
    dict_to_json = {"Question": question_text,
                    "dianpin": question_info,
                    "source": source_text,
                    "answer": answer_text,
                    }
    return dict_to_json
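# Serialize the crawled dict to "<question id>.json" as UTF-8 JSON.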
def dict_write_fo_json_file(question_url, newdict):
    try:
        with codecs.open("%s.json" % question_url, "w", 'utf-8') as f:
            json.dump(newdict, f, indent=2, ensure_ascii=False)
    except IOError:
        print("Oops, file error...")
def main():
    counts = xrange(493175, 493177)  # question id range; end is exclusive
    for question_url in counts:
        print question_url
        # NOTE: the original URL template was broken ("http://wap.douban.com"
        # % question_url, with no placeholder); the exact question path on
        # the site is unknown, so appending the id here is an assumption.
        lele_url = "http://wap.douban.com/%s" % question_url
        dict_to_json = crawler_one_page(lele_url)
        if dict_to_json:
            dict_write_fo_json_file(question_url, dict_to_json)

if __name__ == '__main__':
    main()
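# Usage sketch (an assumption, not part of the original gist): each run of
# main() writes one "<id>.json" file per crawled question id. One of those
# files could be read back like this; the filename 493175.json is hypothetical.
#
#   with codecs.open("493175.json", "r", "utf-8") as f:
#       data = json.load(f)
#   print data["Question"], data["answer"]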