# -*- coding:utf-8 -*-
"""
Jan 2016 - latest progress
"""
from __future__ import print_function
# Note: this is Python 2 code (urlparse, urllib2); on Python 3 the equivalents
# live in urllib.parse and urllib.request.
import gevent
from gevent import monkey
# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()

import time
import sys
import re
import random
import codecs
import urlparse
import urllib2    # needed by get_url_response() below
import requests
import simplejson as json
from bs4 import BeautifulSoup
#import grequests
#from cache_requests import requests

def find_page(html_page):
    """Collect the per-question URLs linked from one listing page."""
    question_url = "http://www.leleketang.com/lib/"
    soup = BeautifulSoup(html_page, "html5lib")
    # build a list of absolute question URLs from the "to_view" links
    url_list = []
    for link in soup.find_all('a', {"class": "to_view"}):
        var = urlparse.urljoin(question_url, link.get('href'))
        url_list.append(var)
    print(url_list)
    return url_list

def get_url_response(url):
    """urllib2-based fetch, kept for reference; the main flow uses get_requests()."""
    request = urllib2.Request(url)
    request.add_header('Referer', "https://www.google.com.hk/#newwindow=1&safe=strict&q=%E4%B9%90%E4%B9%90%E8%AF%BE%E5%A0%82")
    request.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 5.1.1; Nexus 5 Build/LMY48B; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/43.0.2357.65 Mobile Safari/537.36')
    try:
        resp = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        print("get url func http status code wrong", e.code)  # http status code
        print(e.reason)
        return
    if resp.code != 200:
        return "END: %s" % url  # change unknown
    return resp

def get_requests(url):
    """Fetch a URL with requests; return the page text, or None on a non-200 status."""
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8) AppleWebKit/536.25 (KHTML, like Gecko) Version/6.0 Safari/536.25'}
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    if r.status_code != 200:
        print(" +++++++++++++ http status error END %s" % url)
        return None
    return r.text

def crawler_one_page(dig_url):
    """Download one question page and parse it into a dict ready to dump as JSON."""
    response = get_requests(dig_url)
    print(dig_url)
    if response:
        page_content = response
    else:
        return None  # http status error, nothing to parse
    question = parse_question(page_content)
    answer = parse_question_answer(page_content)
    source = parse_question_otherinfo(page_content)
    question_info = parse_question_explained(page_content)
    # parsing done, now build the dict; sometimes the question list comes back empty
    try:
        question_text = question[0].text
    except IndexError:
        print(" question body not found, must be something wrong ")
        return None
    source_text = source[0].text
    if not answer:
        # the explanation-only case has no separate answer node
        dict_to_json = {"Question": question_text,
                        "dianpin": question_info,
                        "source": source_text,
                        }
    else:
        answer_text = answer[0].text
        dict_to_json = {"Question": question_text,
                        "dianpin": question_info,
                        "source": source_text,
                        "answer": answer_text,
                        }
    return dict_to_json

def parse_question(html):
    """Extract the question caption block from the page."""
    soup = BeautifulSoup(html, "html5lib")
    # for fill-in-the-blank questions the caption div carries the question text
    question = soup.findAll('div', {'class': "uc_q_caption"})
    print("parse_ques done")
    return question

def parse_question_answer(html):
    """Extract the answer nodes; falls back to the blank-style span for the other question type."""
    soup = BeautifulSoup(html, "html5lib")
    print("HTML with answer ")
    # note: this class string really does start with a space
    find_answer = soup.findAll('li', {'class': " ucqo_g_solution"})
    if not find_answer:
        # the other case: the answer lives in a span instead of a list item
        find_answer = soup.findAll('span', {'class': "uc_q_object ucqo_text ucqo_g_blank ucqo_g_solution"})
    return find_answer

def parse_question_otherinfo(r):
    """Extract the question source line (the '.ellipsis' span)."""
    soup = BeautifulSoup(r, "html5lib")
    source = soup.findAll('span', {'class': "ellipsis"})
    print("parse_question_otherinfo done!")
    return source

def parse_question_explained(html):
    """Extract the explanation block (analysis, answer steps, comments) into a dict."""
    soup = BeautifulSoup(html, "html5lib")
    htmlfind = soup.findAll('div', {'class': "question_part_content"})
    if htmlfind:
        html_section = htmlfind[1]  # must be [1]
    else:
        return
    question_info = {}
    # the label badge on each explanation row carries this exact inline style
    label_style = ("float:left;width:45px;background: #4cd1d4;color: #fff;"
                   "text-align:center;margin-right:10px;border-radius: 3px;")
    tag_div_list = html_section.findAll('div', attrs={"class": "clearfix"})
    print("func html explain len", len(tag_div_list))
    print("write json and then next")
    # 1
    tags = tag_div_list[0].findAll('div', attrs={'style': label_style})
    if tags:
        question_info['fenxi_tag'] = tags[0].text
    try:
        question_info['dianping_neirong'] = tag_div_list[0].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    except IndexError:
        pass
    # 2
    tags = tag_div_list[1].findAll('div', attrs={'style': label_style})
    if tags:
        question_info['jieda_tag'] = tags[0].text
    try:
        question_info['jieda_neirong'] = tag_div_list[1].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    except IndexError:
        pass
    # 3
    tags = tag_div_list[2].findAll('div', attrs={'style': label_style})
    if tags:
        question_info['dianping_tag'] = tags[0].text
    try:
        question_info['fenxi_neirong'] = tag_div_list[2].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    except IndexError:
        pass
    return question_info

def dict_write_fo_json_file(question_url, newdict):
    """Write one question dict to <question-id>.json, using the last URL segment as the name."""
    name = question_url.split('/')[-1].split('.')[0]
    try:
        with codecs.open("%s.json" % name, "wb", 'utf-8') as f:
            json.dump(newdict, f, indent=2, ensure_ascii=False)
    except IOError:
        print(" +++++++++++++++++++ Oops, file error...")
def main():
    """Walk the listing pages (outer loop), then every question on each page (inner loop)."""
    for visit_page in range(700, 801):  # xz 550 done
        # other listing URL patterns for the same unit:
        #   "http://www.leleketang.com/lib/list17-101-2-16-10129-113711-1-%s.shtml"
        #   "http://www.leleketang.com/lib/list17-101-2-16-10129-113711-4-%s.shtml"
        # the segment before the page number selects the question type:
        # 1 = fill-in-the-blank, 2 = multiple choice, 4 = short answer
        url = "http://www.leleketang.com/lib/list17-101-2-16-10129-113711-2-%s.shtml" % visit_page
        response = get_requests(url)
        print("working on page ", visit_page)
        if response:
            page = response
        else:
            return
        found_list_url = find_page(page)
        if not found_list_url:
            return  # page list empty
        for question_url in found_list_url:
            # pause between requests to avoid hammering the site
            time.sleep(random.randint(7, 13))
            print(question_url)
            try:
                dict_variable = crawler_one_page(question_url)
                if dict_variable:
                    dict_write_fo_json_file(question_url, dict_variable)
                else:
                    return "http status error END"
            except Exception:
                print("Main loop question digging something wrong but continue ")
                continue


if __name__ == '__main__':
    main()
'''
gevent.joinall([
    gevent.spawn(main),
])
'''
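
# A hedged sketch, not part of the original flow: since monkey.patch_all() is
# applied at import time, the per-question fetches inside main() could run as
# cooperative greenlets instead of the sequential inner loop, e.g.:
#
#   jobs = [gevent.spawn(crawler_one_page, u) for u in found_list_url]
#   gevent.joinall(jobs)
#   for u, job in zip(found_list_url, jobs):
#       if job.value:
#           dict_write_fo_json_file(u, job.value)
#
# This assumes the site tolerates concurrent requests; the random sleeps above
# exist precisely to throttle, so that trade-off is left to the reader.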