my task data crawler, revision 10, add redis
# -*- coding:utf-8 -*-
"""
Date 16-Jan-3q, latest progress
"""
from __future__ import print_function

import gevent
from gevent import monkey
# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()

import time
import sys
import re
import random

from bs4 import BeautifulSoup
import urlparse
import urllib2
#import grequests
import requests
import codecs
import simplejson as json
import redis

cache = redis.StrictRedis(host='localhost', port=6379, db=0)
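# The redis instance above is used as a simple de-duplication cache:
# crawler_one_page() stores each question's numeric id (the last path segment
# of its URL, extension stripped) as the key and the full URL as the value,
# so a question page is only fetched once across runs.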
def find_page(html_page):
    question_url = "http://www.leleketang.com/lib/"
    soup = BeautifulSoup(html_page, "html5lib")
    # collect the absolute URL of every question linked from the list page
    url_list = []
    for link in soup.find_all('a', {"class": "to_view"}):
        url_list.append(urlparse.urljoin(question_url, link.get('href')))
    print(url_list)
    return url_list
def get_url_response(url):
    """Alternative fetcher built on urllib2 (not used by main)."""
    request = urllib2.Request(url)
    # add_header() takes one (name, value) pair per call
    request.add_header('Referer', "https://www.google.com.hk/#newwindow=1&safe=strict&q=%E4%B9%90%E4%B9%90%E8%AF%BE%E5%A0%82")
    request.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 5.1.1; Nexus 5 Build/LMY48B; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/43.0.2357.65 Mobile Safari/537.36')
    try:
        resp = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        print("get url func http status code wrong", e.code)  # http status code
        print(e.reason)
        return
    if resp.code != 200:
        return "END: %s" % url  # change unknown
    return resp
def get_requests(url):
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8) AppleWebKit/536.25 (KHTML, like Gecko) Version/6.0 Safari/536.25'}
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    if r.status_code != 200:
        print(r.status_code)
        print(" +++++++++++++ http status error END %s" % url)
    if r.status_code == 504:
        # gateway timeout: back off and retry once
        time.sleep(8)
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'
    if r.status_code == 302:
        print(' >>> http 3xx')
    return r.text
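# Note: get_requests() always returns r.text, even for non-200 responses;
# HTTP errors are only reported on stdout.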
def crawler_one_page(dig_url):
    print('in crawler_one_page function')
    # the numeric question id at the end of the URL doubles as the redis key
    question_id = dig_url.split('/')[-1].split('.')[0]
    if cache.exists(question_id):
        print('url exist')
        return
    else:
        cache.set(question_id, dig_url)

    print('crawler_one_page function working')
    try:
        response = get_requests(dig_url)
    except Exception:
        print('func get_requests not working, may be networking or http error')
        print(dig_url)
        response = None

    if response:
        page_content = response
    else:
        print("http status error END")
        return

    question = parse_question(page_content)
    answer = parse_question_answer(page_content)
    source = parse_question_otherinfo(page_content)
    question_info = parse_question_explained(page_content)

    # parsing is done, now build the dict; sometimes the result lists are empty
    try:
        question_text = question[0].text
    except IndexError:
        print(" question body not found, must be something wrong ")
        return

    source_text = source[0].text
    if not answer:
        # some questions (e.g. explain-only ones) have no answer node
        dict_to_json = {"Question": question_text,
                        "dianpin": question_info,
                        "source": source_text,
                        }
    else:
        answer_text = answer[0].text
        dict_to_json = {"Question": question_text,
                        "dianpin": question_info,
                        "source": source_text,
                        "answer": answer_text,
                        }
    return dict_to_json
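# On success crawler_one_page() returns a dict shaped like
# {"Question": ..., "dianpin": {...}, "source": ..., "answer": ...}
# ("answer" is omitted when no answer node is found). Already-cached or
# unparseable pages yield no dict, so main() skips writing a file for them.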
def parse_question(html):
    """Return the question body nodes (div.uc_q_caption covers fill-in questions too)."""
    soup = BeautifulSoup(html, "html5lib")
    #question = soup.findAll('div', {'class': "uc_q"})
    question = soup.findAll('div', {'class': "uc_q_caption"})
    print("parse_ques done")
    return question
def parse_question_answer(html):
    soup = BeautifulSoup(html, "html5lib")
    print("HTML with answer")
    # the class string really does start with a space; keep it as-is
    find_answer = soup.findAll('li', {'class': " ucqo_g_solution"})
    # fallback for fill-in-the-blank answers
    if not find_answer:
        find_answer = soup.findAll('span', {'class': "uc_q_object ucqo_text ucqo_g_blank ucqo_g_solution"})
    return find_answer
def parse_question_otherinfo(r):
    soup = BeautifulSoup(r, "html5lib")
    # the question source (textbook / exam reference) sits in span.ellipsis
    source = soup.findAll('span', {'class': "ellipsis"})
    print("parse_question_otherinfo done!")
    return source
# the analysis / explanation blocks are only distinguishable by this inline style
TAG_LABEL_STYLE = ("float:left;width:45px;background: #4cd1d4;color: #fff;"
                   "text-align:center;margin-right:10px;border-radius: 3px;")

def parse_question_explained(html):
    soup = BeautifulSoup(html, "html5lib")
    htmlfind = soup.findAll('div', {'class': "question_part_content"})
    if len(htmlfind) > 1:
        html_section = htmlfind[1]  # must be [1]
    else:
        return

    question_info = {}
    tag_div_list = html_section.findAll('div', attrs={"class": "clearfix"})
    print("func html explain len", len(tag_div_list))
    print("write json and then next")
    if not tag_div_list:
        return question_info

    # 1
    tags = tag_div_list[0].findAll('div', attrs={'style': TAG_LABEL_STYLE})
    if tags:
        question_info['fenxi_tag'] = tags[0].text
    try:
        question_info['dianping_neirong'] = tag_div_list[0].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    except IndexError:
        pass
    # 2
    if len(tag_div_list) > 1:
        tags = tag_div_list[1].findAll('div', attrs={'style': TAG_LABEL_STYLE})
        if tags:
            question_info['jieda_tag'] = tags[0].text
        try:
            question_info['jieda_neirong'] = tag_div_list[1].findAll('span', attrs={'class': 'uc_q_object'})[0].text
        except IndexError:
            pass
    # 3
    if len(tag_div_list) > 2:
        tags = tag_div_list[2].findAll('div', attrs={'style': TAG_LABEL_STYLE})
        if tags:
            question_info['dianping_tag'] = tags[0].text
        try:
            question_info['fenxi_neirong'] = tag_div_list[2].findAll('span', attrs={'class': 'uc_q_object'})[0].text
        except IndexError:
            pass
    return question_info
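# Possible keys in the dict returned by parse_question_explained():
# fenxi_tag, dianping_neirong, jieda_tag, jieda_neirong, dianping_tag and
# fenxi_neirong, depending on which labelled blocks the page contains.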
def dict_write_fo_json_file(question_url, newdict):
    # name the file after the question id (last path segment, extension stripped)
    question_id = question_url.split('/')[-1].split('.')[0]
    try:
        with codecs.open("%s.json" % question_id, "wb", 'utf-8') as f:
            json.dump(newdict, f, indent=2, ensure_ascii=False)
    except IOError:
        print(" +++++++++++++++++++ Oops, file error...")
def main():
    """Walk the question list pages, then crawl every question linked from them."""
    for visit_page in range(300, 301):  # xz 550 done
        # list type in the URL: 1 = tiankong (fill-in), 2 = xuanze (multiple choice), 4 = jieda (free response)
        #url = "http://www.leleketang.com/lib/list17-101-2-16-10129-113711-1-%s.shtml" % visit_page
        #url = "http://www.leleketang.com/lib/list17-101-2-16-10129-113711-4-%s.shtml" % visit_page
        url = "http://www.leleketang.com/lib/list17-101-2-16-10129-113711-2-%s.shtml" % visit_page
        response = get_requests(url)
        print("working on page ", visit_page)
        if response:
            page = response
        else:
            return

        found_list_url = find_page(page)
        if not found_list_url:
            return  # page list empty

        for question_url in found_list_url:
            # throttle so the site is not hammered
            time.sleep(random.randint(7, 9))
            print(question_url)
            try:
                dict_variable = crawler_one_page(question_url)
                if dict_variable:
                    dict_write_fo_json_file(question_url, dict_variable)
                else:
                    print('main - this question may be empty or already crawled')
            except Exception:
                print("Main loop question digging something wrong but continue")
                continue
if __name__ == '__main__':
    main()

'''
gevent.joinall([
    gevent.spawn(main),
])
'''
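# A hedged sketch of how the commented-out gevent block above could be used:
# instead of the sequential loop in main(), each question page found on a
# list page could be crawled in its own greenlet. crawl_concurrently() is a
# hypothetical helper, not part of the original script, and it drops the
# per-request sleep that main() uses to throttle requests; it is left
# commented out so the script keeps its current behaviour.
#
# def crawl_concurrently(list_page_url):
#     page = get_requests(list_page_url)
#     question_urls = find_page(page)
#     jobs = [gevent.spawn(crawler_one_page, url) for url in question_urls]
#     gevent.joinall(jobs)
#     for url, job in zip(question_urls, jobs):
#         if isinstance(job.value, dict):
#             dict_write_fo_json_file(url, job.value)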