# -*- coding:utf-8 -*-
"""
My task data crawler, revision 10: add Redis caching.
Latest progress: 2016-01-03.
"""
from __future__ import print_function
import gevent
from gevent import monkey
# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()
import time
import sys
import re
import random
from bs4 import BeautifulSoup
import urlparse
import urllib2  # used only by the legacy get_url_response() below
import requests
import codecs
import simplejson as json
import redis
cache = redis.StrictRedis(host='localhost', port=6379, db=0)
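# Visited-URL cache: keys are question IDs parsed from the URL, values are the
# full URLs, so a crawl can be resumed without re-fetching pages. A minimal
# sketch of inspecting the cache (the key '12345' is hypothetical):
#   cache.exists('12345')  # -> True if that question was already crawled
#   cache.get('12345')     # -> the full question URL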
def find_page(html_page):
    """Collect question-detail URLs from one list page."""
    question_url = "http://www.leleketang.com/lib/"
    soup = BeautifulSoup(html_page, "html5lib")
    url_list = []
    for link in soup.find_all('a', {"class": "to_view"}):
        url_list.append(urlparse.urljoin(question_url, link.get('href')))
    print(url_list)
    return url_list
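# urljoin resolves the relative hrefs against the library base URL; e.g. a
# hypothetical href "view12345.shtml" becomes
# "http://www.leleketang.com/lib/view12345.shtml".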
def get_url_response(url):
    """Legacy urllib2 fetch; not called from the main flow, kept for reference."""
    request = urllib2.Request(url)
    # add_header() takes one (key, value) pair per call
    request.add_header('Referer', "https://www.google.com.hk/#newwindow=1&safe=strict&q=%E4%B9%90%E4%B9%90%E8%AF%BE%E5%A0%82")
    request.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 5.1.1; Nexus 5 Build/LMY48B; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/43.0.2357.65 Mobile Safari/537.36')
    try:
        resp = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        print("get_url_response: HTTP status %s, %s" % (e.code, e.reason))
        return
    if resp.code != 200:
        return "END: %s" % url
    return resp
def get_requests(url):
    """Fetch a page with requests; retry once after a 504."""
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8) AppleWebKit/536.25 (KHTML, like Gecko) Version/6.0 Safari/536.25'}
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    if r.status_code != 200:
        print(r.status_code)
        print(" +++++++++++++ http status error END %s" % url)
    if r.status_code == 504:
        time.sleep(8)
        r = requests.get(url, headers=headers)
    if r.status_code == 302:
        # requests follows redirects by default, so this branch rarely fires
        print(' >>> http 3xx')
    return r.text
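# A more general fetch-with-retry could look like the sketch below; the helper
# name, attempt count, and backoff schedule are assumptions, and the script
# does not call it.
def fetch_with_retry(url, headers=None, attempts=3):
    for attempt in range(attempts):
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        time.sleep(2 ** attempt)  # 1s, 2s, 4s backoff between attempts
    return None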
def crawler_one_page(dig_url):
    """Fetch one question page, parse it, and return a dict ready for JSON."""
    print('in crawler_one_page function')
    question_id = dig_url.split('/')[-1].split('.')[0]
    if cache.exists(question_id):
        print('url exists, skipping')
        return
    cache.set(question_id, dig_url)
    print('crawler_one_page function working')
    try:
        response = get_requests(dig_url)
    except requests.RequestException:
        print('func get_requests not working, maybe a networking or HTTP error')
        print(dig_url)
        return
    if response:
        page_content = response
    else:
        return "http status error END"
    question = parse_question(page_content)
    answer = parse_question_answer(page_content)
    source = parse_question_otherinfo(page_content)
    question_info = parse_question_explained(page_content)
    # Parsing done; build the dict. Sometimes the question list comes back empty.
    try:
        question_text = question[0].text
    except IndexError:
        print(" question body not found, must be something wrong ")
        return
    source_text = source[0].text
    if not answer:
        # No answer node, e.g. for explain-only questions
        dict_to_json = {"Question": question_text,
                        "dianpin": question_info,
                        "source": source_text,
                        }
    else:
        answer_text = answer[0].text
        dict_to_json = {"Question": question_text,
                        "dianpin": question_info,
                        "source": source_text,
                        "answer": answer_text,
                        }
    return dict_to_json
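# The returned dict serializes to JSON shaped roughly like this (values are
# illustrative, not real site data):
#   {"Question": "...", "dianpin": {"fenxi_tag": "...", ...},
#    "source": "...", "answer": "..."}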
def parse_question(html):
    """Parse the question caption; 'uc_q_caption' also covers fill-in questions."""
    soup = BeautifulSoup(html, "html5lib")
    question = soup.findAll('div', {'class': "uc_q_caption"})
    print("parse_question done")
    return question
def parse_question_answer(html):
    """Parse the answer node, falling back to the blank-style markup."""
    soup = BeautifulSoup(html, "html5lib")
    print("HTML with answer")
    # The leading space in the class string is required by the site's markup
    find_answer = soup.findAll('li', {'class': " ucqo_g_solution"})
    if not find_answer:
        # Other question layout: the solution lives in a span
        find_answer = soup.findAll('span', {'class': "uc_q_object ucqo_text ucqo_g_blank ucqo_g_solution"})
    return find_answer
def parse_question_otherinfo(r):
    """Parse the source/attribution line."""
    soup = BeautifulSoup(r, "html5lib")
    source = soup.findAll('span', {'class': "ellipsis"})
    print("parse_question_otherinfo done!")
    return source
def parse_question_explained(html):
    """Parse the explanation section: analysis / solution / comment blocks."""
    soup = BeautifulSoup(html, "html5lib")
    htmlfind = soup.findAll('div', {'class': "question_part_content"})
    if htmlfind:
        html_section = htmlfind[1]  # must be [1]; [0] is the question body
    else:
        return
    question_info = {}
    tag_div_list = html_section.findAll('div', attrs={"class": "clearfix"})
    print("func html explain len", len(tag_div_list))
    print("write json and then next")
    # The label divs are only identifiable by their inline style
    tag_style = ("float:left;width:45px;background: #4cd1d4;color: #fff;"
                 "text-align:center;margin-right:10px;border-radius: 3px;")
    # Each clearfix block carries a (label, content) pair; zip() also guards
    # against pages with fewer than three blocks, which used to raise here.
    keys = [('fenxi_tag', 'dianping_neirong'),
            ('jieda_tag', 'jieda_neirong'),
            ('dianping_tag', 'fenxi_neirong')]
    for block, (tag_key, content_key) in zip(tag_div_list, keys):
        tags = block.findAll('div', attrs={'style': tag_style})
        if tags:
            question_info[tag_key] = tags[0].text
        try:
            question_info[content_key] = block.findAll('span', attrs={'class': 'uc_q_object'})[0].text
        except IndexError:
            pass
    return question_info
def dict_write_fo_json_file(question_url, newdict):
    """Write one question dict to <question-id>.json."""
    url_stem = question_url.split('/')[-1].split('.')[0]
    try:
        with codecs.open("%s.json" % url_stem, "wb", 'utf-8') as f:
            json.dump(newdict, f, indent=2, ensure_ascii=False)
    except IOError:
        print(" +++++++++++++++++++ Oops, file error...")
def main():
    """Walk the list pages, then crawl every question page found on each."""
    # The second level may need a second loop
    for visit_page in range(300, 301):  # xz 550 done
        # List-URL type codes: 1 = tiankong (fill-in), 2 = xuanze (choice),
        # 4 = jieda (explain); this run crawls the xuanze lists.
        # url = "http://www.leleketang.com/lib/list17-101-2-16-10129-113711-1-%s.shtml" % visit_page
        # url = "http://www.leleketang.com/lib/list17-101-2-16-10129-113711-4-%s.shtml" % visit_page
        url = "http://www.leleketang.com/lib/list17-101-2-16-10129-113711-2-%s.shtml" % visit_page
        response = get_requests(url)
        print("working on page ", visit_page)
        if response:
            page = response
        else:
            return
        found_list_url = find_page(page)
        if not found_list_url:
            return  # page list empty
        for question_url in found_list_url:
            time.sleep(random.randint(7, 9))  # be polite between requests
            print(question_url)
            try:
                dict_variable = crawler_one_page(question_url)
                if dict_variable:
                    dict_write_fo_json_file(question_url, dict_variable)
                else:
                    print('main - page may be empty or already cached')
            except Exception:
                print("Main loop: question digging went wrong, but continue")
                continue
if __name__ == '__main__':
    main()
    # Alternative: run the crawl under gevent instead:
    # gevent.joinall([
    #     gevent.spawn(main),
    # ])
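# Usage sketch (assumes a local Redis on the default port and these PyPI
# packages, inferred from the imports above; the pip line and the file name
# "crawler.py" are assumptions):
#   pip install gevent beautifulsoup4 html5lib requests simplejson redis
#   python crawler.py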