my task data crawler, revision 10, add redis
# -*- coding:utf-8 -*-
"""
Date 16-Jan-3q, latest progress
"""
from __future__ import print_function

import gevent
from gevent import monkey
# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()

import time
import sys
import re
import random

from bs4 import BeautifulSoup
import urlparse
import urllib2
#import grequests
import requests
import codecs
import simplejson as json
import redis

cache = redis.StrictRedis(host='localhost', port=6379, db=0)
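# The redis instance above is used as a simple de-duplication cache:
# crawler_one_page() stores each question's numeric id (the last path segment
# of its URL, extension stripped) as the key and the full URL as the value,
# so a question page is only fetched once across runs.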
def find_page(html_page):
    question_url = "http://www.leleketang.com/lib/"
    soup = BeautifulSoup(html_page, "html5lib")
    # collect the absolute URL of every question linked from the list page
    url_list = []
    for link in soup.find_all('a', {"class": "to_view"}):
        url_list.append(urlparse.urljoin(question_url, link.get('href')))
    print(url_list)
    return url_list
def get_url_response(url):
    """Alternative fetcher built on urllib2 (not used by main)."""
    request = urllib2.Request(url)
    # add_header() takes one (name, value) pair per call
    request.add_header('Referer', "https://www.google.com.hk/#newwindow=1&safe=strict&q=%E4%B9%90%E4%B9%90%E8%AF%BE%E5%A0%82")
    request.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 5.1.1; Nexus 5 Build/LMY48B; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/43.0.2357.65 Mobile Safari/537.36')
    try:
        resp = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        print("get url func http status code wrong", e.code)  # http status code
        print(e.reason)
        return
    if resp.code != 200:
        return "END: %s" % url  # change unknown
    return resp
def get_requests(url):
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8) AppleWebKit/536.25 (KHTML, like Gecko) Version/6.0 Safari/536.25'}
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    if r.status_code != 200:
        print(r.status_code)
        print(" +++++++++++++ http status error END %s" % url)
    if r.status_code == 504:
        # gateway timeout: back off and retry once
        time.sleep(8)
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'
    if r.status_code == 302:
        print(' >>> http 3xx')
    return r.text
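# Note: get_requests() always returns r.text, even for non-200 responses;
# HTTP errors are only reported on stdout.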
def crawler_one_page(dig_url):
    print('in crawler_one_page function')
    # the numeric question id at the end of the URL doubles as the redis key
    question_id = dig_url.split('/')[-1].split('.')[0]
    if cache.exists(question_id):
        print('url exist')
        return
    else:
        cache.set(question_id, dig_url)

    print('crawler_one_page function working')
    try:
        response = get_requests(dig_url)
    except Exception:
        print('func get_requests not working, may be networking or http error')
        print(dig_url)
        response = None

    if response:
        page_content = response
    else:
        print("http status error END")
        return

    question = parse_question(page_content)
    answer = parse_question_answer(page_content)
    source = parse_question_otherinfo(page_content)
    question_info = parse_question_explained(page_content)

    # parsing is done, now build the dict; sometimes the result lists are empty
    try:
        question_text = question[0].text
    except IndexError:
        print(" question body not found, must be something wrong ")
        return

    source_text = source[0].text
    if not answer:
        # some questions (e.g. explain-only ones) have no answer node
        dict_to_json = {"Question": question_text,
                        "dianpin": question_info,
                        "source": source_text,
                        }
    else:
        answer_text = answer[0].text
        dict_to_json = {"Question": question_text,
                        "dianpin": question_info,
                        "source": source_text,
                        "answer": answer_text,
                        }
    return dict_to_json
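# On success crawler_one_page() returns a dict shaped like
# {"Question": ..., "dianpin": {...}, "source": ..., "answer": ...}
# ("answer" is omitted when no answer node is found). Already-cached or
# unparseable pages yield no dict, so main() skips writing a file for them.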
def parse_question(html):
    """Return the question body nodes (div.uc_q_caption covers fill-in questions too)."""
    soup = BeautifulSoup(html, "html5lib")
    #question = soup.findAll('div', {'class': "uc_q"})
    question = soup.findAll('div', {'class': "uc_q_caption"})
    print("parse_ques done")
    return question
def parse_question_answer(html):
    soup = BeautifulSoup(html, "html5lib")
    print("HTML with answer")
    # the class string really does start with a space; keep it as-is
    find_answer = soup.findAll('li', {'class': " ucqo_g_solution"})
    # fallback for fill-in-the-blank answers
    if not find_answer:
        find_answer = soup.findAll('span', {'class': "uc_q_object ucqo_text ucqo_g_blank ucqo_g_solution"})
    return find_answer
def parse_question_otherinfo(r):
    soup = BeautifulSoup(r, "html5lib")
    # the question source (textbook / exam reference) sits in span.ellipsis
    source = soup.findAll('span', {'class': "ellipsis"})
    print("parse_question_otherinfo done!")
    return source
# the analysis / explanation blocks are only distinguishable by this inline style
TAG_LABEL_STYLE = ("float:left;width:45px;background: #4cd1d4;color: #fff;"
                   "text-align:center;margin-right:10px;border-radius: 3px;")

def parse_question_explained(html):
    soup = BeautifulSoup(html, "html5lib")
    htmlfind = soup.findAll('div', {'class': "question_part_content"})
    if len(htmlfind) > 1:
        html_section = htmlfind[1]  # must be [1]
    else:
        return

    question_info = {}
    tag_div_list = html_section.findAll('div', attrs={"class": "clearfix"})
    print("func html explain len", len(tag_div_list))
    print("write json and then next")
    if not tag_div_list:
        return question_info

    # 1
    tags = tag_div_list[0].findAll('div', attrs={'style': TAG_LABEL_STYLE})
    if tags:
        question_info['fenxi_tag'] = tags[0].text
    try:
        question_info['dianping_neirong'] = tag_div_list[0].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    except IndexError:
        pass
    # 2
    if len(tag_div_list) > 1:
        tags = tag_div_list[1].findAll('div', attrs={'style': TAG_LABEL_STYLE})
        if tags:
            question_info['jieda_tag'] = tags[0].text
        try:
            question_info['jieda_neirong'] = tag_div_list[1].findAll('span', attrs={'class': 'uc_q_object'})[0].text
        except IndexError:
            pass
    # 3
    if len(tag_div_list) > 2:
        tags = tag_div_list[2].findAll('div', attrs={'style': TAG_LABEL_STYLE})
        if tags:
            question_info['dianping_tag'] = tags[0].text
        try:
            question_info['fenxi_neirong'] = tag_div_list[2].findAll('span', attrs={'class': 'uc_q_object'})[0].text
        except IndexError:
            pass
    return question_info
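# Possible keys in the dict returned by parse_question_explained():
# fenxi_tag, dianping_neirong, jieda_tag, jieda_neirong, dianping_tag and
# fenxi_neirong, depending on which labelled blocks the page contains.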
def dict_write_fo_json_file(question_url, newdict):
    # name the file after the question id (last path segment, extension stripped)
    question_id = question_url.split('/')[-1].split('.')[0]
    try:
        with codecs.open("%s.json" % question_id, "wb", 'utf-8') as f:
            json.dump(newdict, f, indent=2, ensure_ascii=False)
    except IOError:
        print(" +++++++++++++++++++ Oops, file error...")
def main():
    """Walk the question list pages, then crawl every question linked from them."""
    for visit_page in range(300, 301):  # xz 550 done
        # list type in the URL: 1 = tiankong (fill-in), 2 = xuanze (multiple choice), 4 = jieda (free response)
        #url = "http://www.leleketang.com/lib/list17-101-2-16-10129-113711-1-%s.shtml" % visit_page
        #url = "http://www.leleketang.com/lib/list17-101-2-16-10129-113711-4-%s.shtml" % visit_page
        url = "http://www.leleketang.com/lib/list17-101-2-16-10129-113711-2-%s.shtml" % visit_page
        response = get_requests(url)
        print("working on page ", visit_page)
        if response:
            page = response
        else:
            return

        found_list_url = find_page(page)
        if not found_list_url:
            return  # page list empty

        for question_url in found_list_url:
            # throttle so the site is not hammered
            time.sleep(random.randint(7, 9))
            print(question_url)
            try:
                dict_variable = crawler_one_page(question_url)
                if dict_variable:
                    dict_write_fo_json_file(question_url, dict_variable)
                else:
                    print('main - this question may be empty or already crawled')
            except Exception:
                print("Main loop question digging something wrong but continue")
                continue
if __name__ == '__main__':
    main()

'''
gevent.joinall([
    gevent.spawn(main),
])
'''
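# A hedged sketch of how the commented-out gevent block above could be used:
# instead of the sequential loop in main(), each question page found on a
# list page could be crawled in its own greenlet. crawl_concurrently() is a
# hypothetical helper, not part of the original script, and it drops the
# per-request sleep that main() uses to throttle requests; it is left
# commented out so the script keeps its current behaviour.
#
# def crawl_concurrently(list_page_url):
#     page = get_requests(list_page_url)
#     question_urls = find_page(page)
#     jobs = [gevent.spawn(crawler_one_page, url) for url in question_urls]
#     gevent.joinall(jobs)
#     for url, job in zip(question_urls, jobs):
#         if isinstance(job.value, dict):
#             dict_write_fo_json_file(url, job.value)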