data grab script
#-*- coding:utf-8 -*-
"""
Date 15-11-11 latest progress
"""
import time
import sys
import re
from bs4 import BeautifulSoup
import urlparse
import urllib2
import requests
from urllib2 import URLError
import codecs
import json
def find_page(html_page):
    """Collect question-page URLs from a listing page."""
    question_url = "http://www.leleketang.com/lib/"
    soup = BeautifulSoup(html_page, "html.parser")
    url_list = []
    for link in soup.find_all('a', {"class": "to_view"}):
        var = urlparse.urljoin(question_url, link.get('href'))
        url_list.append(var)
    print url_list
    return url_list
def get_url_response(url):
    try:
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36')
        resp = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        print e.code    # HTTP status code
        print e.reason
        return None     # callers must handle the None case
    print "HTTP code is ", resp.code
    if resp.code != 200:
        return "END: %s" % url  # change unknown
    return resp
def crawler_one_page(dig_url):
    response = get_url_response(dig_url)
    print "digging", dig_url
    if response is None:
        return None
    print "Question, http code ", response.code
    if response.code != 200:
        return "END: %s" % dig_url  # change unknown
    page_content = response.read()
    question = parse_question(page_content)
    answer = parse_question_answer(page_content)
    print "html answer", answer
    source = parse_question_otherinfo(page_content)
    question_info = parse_question_explained(page_content)
    # Parsing is done; build the dict that will be dumped to JSON.
    question_text = question[0].text
    source_text = source[0].text
    if not answer:
        # Exception case: pages without an answer node, such as explanation-only questions.
        dict_to_json = {"Question": question_text,
                        "dianpin": question_info,
                        "source": source_text,
                        }
    else:
        answer_text = answer[0].text
        dict_to_json = {"Question": question_text,
                        "dianpin": question_info,
                        "source": source_text,
                        "answer": answer_text,
                        }
    return dict_to_json
def dict_write_fo_json_file(question_url, newdict):
    # The URL is used as the file name; replace path separators so the open() call succeeds.
    safe_name = question_url.replace('://', '_').replace('/', '_')
    try:
        with codecs.open("%s.json" % safe_name, "wb", 'utf-8') as f:
            json.dump(newdict, f, indent=2, ensure_ascii=False)
    except IOError:
        print("Oops, file error...")
def parse_question(html):
    """Extract the question caption node(s)."""
    soup = BeautifulSoup(html, "html.parser")
    # For fill-in questions the caption lives in div.uc_q_caption (not div.uc_q).
    question = soup.findAll('div', {'class': "uc_q_caption"})
    print "parse_ques done"
    return question
def parse_question_answer(html):
    soup = BeautifulSoup(html, "html.parser")
    print "HTML with answer "
    find_answer = soup.findAll('li', {'class': " ucqo_g_solution"})  # the leading space is required
    # Fallback for the other page layout; findAll returns an empty list, never None.
    if not find_answer:
        find_answer = soup.findAll('span', {'class': "uc_q_object ucqo_text ucqo_g_blank ucqo_g_solution"})
    print "FIND ANSWER ", find_answer
    return find_answer
def parse_question_otherinfo(r):
    soup = BeautifulSoup(r, "html.parser")
    source = soup.findAll('span', {'class': "ellipsis"})
    print "parse_question_otherinfo done!"
    return source
def parse_question_explained(html):
    soup = BeautifulSoup(html, "html.parser")
    htmlfind = soup.findAll('div', {'class': "question_part_content"})
    html_section = htmlfind[1]  # the explanation block is the second match; must be [1]
    question_info = {}
    # Inline style used to mark the badge divs on the page; matched verbatim.
    badge_style = ("float:left;width:45px;background: #4cd1d4;color: #fff;"
                   "text-align:center;margin-right:10px;border-radius: 3px;")
    # Assumes the explanation section contains at least three 'clearfix' blocks.
    tag_div_list = html_section.findAll('div', attrs={"class": "clearfix"})
    print len(tag_div_list)
    # block 1
    tags = tag_div_list[0].findAll('div', attrs={'style': badge_style})
    question_info['answer'] = tags[0].text
    question_info['content'] = tag_div_list[0].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    # block 2
    tags = tag_div_list[1].findAll('div', attrs={'style': badge_style})
    question_info['s1'] = tags[0].text
    question_info['c2'] = tag_div_list[1].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    # block 3
    tags = tag_div_list[2].findAll('div', attrs={'style': badge_style})
    question_info['d1'] = tags[0].text
    question_info['f2'] = tag_div_list[2].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    return question_info
def main():
    """Walk the listing pages, then crawl every question URL found on each one."""
    # TODO: define `url`, the listing-page URL to visit; it is left undefined here,
    # and the loop counter is not yet used to build it.
    for visit_page in xrange(1, 3):
        response = get_url_response(url)
        page = response.read()
        found_list_url = find_page(page)
        for question_url in found_list_url:
            dict_variable = crawler_one_page(question_url)
            dict_write_fo_json_file(question_url, dict_variable)


if __name__ == '__main__':
    main()
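For reference, a minimal usage sketch of how the pieces fit together for a single question page. The module name grab.py and the question URL below are placeholders, not values from the original script; real question URLs come from find_page().

# usage_sketch.py -- hypothetical companion script, not part of the original gist
from grab import crawler_one_page, dict_write_fo_json_file  # assumes the gist is saved as grab.py

demo_url = "http://www.leleketang.com/lib/..."  # placeholder question URL
record = crawler_one_page(demo_url)             # fetch and parse one question page
if record:
    dict_write_fo_json_file(demo_url, record)   # dump the parsed fields to a .json file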