data grab script
#-*- coding:utf-8 -*-
"""
Date 15-11-11 latest progress
"""
import time
import sys
import re
from bs4 import BeautifulSoup
import urlparse
import urllib2
import requests
from urllib2 import URLError
import codecs
import json
def find_page(html_page):
    """Collect question-page URLs from a listing page."""
    question_url = "http://www.leleketang.com/lib/"
    soup = BeautifulSoup(html_page, "html.parser")
    url_list = []
    for link in soup.find_all('a', {"class": "to_view"}):
        var = urlparse.urljoin(question_url, link.get('href'))
        url_list.append(var)
    print url_list
    return url_list
def get_url_response(url):
    try:
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36')
        resp = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        print e.code    # HTTP status code
        print e.reason
        return None     # callers must handle the None case
    print "HTTP code is ", resp.code
    if resp.code != 200:
        return "END: %s" % url  # change unknown
    return resp
def crawler_one_page(dig_url):
    response = get_url_response(dig_url)
    print "digging", dig_url
    if response is None:
        return None
    print "Question, http code ", response.code
    if response.code != 200:
        return "END: %s" % dig_url  # change unknown
    page_content = response.read()
    question = parse_question(page_content)
    answer = parse_question_answer(page_content)
    print "html answer", answer
    source = parse_question_otherinfo(page_content)
    question_info = parse_question_explained(page_content)
    # Parsing is done; build the dict that will be dumped to JSON.
    question_text = question[0].text
    source_text = source[0].text
    if not answer:
        # Exception case: pages without an answer node, such as explanation-only questions.
        dict_to_json = {"Question": question_text,
                        "dianpin": question_info,
                        "source": source_text,
                        }
    else:
        answer_text = answer[0].text
        dict_to_json = {"Question": question_text,
                        "dianpin": question_info,
                        "source": source_text,
                        "answer": answer_text,
                        }
    return dict_to_json
def dict_write_fo_json_file(question_url, newdict):
    # The URL is used as the file name; replace path separators so the open() call succeeds.
    safe_name = question_url.replace('://', '_').replace('/', '_')
    try:
        with codecs.open("%s.json" % safe_name, "wb", 'utf-8') as f:
            json.dump(newdict, f, indent=2, ensure_ascii=False)
    except IOError:
        print("Oops, file error...")
def parse_question(html):
    """Extract the question caption node(s)."""
    soup = BeautifulSoup(html, "html.parser")
    # For fill-in questions the caption lives in div.uc_q_caption (not div.uc_q).
    question = soup.findAll('div', {'class': "uc_q_caption"})
    print "parse_ques done"
    return question
def parse_question_answer(html):
    soup = BeautifulSoup(html, "html.parser")
    print "HTML with answer "
    find_answer = soup.findAll('li', {'class': " ucqo_g_solution"})  # the leading space is required
    # Fallback for the other page layout; findAll returns an empty list, never None.
    if not find_answer:
        find_answer = soup.findAll('span', {'class': "uc_q_object ucqo_text ucqo_g_blank ucqo_g_solution"})
    print "FIND ANSWER ", find_answer
    return find_answer
def parse_question_otherinfo(r):
    soup = BeautifulSoup(r, "html.parser")
    source = soup.findAll('span', {'class': "ellipsis"})
    print "parse_question_otherinfo done!"
    return source
def parse_question_explained(html):
    soup = BeautifulSoup(html, "html.parser")
    htmlfind = soup.findAll('div', {'class': "question_part_content"})
    html_section = htmlfind[1]  # the explanation block is the second match; must be [1]
    question_info = {}
    # Inline style used to mark the badge divs on the page; matched verbatim.
    badge_style = ("float:left;width:45px;background: #4cd1d4;color: #fff;"
                   "text-align:center;margin-right:10px;border-radius: 3px;")
    # Assumes the explanation section contains at least three 'clearfix' blocks.
    tag_div_list = html_section.findAll('div', attrs={"class": "clearfix"})
    print len(tag_div_list)
    # block 1
    tags = tag_div_list[0].findAll('div', attrs={'style': badge_style})
    question_info['answer'] = tags[0].text
    question_info['content'] = tag_div_list[0].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    # block 2
    tags = tag_div_list[1].findAll('div', attrs={'style': badge_style})
    question_info['s1'] = tags[0].text
    question_info['c2'] = tag_div_list[1].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    # block 3
    tags = tag_div_list[2].findAll('div', attrs={'style': badge_style})
    question_info['d1'] = tags[0].text
    question_info['f2'] = tag_div_list[2].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    return question_info
def main():
    """Walk the listing pages, then crawl every question URL found on each one."""
    # TODO: define `url`, the listing-page URL to visit; it is left undefined here,
    # and the loop counter is not yet used to build it.
    for visit_page in xrange(1, 3):
        response = get_url_response(url)
        page = response.read()
        found_list_url = find_page(page)
        for question_url in found_list_url:
            dict_variable = crawler_one_page(question_url)
            dict_write_fo_json_file(question_url, dict_variable)


if __name__ == '__main__':
    main()
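For reference, a minimal usage sketch of how the pieces fit together for a single question page. The module name grab.py and the question URL below are placeholders, not values from the original script; real question URLs come from find_page().

# usage_sketch.py -- hypothetical companion script, not part of the original gist
from grab import crawler_one_page, dict_write_fo_json_file  # assumes the gist is saved as grab.py

demo_url = "http://www.leleketang.com/lib/..."  # placeholder question URL
record = crawler_one_page(demo_url)             # fetch and parse one question page
if record:
    dict_write_fo_json_file(demo_url, record)   # dump the parsed fields to a .json file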