@swshan
Created November 25, 2015 01:44
data grab script
#-*- coding:utf-8 -*-
"""
Date 15-11-11, latest progress
"""
import time
import sys
import re
from bs4 import BeautifulSoup
import urlparse
import urllib2
import requests
from urllib2 import URLError
import codecs
import json
def find_page(html_page):
    """Collect every question URL linked from a listing page."""
    question_url = "http://www.leleketang.com/lib/"
    soup = BeautifulSoup(html_page)
    url_list = []
    for link in soup.find_all('a', {"class": "to_view"}):
        var = urlparse.urljoin(question_url, link.get('href'))
        url_list.append(var)
    print url_list
    return url_list
def get_url_response(url):
    """Fetch a URL; return the response object, or None on any HTTP error."""
    try:
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36')
        resp = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        print e.code    # HTTP status code
        print e.reason
        return None
    print "HTTP code is", resp.code
    if resp.code != 200:
        print "END: %s" % url
        return None
    return resp
def crawler_one_page(dig_url):
    """Download one question page and parse it into a dict."""
    response = get_url_response(dig_url)
    print "digging", dig_url
    if response is None:
        return None
    print "Question, http code", response.code
    page_content = response.read()
    question = parse_question(page_content)
    answer = parse_question_answer(page_content)
    print "html answer", answer
    source = parse_question_otherinfo(page_content)
    question_info = parse_question_explained(page_content)

    # Parsing is done; build the dict to be serialised.
    question_text = question[0].text
    source_text = source[0].text
    if not answer:
        # Some pages (e.g. explanation-only questions) carry no answer element.
        dict_to_json = {"Question": question_text,
                        "dianpin": question_info,
                        "source": source_text,
                        }
    else:
        answer_text = answer[0].text
        dict_to_json = {"Question": question_text,
                        "dianpin": question_info,
                        "source": source_text,
                        "answer": answer_text,
                        }
    return dict_to_json
def dict_write_fo_json_file(question_url, newdict):
    """Dump the parsed question dict to a JSON file named after its URL."""
    # The URL contains '/' and ':', which are not valid in a file name,
    # so they are replaced before building the output path.
    filename = re.sub(r'[:/]+', '_', question_url)
    try:
        with codecs.open("%s.json" % filename, "wb", 'utf-8') as f:
            json.dump(newdict, f, indent=2, ensure_ascii=False)
    except IOError:
        print "Oops, file error..."
def parse_question(html):
    """Extract the question caption element(s) from a question page."""
    soup = BeautifulSoup(html)
    #question = soup.findAll('div', {'class':"uc_q"})
    # For fill-in questions the caption lives in the "uc_q_caption" div.
    question = soup.findAll('div', {'class': "uc_q_caption"})
    print "parse_ques done"
    return question
def parse_question_answer(html):
    soup = BeautifulSoup(html)
    print "HTML with answer"
    # The leading space in the class string is deliberate.
    find_answer = soup.findAll('li', {'class': " ucqo_g_solution"})
    # findAll returns an empty list (not None) when nothing matches,
    # so fall back to the other markup variant in that case.
    if not find_answer:
        find_answer = soup.findAll('span', {'class': "uc_q_object ucqo_text ucqo_g_blank ucqo_g_solution"})
    print "FIND ANSWER ", find_answer
    return find_answer
def parse_question_otherinfo(r):
    soup = BeautifulSoup(r)
    source = soup.findAll('span', {'class': "ellipsis"})
    print "parse_question_otherinfo done!"
    return source
def parse_question_explained(html):
    soup = BeautifulSoup(html)
    htmlfind = soup.findAll('div', {'class': "question_part_content"})
    html_section = htmlfind[1]  # the explanation block is the second match
    question_info = {}
    # The label divs are only identifiable by their inline style string.
    label_style = ("float:left;width:45px;background: #4cd1d4;color: #fff;"
                   "text-align:center;margin-right:10px;border-radius: 3px;")
    tag_div_list = html_section.findAll('div', attrs={"class": "clearfix"})
    print len(tag_div_list)
    # 1 (a traceback was seen here in an earlier run)
    tags = tag_div_list[0].findAll('div', attrs={'style': label_style})
    question_info['answer'] = tags[0].text
    question_info['content'] = tag_div_list[0].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    # 2
    tags = tag_div_list[1].findAll('div', attrs={'style': label_style})
    question_info['s1'] = tags[0].text
    question_info['c2'] = tag_div_list[1].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    # 3
    tags = tag_div_list[2].findAll('div', attrs={'style': label_style})
    question_info['d1'] = tags[0].text
    question_info['f2'] = tag_div_list[2].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    return question_info
def main():
    """Walk the listing pages, then crawl every question linked from them."""
    for visit_page in xrange(1, 3):
        # NOTE: the original gist never defines `url`; the listing-page URL
        # for page number `visit_page` has to be supplied here.
        response = get_url_response(url)
        if response is None:
            continue
        page = response.read()
        found_list_url = find_page(page)
        for question_url in found_list_url:
            dict_variable = crawler_one_page(question_url)
            if dict_variable is not None:
                dict_write_fo_json_file(question_url, dict_variable)


if __name__ == '__main__':
    main()