Modified on 11.11, reworking the coupling between functions.
# -*- coding: utf-8 -*-
"""
Date 15-11-11, latest progress.

Crawls question pages, parses the question, answer, source and explanation,
and writes each question out to an "<id>.json" file.
"""
import time
import sys
import re
import gevent
from BeautifulSoup import BeautifulSoup
import urlparse
import urllib2
from urllib2 import URLError
import codecs
import json
from gevent import monkey
header_info = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36'
}

def get_url_response(url):
    """Fetch url with the module-level headers; return the response, or None on error."""
    try:
        request = urllib2.Request(url, headers=header_info)
        resp = urllib2.urlopen(request)
    except URLError as e:
        print e.reason
        return None
    return resp

def parse_question(html):
    """Extract the question block; findAll returns [] (not an exception) on a miss."""
    soup = BeautifulSoup(html)
    question = soup.findAll('div', {'class': "uc_q"})
    if not question:
        # fall back to the alternative question class
        question = soup.findAll('div', {'class': "uc_q_caption"})
    print "parse_ques done"
    print question
    return question

def parse_question_answer(html):
    """Extract the answer element (the class name really has a leading space)."""
    soup = BeautifulSoup(html)
    find_answer = soup.findAll('li', {'class': " ucqo_g_solution"})  # must keep the leading space
    if not find_answer:
        find_answer = soup.findAll('span', {'class': "ucqo_g_solution"})
    print "FIND ANSWER ", find_answer
    return find_answer

def parse_question_otherinfo(r):
    """Extract the question source from the "ellipsis" spans."""
    soup = BeautifulSoup(r)
    source = soup.findAll('span', {'class': "ellipsis"})
    print "parse_question_otherinfo done!"
    #print source
    return source

# Inline style used by the answer-tag <div>s; matched verbatim below.
ANSWER_TAG_STYLE = "float:left;width:45px;background: #4cd1d4;color: #fff;text-align:center;margin-right:10px;border-radius: 3px;"

def parse_question_explained(html):
    """Extract the explanation blocks from the second question_part_content div."""
    soup = BeautifulSoup(html)
    htmlfind = soup.findAll('div', {'class': "question_part_content"})
    # print "HTML question explained \n", htmlfind
    html_section = htmlfind[1]  # must be [1]
    question_info = {}
    tag_div_list = html_section.findAll('div', attrs={"class": "clearfix"})
    print len(tag_div_list)
    # block 1 (a traceback was once seen here)
    tags = tag_div_list[0].findAll('div', attrs={'style': ANSWER_TAG_STYLE})
    question_info['answer'] = tags[0].text
    question_info['content'] = tag_div_list[0].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    # block 2
    tags = tag_div_list[1].findAll('div', attrs={'style': ANSWER_TAG_STYLE})
    question_info['s1'] = tags[0].text
    question_info['c2'] = tag_div_list[1].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    # block 3
    tags = tag_div_list[2].findAll('div', attrs={'style': ANSWER_TAG_STYLE})
    question_info['d1'] = tags[0].text
    question_info['f2'] = tag_div_list[2].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    return question_info

def crawler_one_page(lele_url):
    """Fetch one question page and assemble its fields into a dict."""
    response = get_url_response(lele_url)
    if response is None:
        return None
    print "HTTP code is ", response.code
    if response.code != 200:
        return "END: %s" % lele_url
    page_content = response.read()
    question = parse_question(page_content)
    answer = parse_question_answer(page_content)
    print "html answer", answer
    source = parse_question_otherinfo(page_content)
    question_info = parse_question_explained(page_content)
    #print question_info
    # pull plain text out of the parsed tags
    question_text = question[0].text
    answer_text = answer[0].text
    source_text = source[0].text
    # collect everything into one dict for the JSON dump
    dict_to_json = {"Question": question_text,
                    "dianpin": question_info,
                    "source": source_text,
                    "answer": answer_text,
                    }
    return dict_to_json

def dict_write_to_json_file(question_url, newdict):
    """Dump the crawled dict to "<question_id>.json" as UTF-8."""
    try:
        with codecs.open("%s.json" % question_url, "w", 'utf-8') as f:
            json.dump(newdict, f, indent=2, ensure_ascii=False)
    except IOError:
        print("Oops, file error...")

def main():
    # crawl this (hard-coded) range of question ids
    for question_url in xrange(493175, 493177):
        print question_url
        # NOTE: only the domain survives in the gist; "%s" below just marks
        # where the question id belongs in the real URL path.
        lele_url = "http://wap.douban.com/%s" % question_url
        dict_to_json = crawler_one_page(lele_url)
        if dict_to_json:
            dict_write_to_json_file(question_url, dict_to_json)

if __name__ == '__main__':
    main()
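
# Rough usage sketch (the gist does not give a filename, so "lele_crawler.py"
# below is only an assumption): run under a Python 2 interpreter with
# BeautifulSoup 3 installed, and each crawled question id is written out as
# "<id>.json" in the current working directory.
#
#   $ python lele_crawler.py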