A slightly improved version of the data-scraping script; the main change is the gevent spawn part.
#-*- coding:utf-8 -*-
''' For avoiding keyerror '''
# patch the standard library as early as possible so blocking I/O becomes cooperative under gevent
from gevent import monkey
monkey.patch_all()

import gevent
import sys
import re
import requests
from bs4 import BeautifulSoup
import urlparse
import urllib2
import codecs
import json
#import pymysql.cursors
#import tomdb

#db = tomdb.Connection("localhost", "db", "root", "password", True, use_charset='utf8mb4')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2543.0 Safari/537.36'
}

# root_url = 'http://wap.douban.com'
def parse_question(s):
    ''' Pull the analysis / answer / comment spans out of one question detail block '''
    question_info = {}
    tag_div_list = s.findAll('div', attrs={"class": "clearfix"})
    print len(tag_div_list)
    """
    index = 0
    for tag_div in tag_div_list:
        tags = tag_div.findAll('div', attrs={'style': "float:left;width:45px;background: #4cd1d4;color: #fff;text-align:center;margin-right:10px;border-radius: 3px;"})
        if tags:
            print index
            # print 'tag: ', tags[0].text, 'content:', tag_div.findAll('span', attrs={'class': 'uc_q_object'})[0].text
            #question_info['tag_%s' % index] = tags[0].text
            #question_info['content_%s' % index] = tag_div.findAll('span', attrs={'class': 'uc_q_object'})[0].text
        index += 1
    """
    # the coloured label div in each block is skipped; only the content span is kept
    # 1: fenxi (analysis)
    question_info['fenxi'] = tag_div_list[0].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    # 2: jieda (answer explanation)
    question_info['jieda'] = tag_div_list[1].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    # 3: dianpin (comment)
    question_info['dianpin'] = tag_div_list[2].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    return question_info
def crawler_one_page(counts):
    ''' Fetch one question page by id and return its parsed fields as a dict '''
    lele_url = "http://www.leleketang.com/lib/%s.shtml" % counts
    try:
        '''
        request = urllib2.Request(lele_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36')
        r = urllib2.urlopen(request).read()
        '''
        r = requests.get(lele_url, headers=headers, allow_redirects=False)
        r.encoding = 'utf-8'
    except Exception:
        return False

    #f = open("test.html", "a")
    #f.write(r)
    #f.close()

    print lele_url
    print r.status_code

    ''' soup extract html '''
    soup = BeautifulSoup(r.text, "lxml")
    title = soup.findAll('title')
    question = soup.findAll('div', {'class': "uc_q_caption"})
    """ for fill question class
    question = soup.findAll('div', {'class':"uc_q_caption"})
    """
    # raw_answer = soup.findAll('li', {'class':"ucqo_g_solution"})
    raw_answer = soup.findAll('span', {'class': "uc_q_object ucqo_text ucqo_g_blank ucqo_g_solution"})

    ''' another soup '''
    ''' .question_part_box '''
    #detail = soup.findAll('div', {'class':"question_part_content_item"})
    detail = soup.findAll('div', {'class': "question_part_content"})
    source = soup.findAll('span', {'class': "ellipsis"})
    # print source
    # print dir(detail)
    print len(detail)

    # detail[1] holds the explanation block (fenxi / jieda / dianpin)
    question_info = parse_question(detail[1])
    #print question_info

    ''' dict str convert '''
    title = title[0].text
    q = question[0].text
    # ans = raw_answer[0].text
    sour = source[0].text

    """ to dict """
    newdict = {"Question": q,
               "Title": title,
               "explanation": question_info,
               "source": sour,
               #"Answer": ans,
               }
    return newdict
def write_to_file(counts, newdict):
    ''' Dump one crawled question to <counts>.json as UTF-8 '''
    try:
        with codecs.open("%s.json" % counts, "wb", 'utf-8') as f:
            json.dump(newdict, f, indent=2, ensure_ascii=False)
        print "JSON done!"
    except IOError:
        print("Oops, file error...")
def main(counts):
    #print list_url
    newdict = crawler_one_page(counts)
    if not newdict:
        # the request failed; nothing to write for this page
        return
    write_to_file(counts, newdict)
    print "Finished!"
if __name__ == '__main__':
    threads = [gevent.spawn(main, counts)
               for counts in xrange(16034465, 16034466)]  # p189 198
    gevent.joinall(threads)
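
Since the main change here is the gevent.spawn usage, one note on scaling it up: spawning a greenlet per page id is fine for a handful of pages, but a wide id range would open that many connections at once. Below is a minimal sketch, assuming the script above is saved as lele_crawler.py (a hypothetical module name) and using a hypothetical wider id range, of capping concurrency with gevent.pool.Pool instead of bare gevent.spawn:

#-*- coding:utf-8 -*-
from gevent import monkey
monkey.patch_all()

from gevent.pool import Pool
from lele_crawler import main   # hypothetical module name for the script above

START_ID, END_ID = 16034465, 16034965   # hypothetical wider page-id range
pool = Pool(10)                          # at most 10 crawls in flight at once

if __name__ == '__main__':
    for counts in xrange(START_ID, END_ID):
        pool.spawn(main, counts)
    pool.join()   # block until every greenlet has finished

pool.spawn blocks when the pool is full, so at most ten requests are outstanding at any time; otherwise the behaviour matches the gevent.spawn / gevent.joinall version above.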