@swshan
Created November 8, 2015 09:28
Slightly improved version of the data-scraping script; the main change is how gevent spawn is used.
#-*- coding:utf-8 -*-
''' For avoiding keyerror '''
from gevent import monkey
monkey.patch_all()
import gevent
import sys
import re
import requests
from bs4 import BeautifulSoup
import urlparse
import urllib2
import codecs
import json
#import pymysql.cursors
#import tomdb
#db = tomdb.Connection("localhost", "db", "root", "password", True, use_charset='utf8mb4')
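# Spoof a desktop Chrome User-Agent for the requests calls below.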
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2543.0 Safari/537.36'
}
# root_url = 'http://wap.douban.com'
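# parse_question() walks the "clearfix" divs inside one question_part_content
# block and pulls out the three labelled explanation sections: fenxi (analysis),
# jieda (solution) and dianpin (commentary). It assumes all three blocks are
# present and appear in that order.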
def parse_question(s):
    question_info = {}
    index = 0
    tag_div_list = s.findAll('div', attrs={"class": "clearfix"})
    print len(tag_div_list)
    """
    for tag_div in tag_div_list:
        tags = tag_div.findAll('div', attrs={'style':"float:left;width:45px;background: #4cd1d4;color: #fff;text-align:center;margin-right:10px;border-radius: 3px;"})
        if tags:
            print index
            # print 'tag: ', tags[0].text, 'content:', tag_div.findAll('span', attrs={'class': 'uc_q_object'})[0].text
            #question_info['tag_%s' % index] = tags[0].text
            #question_info['content_%s'% index] = tag_div.findAll('span', attrs={'class': 'uc_q_object'})[0].text
        index += 1
    """
    # 1
    tags = tag_div_list[0].findAll('div', attrs={'style':"float:left;width:45px;background: #4cd1d4;color: #fff;text-align:center;margin-right:10px;border-radius: 3px;"})
    #question_info['fenxi'] = tags[0].text
    question_info['fenxi'] = tag_div_list[0].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    # 2
    tags = tag_div_list[1].findAll('div', attrs={'style':"float:left;width:45px;background: #4cd1d4;color: #fff;text-align:center;margin-right:10px;border-radius: 3px;"})
    #question_info['jieda'] = tags[0].text
    question_info['jieda'] = tag_div_list[1].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    # 3
    tags = tag_div_list[2].findAll('div', attrs={'style':"float:left;width:45px;background: #4cd1d4;color: #fff;text-align:center;margin-right:10px;border-radius: 3px;"})
    #question_info['dianpin'] = tags[0].text
    question_info['dianpin'] = tag_div_list[2].findAll('span', attrs={'class': 'uc_q_object'})[0].text
    return question_info
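# crawler_one_page() fetches one question page from leleketang.com, extracts the
# title, the question caption, the source line and the explanation sections, and
# returns them as a dict ready for JSON dumping. It returns False if the HTTP
# request fails.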
def crawler_one_page(counts):
    ''' Fetch one question page and return its fields as a dict. '''
    lele_url = "http://www.leleketang.com/lib/%s.shtml" % counts
    try:
        '''
        request = urllib2.Request(lele_url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36')
        r = urllib2.urlopen(request).read()
        '''
        r = requests.get(lele_url, headers=headers, allow_redirects=False)
        r.encoding = 'utf-8'
    except Exception:
        return False
    #f = open("test.html", "a")
    #f.write(r)
    #f.close()
    print lele_url
    print r.status_code
    ''' soup extracts the html '''
    soup = BeautifulSoup(r.text, "lxml")
    title = soup.findAll('title')
    question = soup.findAll('div', {'class': "uc_q_caption"})
    """ fill-type questions use the same caption class
    question = soup.findAll('div', {'class':"uc_q_caption"})
    """
    # raw_answer = soup.findAll('li', {'class':"ucqo_g_solution"})
    raw_answer = soup.findAll('span', {'class': "uc_q_object ucqo_text ucqo_g_blank ucqo_g_solution"})
    ''' another soup lookup: .question_part_box '''
    #detail = soup.findAll('div', {'class':"question_part_content_item"})
    detail = soup.findAll('div', {'class': "question_part_content"})
    source = soup.findAll('span', {'class': "ellipsis"})
    # print source
    # print dir(detail)
    print len(detail)
    question_info = parse_question(detail[1])
    #print question_info
    ''' tag -> text conversion '''
    title = title[0].text
    q = question[0].text
    # ans = raw_answer[0].text
    sour = source[0].text
    """ to dict """
    newdict = {"Question": q,
               "Title": title,
               "explanation": question_info,
               "source": sour,
               #"Answer": ans,
               }
    return newdict
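# write_to_file() dumps the crawled dict to "<id>.json" next to the script.
# ensure_ascii=False keeps the Chinese text readable in the output file.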
def write_to_file(counts, newdict):
    try:
        with codecs.open("%s.json" % counts, "wb", 'utf-8') as f:
            json.dump(newdict, f, indent=2, ensure_ascii=False)
        print "JSON done!"
    except IOError:
        print "Oops, file error..."
def main(counts):
    #print list_url
    newdict = crawler_one_page(counts)
    if not newdict:
        return
    write_to_file(counts, newdict)
    print "Finished!"
if __name__ == '__main__':
    threads = [gevent.spawn(main, counts)
               for counts in xrange(16034465, 16034466)]  # p189 198
    gevent.joinall(threads)
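Note: this is Python 2 code (print statements, xrange, urllib2); it needs gevent, requests, BeautifulSoup 4 and lxml installed, and it writes one <id>.json file per crawled question id.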