My web crawl
# _*_ coding: utf-8 _*_
import sys
reload(sys)
sys.setdefaultencoding("utf8")
from gevent import monkey
monkey.patch_all()
import requests
from requests.exceptions import ConnectionError
import redis
import gevent
from gevent.pool import Pool
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient, ReadPreference
import json
import redis.connection
# Swap redis-py's socket module for gevent's so redis I/O cooperates with greenlets
redis.connection.socket = gevent.socket
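# --- Hedged stub (not part of the original gist): JobProjectConfiguration is referenced
# --- below but never defined or imported in this file; it presumably comes from the
# --- project's own config module. The placeholder values here are assumptions so the
# --- script can run stand-alone:
class JobProjectConfiguration(object):
    save_mongo_host = "127.0.0.1"         # assumed
    save_mongo_port = 27017               # assumed MongoDB default
    url_queue_redis_host = "127.0.0.1"    # assumed
    url_queue_redis_port = 6379           # assumed redis default
    url_queue_redis_db = 0                # assumed
    proxy_queue_redis_host = "127.0.0.1"  # assumed
    proxy_queue_redis_port = 6379         # assumed redis default
    proxy_queue_redis_db = 1              # assumed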
mongo_connection = MongoClient(
    '%s:%d' % (
        JobProjectConfiguration.save_mongo_host,
        JobProjectConfiguration.save_mongo_port),
    read_preference=ReadPreference.SECONDARY,
    max_pool_size=10, use_greenlets=True)
mongo_db = mongo_connection.jobdigg
redis_connection = redis.ConnectionPool(
    host=JobProjectConfiguration.url_queue_redis_host,
    port=JobProjectConfiguration.url_queue_redis_port,
    db=JobProjectConfiguration.url_queue_redis_db
)
redis_proxy_pool = redis.ConnectionPool(
    host=JobProjectConfiguration.proxy_queue_redis_host,
    port=JobProjectConfiguration.proxy_queue_redis_port,
    db=JobProjectConfiguration.proxy_queue_redis_db
)
# The real proxy-pool helper (an object exposing getProxy(), called in GenerateUrl below)
# is not included in this gist; a hedged stand-in follows.
proxy_pool = []
pool_num = 100
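# --- Hedged sketch (not part of the original gist): GenerateUrl() below calls
# --- proxy_pool.getProxy() and expects a dict with a "proxy" field, but the helper
# --- itself is never defined here. A minimal stand-in, assuming proxies are stored
# --- as JSON strings in a redis set named "proxy_list" (the key name is an assumption):
class SimpleProxyPool(object):
    def __init__(self, connection_pool):
        self.redis_db = redis.Redis(connection_pool=connection_pool)

    def getProxy(self):
        # srandmember returns a random member without removing it from the set
        raw = self.redis_db.srandmember("proxy_list")
        return json.loads(raw) if raw else {"proxy": ""}

proxy_pool = SimpleProxyPool(redis_proxy_pool)  # stands in for the empty list above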
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip,deflate,sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36"
}
def WYUrlGenerator():
    print '51 Dig start : the url...'
    start = time.time()
    redis_db = redis.Redis(connection_pool=redis_connection)
    urllist = WYJobUrlYield()
    gpool = Pool(pool_num)
    for uargs in urllist:
        gpool.spawn(GenerateUrl, uargs)
    gpool.join()
    # From here on, keep looping: pull urls out of the error-url set and retry them until the set is empty
    length = redis_db.scard("error_url_list")
    while length > 0:
        errorlist = ErrorUrlGenerator()
        epool = Pool(pool_num)
        for url in errorlist:
            epool.spawn(GenerateUrl, url)
        epool.join()
        length = redis_db.scard("error_url_list")
    end = time.time()
    print 'dig end : the url...all spend time is %0.2f' % (end - start)
def WYJobUrlYield():
    for page in xrange(1, 3001):
        url = "http://some.crawl.url with page num %s" % page
        jobitem = {
            "url": url,
            "type": "jobtype"
        }
        jobvalue = json.dumps(jobitem)
        yield jobvalue
# Pull urls back out of the error-url set so they can be processed again
def ErrorUrlGenerator():
    redis_db = redis.Redis(connection_pool=redis_connection)
    urllist = redis_db.smembers("error_url_list")
    for url in urllist:
        yield url
def GenerateUrl(sourcejob):
    redis_db = redis.StrictRedis(connection_pool=redis_connection)
    pipe = redis_db.pipeline()
    newitem = json.loads(sourcejob)
    url = newitem["url"]
    urltype = newitem["type"]
    try:
        ip = proxy_pool.getProxy()
        proxy = {"http": "http://" + ip["proxy"]}
        # Set a timeout here so no single request runs too long and blocks the ones behind it; raise after 5 seconds
        timeout = gevent.Timeout(5, ConnectionError)
        timeout.start()
        r = requests.get(url, headers=header, proxies=proxy)
        jobs = BeautifulSoup(r.text)
        if urltype == "urltype":  # collect every url on the page and save them into a redis set
            results = jobs.findAll("a", {"class": "classname"})
            for result in results:
                url = result["href"]
                urlitem = {
                    "url": url,
                    "type": "urltype"
                }
                urlvalue = json.dumps(urlitem)
                pipe.sadd("url_list", urlvalue)  # save the extracted url (as a JSON string) into the url_list redis set
        pipe.srem("error_url_list", sourcejob)  # reaching this point means the current url, if it came from the error set, has been handled, so remove it here
        pipe.execute()
    except Exception as e:
        error_name = e.__class__.__name__
        # Check the error type: connection and proxy failures are thrown back into the dedicated
        # error-url set so they can be pulled out and retried next time
        if error_name == "ConnectionError" or error_name == "ProxyError":
            redis_db.sadd('error_url_list', sourcejob)
            # The most maddening problem I face now: everything else works fairly normally, but right here,
            # when the program starts up, sadd occasionally throws an exception.
            # This is where a failed url is handled after an exception (saved into error_url_list for later
            # reprocessing), and now the add itself raises an exception, so a large share of the failed urls
            # never make it into the database, and in the end far too little data gets crawled.
            # The exception looks roughly like this:
            # ConnectionError
            # <timer at 0x36c8c80 callback=<bound method Greenlet.throw of <Greenlet at 0xc844050>> args=(<class 'requests.exceptions.ConnectionError'>,)> failed with ConnectionError
            # Traceback (most recent call last):
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/greenlet.py", line 327, in run
            #     result = self._run(*self.args, **self.kwargs)
            #   File "61.py", line 147, in GenerateUrl
            #     redis_db.sadd('error_url_list', sourcejob)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 1248, in sadd
            #     return self.execute_command('SADD', name, *values)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 461, in execute_command
            #     return self.parse_response(connection, command_name, **options)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 471, in parse_response
            #     response = connection.read_response()
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 339, in read_response
            #     response = self._parser.read_response()
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 110, in read_response
            #     response = self.read()
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 103, in read
            #     return self._fp.readline()[:-2]
            #   File "/usr/local/lib/python2.7/socket.py", line 447, in readline
            #     data = self._sock.recv(self._rbufsize)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/socket.py", line 392, in recv
            #     self._wait(self._read_event)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/socket.py", line 298, in _wait
            #     self.hub.wait(watcher)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 341, in wait
            #     result = waiter.get()
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 568, in get
            #     return self.hub.switch()
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 331, in switch
            #     return greenlet.switch(self)
if __name__ == '__main__':
    st = time.time()
    time.sleep(5)
    WYUrlGenerator()
    et = time.time()
    print "**************end****************,the spend time is %0.2f" % (et - st)