import logging
import random
import re
from urllib.parse import urljoin, urlsplit

import pymongo
from scrapy import Request, signals
from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.utils.response import response_status_message
from twisted.internet import task
from twisted.internet.error import (ConnectBindError, ConnectError,
                                    ConnectionRefusedError, TCPTimedOutError,
                                    TimeoutError)
from twisted.web._newclient import ResponseNeverReceived
from w3lib.url import safe_url_string

# `Agent` is a proxy-scoring helper from a local module not included in this
# gist; a minimal sketch of its assumed interface is given after the code.
from .agent import Agent

logger = logging.getLogger(__name__)


class ProxyMiddleware(object):
    """Customized proxy middleware.

    Whether a request succeeds or fails, a different proxy is picked for
    every request.
    """
    # Switch to another proxy instead of passing the request on to the
    # RetryMiddleware when one of these errors occurs.
    DONT_RETRY_ERRORS = (TimeoutError, ConnectionRefusedError, TCPTimedOutError,
                         ResponseNeverReceived, ConnectError, ConnectBindError,
                         TunnelError)

    # Class-level pool, shared by all instances of the middleware.
    agent_list = []

    def __init__(self, settings):
        config = settings.get('CONFIG')
        if config is None or 'mongo_proxy' not in config:
            raise NotConfigured("Missing 'mongo_proxy' section in the CONFIG setting")
        try:
            self.client = pymongo.MongoClient(config['mongo_proxy']['hosts'])
        except pymongo.errors.PyMongoError:
            logger.error("Wrong configuration, please check the database config file")
            raise
        self.db = self.client[config['mongo_proxy']['database']]
        self.mongo_collection = self.db[config['mongo_proxy']['collection']]
        self.max_pool_size = settings.get('MAX_POOL_SIZE', 300)
        self.min_pool_size = settings.get('MIN_POOL_SIZE', 10)
        self.pool_waiting = settings.get('POOL_WAITING', 30)
        self.better_percent = settings.get('BETTER_PERCENT', 0.50)
        self.better_max_count = settings.get('BETTER_MAX_COUNT', 120)
        self.maintaining_interval = settings.get('MAINTAIN_INTERVAL', 360)
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        o = cls(settings)
        o.crawler = crawler
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o
    def readProxyfile(self):
        logger.debug("Starting to fetch fresh proxies")
        # The proxy URL is stored in each document's _id field; purchased
        # proxies start out with a much higher score than free ones.
        for document in self.mongo_collection.find():
            if document.get('label') == "purchased":
                agent = Agent(document['_id'], success=1000, percentage=1)
            else:
                agent = Agent(document['_id'])
            if agent not in self.agent_list:
                self.agent_list.append(agent)
    def maintaining_agent(self):
        """Periodic maintenance: drop dead agents, trim an oversized pool,
        and top the pool back up with fresh proxies from MongoDB.
        """
        # Remove invalid agents.
        self.agent_list = list(filter(lambda x: x.is_valid(), self.agent_list))
        # Keep the pool from growing without bound: when it exceeds the
        # maximum size, keep only the best-performing agents.
        max_proxy_size = self.max_pool_size
        if len(self.agent_list) > max_proxy_size:
            logger.debug("Proxy list is too big, cutting off the low-scoring part")
            sortedagentlist = sorted(self.agent_list, key=lambda i: i.percentage)
            self.agent_list = sortedagentlist[len(self.agent_list) - max_proxy_size:]
        # Add more proxies into the pool.
        self.readProxyfile()
    def get_proxy_slot(self, proxy):
        """Return the downloader slot for a proxy.

        By default the port is not taken into account, i.e. all proxies with
        the same hostname / IP address share the same slot.
        """
        return urlsplit(proxy).hostname
    def process_request(self, request, spider):
        """Attach a randomly chosen agent (and its proxy) to the request."""
        if spider.name == "ZillowSpider" and "zillow" not in request.url[:22]:
            raise IgnoreRequest
        if request.url == "https://www.zillow.com:443/homes/":
            raise IgnoreRequest
        request.meta['agent'] = random.choice(
            list(filter(lambda x: x.is_valid(), self.agent_list)))
        request.meta['proxy'] = request.meta['agent'].proxy
        request.meta['download_slot'] = self.get_proxy_slot(request.meta['proxy'])
        logger.debug("Request %(request)s using proxy:%(proxy)s",
                     {'request': request, 'proxy': request.meta['proxy']})
    def _new_request_from_response(self, request):
        new_request = request.copy()
        new_request.dont_filter = True
        return new_request
    def process_exception(self, request, exception, spider):
        """Penalize the agent on connection errors and, for the errors in
        DONT_RETRY_ERRORS, retry the request with a new proxy.
        """
        agent = request.meta.get('agent')
        if agent is None:
            return
        # Any exception costs the agent three points.
        for i in range(3):
            agent.weaken()
        if isinstance(exception, self.DONT_RETRY_ERRORS):
            logger.debug("Normal exception happened with proxy:{} while processing {}".format(
                request.meta['proxy'], request.url))
            agent.weaken()
            return self._new_request_from_response(request)
    def spider_opened(self, spider):
        self.task = task.LoopingCall(self.maintaining_agent)
        # Run pool maintenance every `maintaining_interval` seconds (360 by default).
        self.task.start(self.maintaining_interval)

    def spider_closed(self, spider, reason):
        if self.task and self.task.running:
            self.task.stop()
        self.client.close()
    def process_response(self, request, response, spider):
        agent = request.meta.get('agent')
        reason = response_status_message(response.status)
        # Dispatch to the site-specific response handler.
        proxy_func = self.mapping_proxy(spider)
        if proxy_func:
            return proxy_func(request, response, agent, reason)
        # No handler registered for this spider: pass the response through.
        return response

    def mapping_proxy(self, spider):
        # airbnb_proxy and craig_proxy are expected to be defined analogously
        # to zillow_proxy below; they are not included in this gist.
        if spider.name == 'AirbnbSpider':
            return self.airbnb_proxy
        elif spider.name == 'ZillowSpider':
            return self.zillow_proxy
        elif spider.name == 'CraigSpider':
            return self.craig_proxy
    def zillow_proxy(self, request, response, agent, reason):
        """Check the response status and other validation info to decide
        whether to change the proxy or not.
        """
        if "zillow.service.search.SearchURL" in response.url:
            raise IgnoreRequest
        if response.status == 200:
            if response.body:  # sometimes an empty page is returned with a 200 code
                logger.debug("Good proxy:{} for processing {}".format(
                    request.meta['proxy'], response))
                if "AuthRequired" in response.url:
                    logger.debug("Status 200 on an AuthRequired page, so we ignore it:{}".format(request))
                    raise IgnoreRequest
                elif "captcha" in response.url:
                    logger.debug("Proxy:{} met a captcha while processing {}".format(
                        request.meta['proxy'], response))
                    agent.weaken()
                    if "zpid" in response.url:
                        # Recover the original listing URL from the captcha
                        # page's `url=` query parameter and re-request it.
                        url_pa = re.search('url=(.+zpid)', response.url)
                        if url_pa:
                            url = "https://www.zillow.com" + url_pa.group(1).replace('%2f', '/')
                            return Request(url=url, dont_filter=True,
                                           callback=self.crawler.spider.parse_apt_or_building)
                        else:
                            raise IgnoreRequest
                else:
                    agent.stronger()
            else:
                logger.debug("Fake proxy:{} for processing {}".format(
                    request.meta['proxy'], response))
                agent.weaken()
                return self._new_request_from_response(request)
            return response
        elif response.status in [302, 307]:
            # e.g. Redirecting (302) to <GET https://www.zillow.com/captcha/?dest=...>
            # from <GET https://www.zillow.com/...>
            if response.headers.get('location'):
                location = safe_url_string(response.headers['location'])
                redirected_url = urljoin(request.url, location)
                if b'captcha' in response.headers[b'Location']:
                    logger.debug("Redirecting (302) to a captcha URL, so we make a new request for:{}".format(request))
                    for k in range(2):
                        agent.weaken()
                    return self._new_request_from_response(request)
                if b"AuthRequired" in response.headers[b'Location']:
                    logger.debug("Redirecting (302) to an AuthRequired page, so we ignore it:{}".format(request))
                    agent.stronger()
                    raise IgnoreRequest
            return response
        elif response.status == 403:
            if 'zillow.com' in response.url:
                agent.set_invalid()
                logger.info("Proxy: {} met {}".format(agent.proxy, reason))
                return self._new_request_from_response(request)
            else:
                raise IgnoreRequest
        elif response.status == 404:
            if "alogin" in response.url:
                raise IgnoreRequest
        return response
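
The middleware depends on an `Agent` class that this gist does not include. Below is a minimal sketch of the interface it would need, inferred from the calls made above (`is_valid`, `weaken`, `stronger`, `set_invalid`, the `proxy` and `percentage` attributes, and membership tests against `agent_list`); the scoring scheme and defaults here are assumptions, not the author's implementation.

class Agent:
    """Hypothetical proxy-scoring helper matching the interface used above."""

    def __init__(self, proxy, success=10, percentage=0.5):
        self.proxy = proxy            # proxy URL, e.g. "http://1.2.3.4:8080"
        self.success = success        # remaining credit for this proxy
        self.percentage = percentage  # rough success ratio, used for sorting
        self._valid = True

    def weaken(self):
        # Each failure burns one credit; exhausted agents become invalid
        # and get dropped by maintaining_agent().
        self.success -= 1
        if self.success <= 0:
            self._valid = False

    def stronger(self):
        self.success += 1

    def set_invalid(self):
        self._valid = False

    def is_valid(self):
        return self._valid

    def __eq__(self, other):
        # Compare by proxy URL so `agent not in self.agent_list` dedupes
        # the pool in readProxyfile().
        return isinstance(other, Agent) and self.proxy == other.proxy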
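
To wire the middleware into a Scrapy project, something like the following in settings.py should work; the module path and MongoDB connection details are placeholders for illustration, not values from the gist.

# settings.py -- enabling the middleware (illustrative).
DOWNLOADER_MIDDLEWARES = {
    # Module path is a placeholder; adjust to wherever ProxyMiddleware lives.
    "myproject.middlewares.ProxyMiddleware": 543,
}

# Consumed in ProxyMiddleware.__init__ via settings.get('CONFIG').
CONFIG = {
    "mongo_proxy": {
        "hosts": "mongodb://localhost:27017",
        "database": "proxies",
        "collection": "agents",
    },
}

MAX_POOL_SIZE = 300       # trim the pool above this size
MAINTAIN_INTERVAL = 360   # seconds between pool-maintenance runs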