import logging
import random
import re
from urllib.parse import urljoin, urlsplit

import pymongo
from scrapy import Request, signals
from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.utils.response import response_status_message
from twisted.internet import task
from twisted.internet.error import (ConnectBindError, ConnectError,
                                    ConnectionRefusedError, TCPTimedOutError,
                                    TimeoutError)
from twisted.web._newclient import ResponseNeverReceived
from w3lib.url import safe_url_string

# `Agent` is a proxy-scoring helper from a local module not included in this
# gist; a minimal sketch of its assumed interface is given after the code.
from .agent import Agent

logger = logging.getLogger(__name__)


class ProxyMiddleware(object):
    """Customized proxy middleware.

    Whether a request succeeds or fails, a different proxy is picked for
    every request.
    """
    # Switch to another proxy instead of passing the request on to the
    # RetryMiddleware when one of these errors occurs.
    DONT_RETRY_ERRORS = (TimeoutError, ConnectionRefusedError, TCPTimedOutError,
                         ResponseNeverReceived, ConnectError, ConnectBindError,
                         TunnelError)

    # Class-level pool, shared by all instances of the middleware.
    agent_list = []

    def __init__(self, settings):
        config = settings.get('CONFIG')
        if config is None or 'mongo_proxy' not in config:
            raise NotConfigured("Missing 'mongo_proxy' section in the CONFIG setting")
        try:
            self.client = pymongo.MongoClient(config['mongo_proxy']['hosts'])
        except pymongo.errors.PyMongoError:
            logger.error("Wrong configuration, please check the database config file")
            raise
        self.db = self.client[config['mongo_proxy']['database']]
        self.mongo_collection = self.db[config['mongo_proxy']['collection']]
        self.max_pool_size = settings.get('MAX_POOL_SIZE', 300)
        self.min_pool_size = settings.get('MIN_POOL_SIZE', 10)
        self.pool_waiting = settings.get('POOL_WAITING', 30)
        self.better_percent = settings.get('BETTER_PERCENT', 0.50)
        self.better_max_count = settings.get('BETTER_MAX_COUNT', 120)
        self.maintaining_interval = settings.get('MAINTAIN_INTERVAL', 360)
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        o = cls(settings)
        o.crawler = crawler
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o
    def readProxyfile(self):
        logger.debug("Starting to fetch fresh proxies")
        # The proxy URL is stored in each document's _id field; purchased
        # proxies start out with a much higher score than free ones.
        for document in self.mongo_collection.find():
            if document.get('label') == "purchased":
                agent = Agent(document['_id'], success=1000, percentage=1)
            else:
                agent = Agent(document['_id'])
            if agent not in self.agent_list:
                self.agent_list.append(agent)
    def maintaining_agent(self):
        """Periodic maintenance: drop dead agents, trim an oversized pool,
        and top the pool back up with fresh proxies from MongoDB.
        """
        # Remove invalid agents.
        self.agent_list = list(filter(lambda x: x.is_valid(), self.agent_list))
        # Keep the pool from growing without bound: when it exceeds the
        # maximum size, keep only the best-performing agents.
        max_proxy_size = self.max_pool_size
        if len(self.agent_list) > max_proxy_size:
            logger.debug("Proxy list is too big, cutting off the low-scoring part")
            sortedagentlist = sorted(self.agent_list, key=lambda i: i.percentage)
            self.agent_list = sortedagentlist[len(self.agent_list) - max_proxy_size:]
        # Add more proxies into the pool.
        self.readProxyfile()
    def get_proxy_slot(self, proxy):
        """Return the downloader slot for a proxy.

        By default the port is not taken into account, i.e. all proxies with
        the same hostname / IP address share the same slot.
        """
        return urlsplit(proxy).hostname
    def process_request(self, request, spider):
        """Attach a randomly chosen agent (and its proxy) to the request."""
        if spider.name == "ZillowSpider" and "zillow" not in request.url[:22]:
            raise IgnoreRequest
        if request.url == "https://www.zillow.com:443/homes/":
            raise IgnoreRequest
        request.meta['agent'] = random.choice(
            list(filter(lambda x: x.is_valid(), self.agent_list)))
        request.meta['proxy'] = request.meta['agent'].proxy
        request.meta['download_slot'] = self.get_proxy_slot(request.meta['proxy'])
        logger.debug("Request %(request)s using proxy:%(proxy)s",
                     {'request': request, 'proxy': request.meta['proxy']})
    def _new_request_from_response(self, request):
        new_request = request.copy()
        new_request.dont_filter = True
        return new_request
    def process_exception(self, request, exception, spider):
        """Penalize the agent on connection errors and, for the errors in
        DONT_RETRY_ERRORS, retry the request with a new proxy.
        """
        agent = request.meta.get('agent')
        if agent is None:
            return
        # Any exception costs the agent three points.
        for i in range(3):
            agent.weaken()
        if isinstance(exception, self.DONT_RETRY_ERRORS):
            logger.debug("Normal exception happened with proxy:{} while processing {}".format(
                request.meta['proxy'], request.url))
            agent.weaken()
            return self._new_request_from_response(request)
    def spider_opened(self, spider):
        self.task = task.LoopingCall(self.maintaining_agent)
        # Run pool maintenance every `maintaining_interval` seconds (360 by default).
        self.task.start(self.maintaining_interval)

    def spider_closed(self, spider, reason):
        if self.task and self.task.running:
            self.task.stop()
        self.client.close()
    def process_response(self, request, response, spider):
        agent = request.meta.get('agent')
        reason = response_status_message(response.status)
        # Dispatch to the site-specific response handler.
        proxy_func = self.mapping_proxy(spider)
        if proxy_func:
            return proxy_func(request, response, agent, reason)
        # No handler registered for this spider: pass the response through.
        return response

    def mapping_proxy(self, spider):
        # airbnb_proxy and craig_proxy are expected to be defined analogously
        # to zillow_proxy below; they are not included in this gist.
        if spider.name == 'AirbnbSpider':
            return self.airbnb_proxy
        elif spider.name == 'ZillowSpider':
            return self.zillow_proxy
        elif spider.name == 'CraigSpider':
            return self.craig_proxy
    def zillow_proxy(self, request, response, agent, reason):
        """Check the response status and other validation info to decide
        whether to change the proxy or not.
        """
        if "zillow.service.search.SearchURL" in response.url:
            raise IgnoreRequest
        if response.status == 200:
            if response.body:  # sometimes an empty page is returned with a 200 code
                logger.debug("Good proxy:{} for processing {}".format(
                    request.meta['proxy'], response))
                if "AuthRequired" in response.url:
                    logger.debug("Status 200 on an AuthRequired page, so we ignore it:{}".format(request))
                    raise IgnoreRequest
                elif "captcha" in response.url:
                    logger.debug("Proxy:{} met a captcha while processing {}".format(
                        request.meta['proxy'], response))
                    agent.weaken()
                    if "zpid" in response.url:
                        # Recover the original listing URL from the captcha
                        # page's `url=` query parameter and re-request it.
                        url_pa = re.search('url=(.+zpid)', response.url)
                        if url_pa:
                            url = "https://www.zillow.com" + url_pa.group(1).replace('%2f', '/')
                            return Request(url=url, dont_filter=True,
                                           callback=self.crawler.spider.parse_apt_or_building)
                        else:
                            raise IgnoreRequest
                else:
                    agent.stronger()
            else:
                logger.debug("Fake proxy:{} for processing {}".format(
                    request.meta['proxy'], response))
                agent.weaken()
                return self._new_request_from_response(request)
            return response
        elif response.status in [302, 307]:
            # e.g. Redirecting (302) to <GET https://www.zillow.com/captcha/?dest=...>
            # from <GET https://www.zillow.com/...>
            if response.headers.get('location'):
                location = safe_url_string(response.headers['location'])
                redirected_url = urljoin(request.url, location)
                if b'captcha' in response.headers[b'Location']:
                    logger.debug("Redirecting (302) to a captcha URL, so we make a new request for:{}".format(request))
                    for k in range(2):
                        agent.weaken()
                    return self._new_request_from_response(request)
                if b"AuthRequired" in response.headers[b'Location']:
                    logger.debug("Redirecting (302) to an AuthRequired page, so we ignore it:{}".format(request))
                    agent.stronger()
                    raise IgnoreRequest
            return response
        elif response.status == 403:
            if 'zillow.com' in response.url:
                agent.set_invalid()
                logger.info("Proxy: {} met {}".format(agent.proxy, reason))
                return self._new_request_from_response(request)
            else:
                raise IgnoreRequest
        elif response.status == 404:
            if "alogin" in response.url:
                raise IgnoreRequest
        return response
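
The middleware depends on an `Agent` class that this gist does not include. Below is a minimal sketch of the interface it would need, inferred from the calls made above (`is_valid`, `weaken`, `stronger`, `set_invalid`, the `proxy` and `percentage` attributes, and membership tests against `agent_list`); the scoring scheme and defaults here are assumptions, not the author's implementation.

class Agent:
    """Hypothetical proxy-scoring helper matching the interface used above."""

    def __init__(self, proxy, success=10, percentage=0.5):
        self.proxy = proxy            # proxy URL, e.g. "http://1.2.3.4:8080"
        self.success = success        # remaining credit for this proxy
        self.percentage = percentage  # rough success ratio, used for sorting
        self._valid = True

    def weaken(self):
        # Each failure burns one credit; exhausted agents become invalid
        # and get dropped by maintaining_agent().
        self.success -= 1
        if self.success <= 0:
            self._valid = False

    def stronger(self):
        self.success += 1

    def set_invalid(self):
        self._valid = False

    def is_valid(self):
        return self._valid

    def __eq__(self, other):
        # Compare by proxy URL so `agent not in self.agent_list` dedupes
        # the pool in readProxyfile().
        return isinstance(other, Agent) and self.proxy == other.proxy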
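
To wire the middleware into a Scrapy project, something like the following in settings.py should work; the module path and MongoDB connection details are placeholders for illustration, not values from the gist.

# settings.py -- enabling the middleware (illustrative).
DOWNLOADER_MIDDLEWARES = {
    # Module path is a placeholder; adjust to wherever ProxyMiddleware lives.
    "myproject.middlewares.ProxyMiddleware": 543,
}

# Consumed in ProxyMiddleware.__init__ via settings.get('CONFIG').
CONFIG = {
    "mongo_proxy": {
        "hosts": "mongodb://localhost:27017",
        "database": "proxies",
        "collection": "agents",
    },
}

MAX_POOL_SIZE = 300       # trim the pool above this size
MAINTAIN_INTERVAL = 360   # seconds between pool-maintenance runs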