-
-
Save renshuki/df14d2ad6e8eacd404641abc4a3b77eb to your computer and use it in GitHub Desktop.
An example of a RotateUserAgentMiddleware for Scrapy, with a sample spider and the settings that enable it
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from random import choice | |
from scrapy import signals | |
from scrapy.exceptions import NotConfigured | |
class RotateUserAgentMiddleware(object):
    """Downloader middleware that sets a random User-Agent on each request.

    The pool of candidate strings comes from the ``USER_AGENT_CHOICES``
    setting. Rotation is off by default and is switched on per spider through
    a truthy ``rotate_user_agent`` attribute, read when the spider opens.
    """

    def __init__(self, user_agents):
        """Store the pool of User-Agent strings.

        Args:
            user_agents: An iterable of User-Agent strings. A single string is
                treated as a one-element pool, and ``None`` as an empty pool.
        """
        self.enabled = False
        if user_agents is None:
            user_agents = ()
        elif isinstance(user_agents, str):
            # A bare string is itself a sequence: without this guard,
            # choice() would pick one *character* as the User-Agent.
            user_agents = (user_agents,)
        # Copy into a list so choice() can index the pool even when the
        # caller passed a set, tuple or generator.
        self.user_agents = list(user_agents)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's settings.

        Args:
            crawler: The crawler whose ``settings`` and ``signals`` are used.

        Returns:
            A middleware instance connected to the ``spider_opened`` signal.

        Raises:
            NotConfigured: If ``USER_AGENT_CHOICES`` is missing or empty.
        """
        user_agents = crawler.settings.get('USER_AGENT_CHOICES', [])
        if not user_agents:
            raise NotConfigured("USER_AGENT_CHOICES not set or empty")
        o = cls(user_agents)
        # The per-spider opt-in flag can only be read once a spider exists.
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        """Enable or disable rotation from the spider's ``rotate_user_agent``.

        The current value of ``self.enabled`` is kept when the spider does not
        define the attribute.
        """
        self.enabled = getattr(spider, 'rotate_user_agent', self.enabled)

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent header with a random choice.

        Does nothing when rotation is disabled or the pool is empty. Always
        returns ``None``.
        """
        if not self.enabled or not self.user_agents:
            return
        request.headers['user-agent'] = choice(self.user_agents)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
try:
    from scrapy.spiders import Spider
except ImportError:  # Scrapy < 1.0 only provides the old module path
    from scrapy.spider import Spider


class ProjectSpider(Spider):
    """Example spider that opts in to User-Agent rotation."""

    name = 'project-website.com'
    # Read by RotateUserAgentMiddleware.spider_opened to switch rotation on
    # for this spider.
    rotate_user_agent = True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Register the rotating User-Agent middleware under the order value 110.
DOWNLOADER_MIDDLEWARES = {
    'project.middlewares.RotateUserAgentMiddleware': 110,
}
# Pool of User-Agent strings that RotateUserAgentMiddleware picks from at
# random; the middleware raises NotConfigured when this is missing or empty.
USER_AGENT_CHOICES = [
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20140205 Firefox/24.0 Iceweasel/24.3.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:28.0) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment