-
-
Save pkmishra/5193452 to your computer and use it in GitHub Desktop.
| import os | |
| import random | |
| from scrapy.conf import settings | |
class RandomUserAgentMiddleware(object):
    """Downloader middleware: set a random User-Agent header on each request.

    Picks from the USER_AGENT_LIST setting; leaves the request untouched
    when that setting is absent.
    """

    def process_request(self, request, spider):
        # BUG FIX: settings.get returns None when USER_AGENT_LIST is unset,
        # and random.choice(None) raises TypeError before the old `if ua:`
        # guard could run — so check the list itself first.
        ua_list = settings.get('USER_AGENT_LIST')
        if ua_list:
            # setdefault keeps any User-Agent a spider set explicitly.
            request.headers.setdefault('User-Agent', random.choice(ua_list))
class ProxyMiddleware(object):
    """Downloader middleware: route requests through the HTTP_PROXY setting."""

    def process_request(self, request, spider):
        # BUG FIX: the original unconditionally assigned the setting, writing
        # request.meta['proxy'] = None when HTTP_PROXY is unset, which breaks
        # the download handler. Only set the proxy when one is configured.
        proxy = settings.get('HTTP_PROXY')
        if proxy:
            request.meta['proxy'] = proxy
# More comprehensive list can be found at
# http://techpatterns.com/forums/about304.html
# Pool of User-Agent strings that RandomUserAgentMiddleware picks from.
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10'
]
# Register the custom middlewares; the numbers are priorities ordering them
# within Scrapy's downloader middleware chain.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
    'myproject.middlewares.ProxyMiddleware': 410,
    # None disables Scrapy's built-in UserAgentMiddleware so it cannot
    # overwrite the randomly chosen header.
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    # Disable compression middleware, so the actual HTML pages are cached
    # NOTE(review): no compression-middleware entry follows the comment above —
    # the corresponding line appears to be missing from this snippet; confirm.
}
To anybody trying to implement this using Scrapy 1.0 or later: `settings.get` throws an `AttributeError`. [Much the same error as this.](https://stackoverflow.com/questions/32984597/scrapy-attributeerror-settings-object-has-no-attribute-update-settings)
A workaround can be achieved by using the following code in your middleware.py. (Will add this in a public repo soon. Apologies for terrible code format.)
`
import os
import random
from scrapy.settings import Settings
class RandomUserAgentMiddleware(object):
    """Set a random User-Agent (from the USER_AGENT_LIST setting) on each request.

    Scrapy >= 1.0 compatible: settings are read from the crawler passed in
    via ``from_crawler`` instead of the removed ``scrapy.conf.settings``.
    """

    # BUG FIX: the posted code had `def init` — markdown stripped the double
    # underscores, so Python never called it and process_request crashed with
    # AttributeError. It must be `__init__`.
    def __init__(self, crawler):
        self._user_agent_list = crawler.settings.get('USER_AGENT_LIST')
        self._http_proxy = crawler.settings.get('HTTP_PROXY')
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy instantiates middleware through this hook, passing the crawler.
        return cls(crawler)

    def process_request(self, request, spider):
        # Guard first: random.choice raises on None/empty, so only pick
        # when a non-empty list was configured.
        if self._user_agent_list:
            ua = random.choice(self._user_agent_list)
            # setdefault keeps any User-Agent already set on the request.
            request.headers.setdefault('User-Agent', ua)
class ProxyMiddleware(object):
    """Route every request through the proxy named by the HTTP_PROXY setting.

    Scrapy >= 1.0 compatible: reads settings from the crawler supplied via
    ``from_crawler``.
    """

    # BUG FIX: the posted code had `def init` — markdown stripped the double
    # underscores; it must be `__init__` or the attributes are never set.
    # Also dropped the copy-pasted USER_AGENT_LIST fetch: this middleware
    # never used it.
    def __init__(self, crawler):
        self._http_proxy = crawler.settings.get('HTTP_PROXY')
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy instantiates middleware through this hook, passing the crawler.
        return cls(crawler)

    def process_request(self, request, spider):
        # Only set the proxy when one is actually configured; assigning None
        # to request.meta['proxy'] would break the download handler.
        if self._http_proxy:
            request.meta['proxy'] = self._http_proxy
Hello,
Can i see what you have in
from scrapy.conf import settings? Thank you in advance.
Edit: Ignore this, my bad