Skip to content

Instantly share code, notes, and snippets.

@c0ldlimit
Forked from pkmishra/middlewares.py
Created July 11, 2013 03:45
Show Gist options
  • Save c0ldlimit/5972387 to your computer and use it in GitHub Desktop.
Save c0ldlimit/5972387 to your computer and use it in GitHub Desktop.
import os
import random
from scrapy.conf import settings
class RandomUserAgentMiddleware(object):
def process_request(self, request, spider):
ua = random.choice(settings.get('USER_AGENT_LIST'))
if ua:
request.headers.setdefault('User-Agent', ua)
class ProxyMiddleware(object):
def process_request(self, request, spider):
request.meta['proxy'] = settings.get('HTTP_PROXY')
# More comprehensive list can be found at
# http://techpatterns.com/forums/about304.html
USER_AGENT_LIST = [
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10'
]
DOWNLOADER_MIDDLEWARES = {
'myproject.middlewares.RandomUserAgentMiddleware': 400,
'myproject.middlewares.ProxyMiddleware': 410,
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
# Disable compression middleware, so the actual HTML pages are cached
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment