pradeepbn · July 25, 2016 09:31
diff --git a/settings.py b/settings.py
 # Scrapy settings for the BusinessContacts project.
 #
 # Enables the scrapy-splash spider/downloader middlewares together with the
 # scrapy-crawlera downloader middleware.
 BOT_NAME = 'BusinessContacts'

 SPIDER_MODULES = ['BusinessContacts.spiders']
 NEWSPIDER_MODULE = 'BusinessContacts.spiders'


 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'BusinessContacts (+http://www.yourdomain.com)'

 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 1

 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
 DOWNLOAD_DELAY = 4
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 64
 #CONCURRENT_REQUESTS_PER_IP = 16
 #CONCURRENT_ITEMS = 400

 RETRY_ENABLED = True

 # Cookies are enabled by default; set this to False to disable them.
 COOKIES_ENABLED = True

 # Candidate User-Agent strings. Nothing in this settings module reads the
 # list; presumably a custom middleware rotates through it -- TODO confirm.
 # A more comprehensive list can be found at
 # http://techpatterns.com/forums/about304.html
 USER_AGENT_LIST = [
     'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
     'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10',
 ]

 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False

 # Override the default request headers:
 DEFAULT_REQUEST_HEADERS = {
     #'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     #'Accept-Language': 'en',
     #'X-Crawlera-UA': 'desktop',
     'X-Crawlera-Max-Retries': 2,
     #'X-Crawlera-Debug': 'ua',
     #'X-Crawlera-JobId': 999,
 }

 # Enable or disable spider middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 SPIDER_MIDDLEWARES = {
     'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
 }

 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 #HTTP_PROXY = 'http://127.0.0.1:8123'
 DOWNLOADER_MIDDLEWARES = {
     # NOTE(review): later scrapy-crawlera documentation appears to recommend
     # priority 610 for this middleware -- confirm before changing it.
     'scrapy_crawlera.CrawleraMiddleware': 300,
     'scrapy_splash.SplashCookiesMiddleware': 723,
     'scrapy_splash.SplashMiddleware': 725,
     # The scrapy-splash setup instructions also move HttpCompressionMiddleware
     # to 810; without this override it stays at its default priority.
     'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
 }

 # scrapy-splash integration: Splash endpoint plus the Splash-aware
 # duplicate filter and HTTP cache storage classes.
 SPLASH_URL = 'http://localhost:8050/'
 DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
 HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

 # scrapy-crawlera integration. '<API_KEY>' is a placeholder, not a real key.
 CRAWLERA_ENABLED = True
 CRAWLERA_APIKEY = '<API_KEY>'
	# Scrapy settings for the BusinessContacts project.
	#
	# Enables the scrapy-splash spider/downloader middlewares together with the
	# scrapy-crawlera downloader middleware.
	BOT_NAME = 'BusinessContacts'

	SPIDER_MODULES = ['BusinessContacts.spiders']
	NEWSPIDER_MODULE = 'BusinessContacts.spiders'


	# Crawl responsibly by identifying yourself (and your website) on the user-agent
	#USER_AGENT = 'BusinessContacts (+http://www.yourdomain.com)'

	# Obey robots.txt rules
	ROBOTSTXT_OBEY = True

	# Configure maximum concurrent requests performed by Scrapy (default: 16)
	#CONCURRENT_REQUESTS = 1

	# Configure a delay for requests for the same website (default: 0)
	# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
	# See also autothrottle settings and docs
	DOWNLOAD_DELAY = 4
	# The download delay setting will honor only one of:
	#CONCURRENT_REQUESTS_PER_DOMAIN = 64
	#CONCURRENT_REQUESTS_PER_IP = 16
	#CONCURRENT_ITEMS = 400

	RETRY_ENABLED = True

	# Cookies are enabled by default; set this to False to disable them.
	COOKIES_ENABLED = True

	# Candidate User-Agent strings. Nothing in this settings module reads the
	# list; presumably a custom middleware rotates through it -- TODO confirm.
	# A more comprehensive list can be found at
	# http://techpatterns.com/forums/about304.html
	USER_AGENT_LIST = [
	    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
	    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
	    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10',
	]

	# Disable Telnet Console (enabled by default)
	#TELNETCONSOLE_ENABLED = False

	# Override the default request headers:
	DEFAULT_REQUEST_HEADERS = {
	    #'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	    #'Accept-Language': 'en',
	    #'X-Crawlera-UA': 'desktop',
	    'X-Crawlera-Max-Retries': 2,
	    #'X-Crawlera-Debug': 'ua',
	    #'X-Crawlera-JobId': 999,
	}

	# Enable or disable spider middlewares
	# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
	SPIDER_MIDDLEWARES = {
	    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
	}

	# Enable or disable downloader middlewares
	# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
	#HTTP_PROXY = 'http://127.0.0.1:8123'
	DOWNLOADER_MIDDLEWARES = {
	    # NOTE(review): later scrapy-crawlera documentation appears to recommend
	    # priority 610 for this middleware -- confirm before changing it.
	    'scrapy_crawlera.CrawleraMiddleware': 300,
	    'scrapy_splash.SplashCookiesMiddleware': 723,
	    'scrapy_splash.SplashMiddleware': 725,
	    # The scrapy-splash setup instructions also move HttpCompressionMiddleware
	    # to 810; without this override it stays at its default priority.
	    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
	}

	# scrapy-splash integration: Splash endpoint plus the Splash-aware
	# duplicate filter and HTTP cache storage classes.
	SPLASH_URL = 'http://localhost:8050/'
	DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
	HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

	# scrapy-crawlera integration. '<API_KEY>' is a placeholder, not a real key.
	CRAWLERA_ENABLED = True
	CRAWLERA_APIKEY = '<API_KEY>'
No results found