Created
July 25, 2016 09:31
-
-
Save pradeepbn/98c73d99b2f0c0dab845567bba3c7b97 to your computer and use it in GitHub Desktop.
Scrapy settings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrapy settings for the BusinessContacts project.
#
# Only the settings that differ from Scrapy's defaults are active; the
# commented-out assignments are kept as a quick reference for tuning.

BOT_NAME = 'BusinessContacts'

SPIDER_MODULES = ['BusinessContacts.spiders']
NEWSPIDER_MODULE = 'BusinessContacts.spiders'

# Crawl responsibly by identifying yourself (and your website) on the
# user-agent.
#USER_AGENT = 'BusinessContacts (+http://www.yourdomain.com)'

# Obey robots.txt rules.
ROBOTSTXT_OBEY = True

# Maximum concurrent requests performed by Scrapy (default: 16).
#CONCURRENT_REQUESTS = 1

# Delay between requests to the same website (default: 0).
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also the autothrottle settings and docs.
DOWNLOAD_DELAY = 4

# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 64
#CONCURRENT_REQUESTS_PER_IP = 16

#CONCURRENT_ITEMS = 400

RETRY_ENABLED = True

# Cookies are enabled by default; set to False to disable them.
COOKIES_ENABLED = True

# Pool of browser user-agent strings. A more comprehensive list can be found
# at http://techpatterns.com/forums/about304.html
# NOTE(review): USER_AGENT_LIST is not a built-in Scrapy setting; presumably a
# project-specific middleware reads it -- confirm against the middleware code.
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10',
]

# Disable Telnet Console (enabled by default).
#TELNETCONSOLE_ENABLED = False

# Override the default request headers. Only the Crawlera retry limit is
# active; the remaining entries are left as commented-out examples.
DEFAULT_REQUEST_HEADERS = {
    #'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #'Accept-Language': 'en',
    #'X-Crawlera-UA' : 'desktop',
    'X-Crawlera-Max-Retries': 2,
    #'X-Crawlera-Debug': 'ua',
    #'X-Crawlera-JobId' : 999,
}

# Enable or disable spider middlewares.
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Enable or disable downloader middlewares.
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#HTTP_PROXY = 'http://127.0.0.1:8123'
# NOTE(review): the scrapy-splash README additionally re-prioritises
# 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware'
# to 810; confirm whether leaving it at its default here is intentional.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawlera.CrawleraMiddleware': 300,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
}

# scrapy-splash integration: Splash endpoint plus the Splash-aware duplicate
# filter and HTTP cache storage classes.
SPLASH_URL = 'http://localhost:8050/'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

# scrapy-crawlera integration. '<API_KEY>' is a placeholder; supply the real
# key outside of version control.
CRAWLERA_ENABLED = True
CRAWLERA_APIKEY = '<API_KEY>'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment