Last active
May 31, 2016 20:15
-
-
Save khellan/b7b337ee00816ea0e9b8740783d7b40a to your computer and use it in GitHub Desktop.
Frontera scrapy fetch error
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2016-05-31 21:08:31 [scrapy] INFO: Scrapy 1.1.0 started (bot: cb_crawl) | |
2016-05-31 21:08:31 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'cb_crawl.spiders', 'DOWNLOAD_TIMEOUT': 60, 'ROBOTSTXT_OBEY': True, 'DEPTH_LIMIT': 10, 'CONCURRENT_REQUESTS_PER_DOMAIN': 1, 'CONCURRENT_REQUESTS': 256, 'RETRY_ENABLED': False, 'SPIDER_MODULES': ['cb_crawl.spiders'], 'AUTOTHROTTLE_START_DELAY': 0.25, 'REACTOR_THREADPOOL_MAXSIZE': 20, 'BOT_NAME': 'cb_crawl', 'AJAXCRAWL_ENABLED': True, 'COOKIES_ENABLED': False, 'USER_AGENT': 'cb crawl (+http://www.companybooknetworking.com)', 'SCHEDULER': 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler', 'REDIRECT_ENABLED': False, 'AUTOTHROTTLE_ENABLED': True, 'DOWNLOAD_DELAY': 0.25} | |
2016-05-31 21:08:31 [scrapy] INFO: Enabled extensions: | |
['scrapy.extensions.logstats.LogStats', | |
'scrapy.extensions.telnet.TelnetConsole', | |
'scrapy.extensions.corestats.CoreStats', | |
'scrapy.extensions.throttle.AutoThrottle'] | |
2016-05-31 21:08:31 [scrapy] INFO: Enabled downloader middlewares: | |
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware', | |
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware', | |
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware', | |
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware', | |
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware', | |
'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware', | |
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware', | |
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware', | |
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware', | |
'scrapy.downloadermiddlewares.stats.DownloaderStats', | |
'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware'] | |
2016-05-31 21:08:31 [scrapy] INFO: Enabled spider middlewares: | |
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware', | |
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware', | |
'scrapy.spidermiddlewares.referer.RefererMiddleware', | |
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware', | |
'scrapy.spidermiddlewares.depth.DepthMiddleware', | |
'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware'] | |
2016-05-31 21:08:31 [scrapy] INFO: Enabled item pipelines: | |
[] | |
2016-05-31 21:08:31 [scrapy] INFO: Spider opened | |
2016-05-31 21:08:31 [frontera.contrib.scrapy.schedulers.FronteraScheduler] INFO: Starting frontier | |
2016-05-31 21:08:31 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) | |
2016-05-31 21:08:31 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023 | |
Unhandled Error | |
Traceback (most recent call last): | |
File "/home/khellan/venv/lib/python2.7/site-packages/scrapy/commands/fetch.py", line 62, in run | |
self.crawler_process.start() | |
File "/home/khellan/venv/lib/python2.7/site-packages/scrapy/crawler.py", line 280, in start | |
reactor.run(installSignalHandlers=False) # blocking call | |
File "/home/khellan/venv/lib/python2.7/site-packages/twisted/internet/base.py", line 1194, in run | |
self.mainLoop() | |
File "/home/khellan/venv/lib/python2.7/site-packages/twisted/internet/base.py", line 1203, in mainLoop | |
self.runUntilCurrent() | |
--- <exception caught here> --- | |
File "/home/khellan/venv/lib/python2.7/site-packages/twisted/internet/base.py", line 825, in runUntilCurrent | |
call.func(*call.args, **call.kw) | |
File "/home/khellan/venv/lib/python2.7/site-packages/scrapy/utils/reactor.py", line 41, in __call__ | |
return self._func(*self._a, **self._kw) | |
File "/home/khellan/venv/lib/python2.7/site-packages/scrapy/core/engine.py", line 134, in _next_request | |
self.crawl(request, spider) | |
File "/home/khellan/venv/lib/python2.7/site-packages/scrapy/core/engine.py", line 209, in crawl | |
self.schedule(request, spider) | |
File "/home/khellan/venv/lib/python2.7/site-packages/scrapy/core/engine.py", line 215, in schedule | |
if not self.slot.scheduler.enqueue_request(request): | |
File "/home/khellan/venv/lib/python2.7/site-packages/frontera/contrib/scrapy/schedulers/frontier.py", line 91, in enqueue_request | |
self.frontier.add_seeds([request]) | |
File "/home/khellan/venv/lib/python2.7/site-packages/frontera/utils/managers.py", line 25, in add_seeds | |
frontier_seeds = [self.request_converter.to_frontier(seed) for seed in seeds] | |
File "/home/khellan/venv/lib/python2.7/site-packages/frontera/contrib/scrapy/converters.py", line 22, in to_frontier | |
cb = _find_method(self.spider, cb) | |
File "/home/khellan/venv/lib/python2.7/site-packages/frontera/contrib/scrapy/converters.py", line 104, in _find_method | |
raise ValueError("Function %s is not a method of: %s" % (func, obj)) | |
exceptions.ValueError: Function <function <lambda> at 0x7fb54d80b2a8> is not a method of: <DefaultSpider 'default' at 0x7fb54ce23290> | |
2016-05-31 21:08:31 [twisted] CRITICAL: Unhandled Error | |
Traceback (most recent call last): | |
File "/home/khellan/venv/lib/python2.7/site-packages/scrapy/commands/fetch.py", line 62, in run | |
self.crawler_process.start() | |
File "/home/khellan/venv/lib/python2.7/site-packages/scrapy/crawler.py", line 280, in start | |
reactor.run(installSignalHandlers=False) # blocking call | |
File "/home/khellan/venv/lib/python2.7/site-packages/twisted/internet/base.py", line 1194, in run | |
self.mainLoop() | |
File "/home/khellan/venv/lib/python2.7/site-packages/twisted/internet/base.py", line 1203, in mainLoop | |
self.runUntilCurrent() | |
--- <exception caught here> --- | |
File "/home/khellan/venv/lib/python2.7/site-packages/twisted/internet/base.py", line 825, in runUntilCurrent | |
call.func(*call.args, **call.kw) | |
File "/home/khellan/venv/lib/python2.7/site-packages/scrapy/utils/reactor.py", line 41, in __call__ | |
return self._func(*self._a, **self._kw) | |
File "/home/khellan/venv/lib/python2.7/site-packages/scrapy/core/engine.py", line 134, in _next_request | |
self.crawl(request, spider) | |
File "/home/khellan/venv/lib/python2.7/site-packages/scrapy/core/engine.py", line 209, in crawl | |
self.schedule(request, spider) | |
File "/home/khellan/venv/lib/python2.7/site-packages/scrapy/core/engine.py", line 215, in schedule | |
if not self.slot.scheduler.enqueue_request(request): | |
File "/home/khellan/venv/lib/python2.7/site-packages/frontera/contrib/scrapy/schedulers/frontier.py", line 91, in enqueue_request | |
self.frontier.add_seeds([request]) | |
File "/home/khellan/venv/lib/python2.7/site-packages/frontera/utils/managers.py", line 25, in add_seeds | |
frontier_seeds = [self.request_converter.to_frontier(seed) for seed in seeds] | |
File "/home/khellan/venv/lib/python2.7/site-packages/frontera/contrib/scrapy/converters.py", line 22, in to_frontier | |
cb = _find_method(self.spider, cb) | |
File "/home/khellan/venv/lib/python2.7/site-packages/frontera/contrib/scrapy/converters.py", line 104, in _find_method | |
raise ValueError("Function %s is not a method of: %s" % (func, obj)) | |
exceptions.ValueError: Function <function <lambda> at 0x7fb54d80b2a8> is not a method of: <DefaultSpider 'default' at 0x7fb54ce23290> | |
2016-05-31 21:08:36 [scrapy] INFO: Closing spider (finished) | |
2016-05-31 21:08:36 [frontera.contrib.scrapy.schedulers.FronteraScheduler] INFO: Finishing frontier (finished) | |
2016-05-31 21:08:36 [scrapy] INFO: Dumping Scrapy stats: | |
{'finish_reason': 'finished', | |
'finish_time': datetime.datetime(2016, 5, 31, 20, 8, 36, 563629), | |
'frontera/iterations': 0, | |
'frontera/pending_requests_count': 0, | |
'log_count/CRITICAL': 1, | |
'log_count/DEBUG': 1, | |
'log_count/INFO': 9, | |
'start_time': datetime.datetime(2016, 5, 31, 20, 8, 31, 557130)} | |
2016-05-31 21:08:36 [scrapy] INFO: Spider closed (finished) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrapy project settings for the ``cb_crawl`` bot, scheduled through Frontera.

Only plain module-level constants are defined here; Scrapy reads them by name.
"""

BOT_NAME = 'cb_crawl'

# Project-specific setting (not a Scrapy built-in).
# NOTE(review): presumably consumed by the RawCrawl pipeline - confirm.
HBASE_URL = 'http://cbmasterb-002.servers.prgn.misp.co.uk:8080/raw_crawl'

SPIDER_MODULES = ['cb_crawl.spiders']
NEWSPIDER_MODULE = 'cb_crawl.spiders'

ROBOTSTXT_OBEY = True
USER_AGENT = 'mycrawl'

# Download concurrency and pacing.
CONCURRENT_REQUESTS = 256
DOWNLOAD_DELAY = 0.25
DOWNLOAD_TIMEOUT = 60
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DEPTH_LIMIT = 10
REACTOR_THREADPOOL_MAXSIZE = 20

# Optional downloader features.
COOKIES_ENABLED = False
RETRY_ENABLED = False
REDIRECT_ENABLED = False
AJAXCRAWL_ENABLED = True

# Frontera replaces the default Scrapy scheduler.
# NOTE: with this scheduler active, `scrapy fetch` has been observed to fail
# with "ValueError: Function <function <lambda> ...> is not a method of:
# <DefaultSpider 'default' ...>", raised from
# frontera/contrib/scrapy/converters.py (_find_method) while the request was
# being enqueued as a seed: the request callback there is a lambda rather
# than a method of the spider.
SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler'

SPIDER_MIDDLEWARES = {
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000,
    'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 1,
}

DOWNLOADER_MIDDLEWARES = {
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 1000,
}

ITEM_PIPELINES = {
    'cb_crawl.pipelines.RawCrawl': 500,
}

# AutoThrottle extension.
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 0.25
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_DEBUG = False
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment