Created
February 24, 2016 01:22
-
-
Save clasense4/6d7d32c93b5fec995e49 to your computer and use it in GitHub Desktop.
Scrapy regex Unicode error — no working solution found for this yet.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[root@cls rss_crawler]# scrapy crawl news | |
http://www.focus.de/sport/formel1/formel-1-idol-in-marburger-ausstellung-alles-dreht-sich-um-michael-schumacher-wie-es-ihm-geht-bleibt-ungewiss_id_5292575.html | |
2016-02-23 20:15:06 [scrapy] INFO: Scrapy 1.0.5 started (bot: rss_crawler) | |
2016-02-23 20:15:06 [scrapy] INFO: Optional features available: ssl, http11, boto | |
2016-02-23 20:15:06 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'rss_crawler.spiders', 'SPIDER_MODULES': ['rss_crawler.spiders'], 'BOT_NAME': 'rss_crawler'} | |
2016-02-23 20:15:06 [scrapy] INFO: Enabled extensions: CloseSpider, TelnetConsole, LogStats, CoreStats, SpiderState | |
2016-02-23 20:15:06 [boto] DEBUG: Retrieving credentials from metadata server. | |
2016-02-23 20:15:06 [boto] ERROR: Caught exception reading instance data | |
Traceback (most recent call last): | |
File "/usr/lib/python2.7/site-packages/boto/utils.py", line 210, in retry_url | |
r = opener.open(req, timeout=timeout) | |
File "/usr/lib64/python2.7/urllib2.py", line 437, in open | |
response = meth(req, response) | |
File "/usr/lib64/python2.7/urllib2.py", line 550, in http_response | |
'http', request, response, code, msg, hdrs) | |
File "/usr/lib64/python2.7/urllib2.py", line 475, in error | |
return self._call_chain(*args) | |
File "/usr/lib64/python2.7/urllib2.py", line 409, in _call_chain | |
result = func(*args) | |
File "/usr/lib64/python2.7/urllib2.py", line 558, in http_error_default | |
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp) | |
HTTPError: HTTP Error 404: Not Found | |
2016-02-23 20:15:06 [boto] ERROR: Unable to read instance data, giving up | |
2016-02-23 20:15:06 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats | |
2016-02-23 20:15:06 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware | |
2016-02-23 20:15:06 [scrapy] INFO: Enabled item pipelines: ImagesPipeline, RssImagePipeline, RssCrawlerPipeline | |
2016-02-23 20:15:06 [scrapy] INFO: Spider opened | |
2016-02-23 20:15:06 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) | |
2016-02-23 20:15:06 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6025 | |
2016-02-23 20:15:08 [scrapy] DEBUG: Crawled (200) <GET http://www.focus.de/sport/formel1/formel-1-idol-in-marburger-ausstellung-alles-dreht-sich-um-michael-schumacher-wie-es-ihm-geht-bleibt-ungewiss_id_5292575.html> (referer: None) | |
http://www.focus.de/sport/formel1/formel-1-idol-in-marburger-ausstellung-alles-dreht-sich-um-michael-schumacher-wie-es-ihm-geht-bleibt-ungewiss_id_5292575.html | |
2016-02-23 20:15:08 [scrapy] ERROR: Spider error processing <GET http://www.focus.de/sport/formel1/formel-1-idol-in-marburger-ausstellung-alles-dreht-sich-um-michael-schumacher-wie-es-ihm-geht-bleibt-ungewiss_id_5292575.html> (referer: None) | |
Traceback (most recent call last): | |
File "/usr/lib64/python2.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback | |
yield next(it) | |
File "/usr/lib64/python2.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 28, in process_spider_output | |
for x in result: | |
File "/usr/lib64/python2.7/site-packages/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr> | |
return (_set_referer(r) for r in result or ()) | |
File "/usr/lib64/python2.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr> | |
return (r for r in result or () if _filter(r)) | |
File "/usr/lib64/python2.7/site-packages/scrapy/spidermiddlewares/depth.py", line 54, in <genexpr> | |
return (r for r in result or () if _filter(r)) | |
  File "/root/python_projects/news_parser/rss_crawler/rss_crawler/spiders/news.py", line 83, in parse | |
    l.add_xpath('news_date', '//span[@class="created"]/text()').re(re.compile('((0[1-9]|[1-2][0-9]|3[0-1])\.(0[1-9]|1[0-2])\.[0-9]{4}, ((0|1)[0-9]|2[0-3]):((0|1|2|3|4|5)[0-9]))', re.UNICODE))[0] | |
AttributeError: 'NoneType' object has no attribute 're'
(Note: despite the title, this is not a Unicode/regex problem. `ItemLoader.add_xpath()` returns None, so `.re()` cannot be chained on its result. The regex should either be applied to the selector directly — `response.xpath(...).re(pattern)` — or passed to the loader via the keyword argument: `l.add_xpath('news_date', '//span[@class="created"]/text()', re=pattern)`.)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[root@cls ~]# scrapy shell "http://www.focus.de/sport/formel1/formel-1-idol-in-marburger-ausstellung-alles-dreht-sich-um-michael-schumacher-wie-es-ihm-geht-bleibt-ungewiss_id_5292575.html" | |
2016-02-23 20:19:13 [scrapy] INFO: Scrapy 1.0.5 started (bot: scrapybot) | |
2016-02-23 20:19:13 [scrapy] INFO: Optional features available: ssl, http11, boto | |
2016-02-23 20:19:13 [scrapy] INFO: Overridden settings: {'LOGSTATS_INTERVAL': 0, 'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter'} | |
2016-02-23 20:19:13 [scrapy] INFO: Enabled extensions: CloseSpider, TelnetConsole, CoreStats, SpiderState | |
2016-02-23 20:19:13 [boto] DEBUG: Retrieving credentials from metadata server. | |
2016-02-23 20:19:13 [boto] ERROR: Caught exception reading instance data | |
Traceback (most recent call last): | |
File "/usr/lib/python2.7/site-packages/boto/utils.py", line 210, in retry_url | |
r = opener.open(req, timeout=timeout) | |
File "/usr/lib64/python2.7/urllib2.py", line 437, in open | |
response = meth(req, response) | |
File "/usr/lib64/python2.7/urllib2.py", line 550, in http_response | |
'http', request, response, code, msg, hdrs) | |
File "/usr/lib64/python2.7/urllib2.py", line 475, in error | |
return self._call_chain(*args) | |
File "/usr/lib64/python2.7/urllib2.py", line 409, in _call_chain | |
result = func(*args) | |
File "/usr/lib64/python2.7/urllib2.py", line 558, in http_error_default | |
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp) | |
HTTPError: HTTP Error 404: Not Found | |
2016-02-23 20:19:13 [boto] ERROR: Unable to read instance data, giving up | |
2016-02-23 20:19:14 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats | |
2016-02-23 20:19:14 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware | |
2016-02-23 20:19:14 [scrapy] INFO: Enabled item pipelines: | |
2016-02-23 20:19:14 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6024 | |
2016-02-23 20:19:14 [scrapy] INFO: Spider opened | |
2016-02-23 20:19:15 [scrapy] DEBUG: Crawled (200) <GET http://www.focus.de/sport/formel1/formel-1-idol-in-marburger-ausstellung-alles-dreht-sich-um-michael-schumacher-wie-es-ihm-geht-bleibt-ungewiss_id_5292575.html> (referer: None) | |
[s] Available Scrapy objects: | |
[s] crawler <scrapy.crawler.Crawler object at 0x29f2b10> | |
[s] item {} | |
[s] request <GET http://www.focus.de/sport/formel1/formel-1-idol-in-marburger-ausstellung-alles-dreht-sich-um-michael-schumacher-wie-es-ihm-geht-bleibt-ungewiss_id_5292575.html> | |
[s] response <200 http://www.focus.de/sport/formel1/formel-1-idol-in-marburger-ausstellung-alles-dreht-sich-um-michael-schumacher-wie-es-ihm-geht-bleibt-ungewiss_id_5292575.html> | |
[s] settings <scrapy.settings.Settings object at 0x29f2a90> | |
[s] spider <DefaultSpider 'default' at 0x7f219801c110> | |
[s] Useful shortcuts: | |
[s] shelp() Shell help (print this help) | |
[s] fetch(req_or_url) Fetch request (or URL) and update local objects | |
[s] view(response) View response in a browser | |
2016-02-23 20:19:16 [root] DEBUG: Using default logger | |
2016-02-23 20:19:16 [root] DEBUG: Using default logger | |
In [1]: import re | |
In [2]: response.xpath('//span[@class="created"]/text()').re(re.compile('((0[1-9]|[1-2][0-9]|3[0-1])\.(0[1-9]|1[0-2])\.[0-9]{4}, ((0|1)[0-9]|2[0-3]):((0|1|2|3|4|5)[0-9]))', re.UNICODE))[0] | |
Out[2]: u'18.02.2016, 17:04'
(The same regex succeeds here because `.re()` is called on the SelectorList returned by `response.xpath(...)`, not on the None return value of `add_xpath()` — confirming the spider bug above is the call chaining, not the pattern.)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment