Created
August 19, 2015 19:41
-
-
Save nyov/399747653bc70a75a8d0 to your computer and use it in GitHub Desktop.
Voat.co Spider
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function | |
import logging | |
from scrapy.utils.log import configure_logging | |
from scrapy.spiders import Spider | |
from scrapy.exceptions import CloseSpider | |
from scrapy.http import Request | |
# Configure Scrapy logging up front with a compact "LEVEL: message" line format.
configure_logging(settings={'LOG_FORMAT': '%(levelname)s: %(message)s'})
class VoatSpider(Spider):
    """Voat.co legacy-API Spider.

    Starts at the ``top200subverses`` endpoint, requests the
    ``subversefrontpage`` listing of every subverse named there and counts
    the listing entries that carry a thumbnail.  The running total is kept
    in ``thumbnail_counter`` and logged when the spider is closed.
    """

    name = 'voat'
    allowed_domains = [
        'voat.co',
    ]
    start_urls = [
        'https://voat.co/api/top200subverses',
        #'https://voat.co/api/frontpage',
    ]
    # link to thumbs
    voat_thumbs_url = 'https://cdn.voat.co/thumbs/'
    # api urls
    api_endpoint = 'https://voat.co/api'
    # The literal '%s' passed as second argument survives this first
    # interpolation, leaving a template that takes the subverse name.
    api_subversefrontpage = '%s/subversefrontpage?subverse=%s' % (api_endpoint, '%s')

    # Class-level default; the first ``+=`` through ``self`` in
    # parse_frontpage() creates the per-instance counter.
    thumbnail_counter = 0

    def debug(self, response):
        """Inspect *response* in an interactive Scrapy shell, then stop the crawl.

        Raises:
            CloseSpider: always, after ``inspect_response`` returns.
        """
        # DEBUG: check response in a shell (imported lazily, debug-only)
        from scrapy.shell import inspect_response
        inspect_response(response, self)
        raise CloseSpider('debug stop')

    def parse(self, response):
        """Log the visited URL and hand *response* to parse_top200subverses()."""
        logging.info('Visited %s', response.url)
        return self.parse_top200subverses(response)

    def parse_top200subverses(self, response):
        """Yield one ``subversefrontpage`` Request per subverse found in *response*.

        Subverse names are pulled out of the ``/ArrayOfstring/string``
        elements with the pattern ``Name: <name>,``; each resulting Request
        is handled by parse_frontpage().
        """
        response.selector.remove_namespaces()
        subverses = response.xpath('/ArrayOfstring/string')
        names = subverses.re(r'.*Name: (\S+),')
        # Positions are logged 1-based.
        for position, subverse in enumerate(names, 1):
            subverse = subverse.strip()
            logging.info('%3d Found subverse "%s"', position, subverse)
            yield Request(
                url=self.api_subversefrontpage % subverse,
                callback=self.parse_frontpage,
            )

    def parse_frontpage(self, response):
        """Count the thumbnails listed on one subverse frontpage.

        If the document has an ``/Error`` root, its message is logged and
        the page is skipped.  Otherwise every ``ApiMessage`` whose
        ``Thumbnail`` element has no ``nil`` attribute and yields non-empty
        text is logged and added to ``thumbnail_counter``.
        """
        logging.info('Visited %s [%s]', response.url, response.status)
        response.selector.remove_namespaces()

        error = response.xpath('/Error')
        if error:
            errmsg = error.xpath('./Message/text()').extract_first()
            logging.info('Page %s errored: %s', response.url, errmsg)
            return

        entries = response.xpath('/ArrayOfApiMessage')
        with_thumbnail = entries.xpath('./ApiMessage[Thumbnail[not(@nil)]]')
        for idx, entry in enumerate(with_thumbnail):
            # A missing title is logged as an empty string.
            post = entry.xpath("./Title/text()").extract_first() or ''
            thumb = entry.xpath("./Thumbnail[not(@nil)]/text()").extract_first()
            if thumb:
                logging.info(
                    '%4d Found Entry %s with Thumbnail image: %s%s',
                    idx, post, self.voat_thumbs_url, thumb,
                )
                self.thumbnail_counter += 1
        #self.debug(response)

    def closed(self, reason):
        """Log the total number of thumbnails counted (*reason* is unused)."""
        logging.info('*** Found %d thumbnail images total. ***', self.thumbnail_counter)
if __name__ == "__main__":
    # Stand-alone runner: crawl with VoatSpider inside a Twisted reactor,
    # using the project settings overlaid with the overrides below.
    ### for scrapy 1.0 ###
    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    #from scrapy.settings import Settings
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    settings.setdict({
        # disable some cruft
        'EXTENSIONS': {
            # Same scrapy.extensions.* layout as the httpcache policy
            # below; the old 'scrapy.telnet' path is only a deprecated alias.
            'scrapy.extensions.telnet.TelnetConsole': None,
        },
        'DOWNLOAD_HANDLERS': {'s3': None},
        # config
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:17.0) Gecko/20100101 Firefox/17.0',
        'AUTOTHROTTLE_ENABLED': True,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 3,  # 3 secs delay
        'RETRY_ENABLED': False,  # dont retry any errors now
        #'COOKIES_ENABLED': False,
        # a dumb cache that drops every visited page to /tmp/
        # so script re-runs are from disk.
        'HTTPCACHE_ENABLED': True,
        'HTTPCACHE_DIR': '/tmp/scrapy-httpcache',
        'HTTPCACHE_POLICY': 'scrapy.extensions.httpcache.DummyPolicy',
    })

    runner = CrawlerRunner(settings)
    # Pass the spider class, not an instance: the crawler builds its own
    # spider object, and newer Scrapy releases reject an instance here.
    d = runner.crawl(VoatSpider)
    # Stop the reactor whether the crawl succeeded or failed.
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawl deferred fires
# EOF
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment