Skip to content

Instantly share code, notes, and snippets.

@nikhgupta
Created November 23, 2017 11:11
Show Gist options
  • Save nikhgupta/0880745572cf3add1f7f1deef2d94f4d to your computer and use it in GitHub Desktop.
Save nikhgupta/0880745572cf3add1f7f1deef2d94f4d to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class GoodreadsSpider(CrawlSpider):
    """Crawl Goodreads quote listings and collect one dict per quote.

    Starting from the main quotes page, the spider follows links to tag
    pages and to paginated listings, handing every matched page to
    :meth:`parse_quotes`.
    """

    name = 'goodreads'
    allowed_domains = ['goodreads.com']
    start_urls = ['https://www.goodreads.com/quotes']

    # Raw strings: the patterns contain regex escapes (``\/``, ``\?``, ``\d``)
    # that are invalid escape sequences in ordinary string literals.
    rules = (
        Rule(LinkExtractor(allow=r'quotes\/tag\/.*'), callback='parse_quotes', follow=True),
        Rule(LinkExtractor(allow=r'quotes\?page=\d+'), callback='parse_quotes', follow=True),
    )

    def extract_prop(self, doc, selector, prop=None):
        """Return the first match of ``selector`` inside ``doc``, stripped.

        Args:
            doc: Selector-like object exposing ``.css()``.
            selector: CSS selector locating the element.
            prop: Attribute name to read; when omitted (or falsy) the
                element's text is read instead.

        Returns:
            The stripped string, or ``None`` when nothing matched or the
            matched value is empty.
        """
        pseudo = 'attr(%s)' % prop if prop else 'text'
        value = doc.css('%s::%s' % (selector, pseudo)).extract_first()
        if value:
            return value.strip()
        return None

    def parse_quotes(self, response):
        """Extract every quote found on a listing page.

        Args:
            response: The downloaded page.

        Returns:
            A list of dicts with the keys ``url``, ``text``, ``image``,
            ``author``, ``tags``, ``likes``, ``book`` and ``source``.
        """
        items = []
        for quote in response.css(".quotes .quote, .leftContainer .quote"):
            # A quote without a numeric like counter yields no regex match;
            # treat it as 0 instead of letting int(None) abort the whole page.
            likes = quote.css('.quoteFooter .right a::text').re_first(r'\d+')
            items.append({
                'url': response.url,
                'text': self.extract_prop(quote, '.quoteText'),
                'image': self.extract_prop(quote, '.quoteDetails img', 'src'),
                'author': self.extract_prop(quote, '.quoteText a.authorOrTitle'),
                'tags': quote.css('.quoteFooter .left a::text').extract(),
                'likes': int(likes) if likes else 0,
                'book': self.extract_prop(quote, '.quoteText span a.authorOrTitle'),
                'source': 'goodreads',
            })
        return items
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment