Skip to content

Instantly share code, notes, and snippets.

@nikhgupta
Created November 23, 2017 11:11
Show Gist options
  • Save nikhgupta/a68130dcd424837b89ee6d161ea4cd84 to your computer and use it in GitHub Desktop.
Save nikhgupta/a68130dcd424837b89ee6d161ea4cd84 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class BrainyqouteSpider(CrawlSpider):
    """Crawl brainyquote.com topic pages and collect one item per quote.

    The crawl starts at the topics listing. Links whose URL matches
    ``topics/`` are followed and handed to :meth:`parse_quotes`; links
    matching ``topic_index/`` are only followed to discover more pages.
    """

    name = 'brainyquote'
    allowed_domains = ['brainyquote.com']
    start_urls = ['https://www.brainyquote.com/topics']

    # Raw strings: the patterns contain ``\/``, which is an invalid escape
    # sequence in a normal string literal (warning on modern Python).
    rules = (
        Rule(LinkExtractor(allow=r'topics\/.*'), callback='parse_quotes', follow=True),
        Rule(LinkExtractor(allow=r'topic_index\/.*'), follow=True),
    )

    def extract_prop(self, doc, selector, prop=None):
        """Return the stripped first match of a CSS query, or ``None``.

        Args:
            doc: Object exposing ``.css()`` (a response or a selector).
            selector: CSS selector locating the element.
            prop: Attribute name to read; when omitted the element's text
                is read instead.

        Returns:
            The first extracted value with surrounding whitespace removed,
            or ``None`` when nothing matched or the match was empty.
        """
        pseudo = 'attr(%s)' % prop if prop else 'text'
        value = doc.css('%s::%s' % (selector, pseudo)).extract_first()
        return value.strip() if value else None

    def parse_quotes(self, response):
        """Build a list of quote items from a topic page.

        Each item is a dict with the keys ``url``, ``text``, ``author``,
        ``tags`` and ``source``.

        Args:
            response: The response for a page matched by the ``topics/`` rule.

        Returns:
            A list with one dict per ``#quotesList .boxy`` element found.
        """
        # ``re_first`` yields None when the heading does not match
        # "<topic> Quotes"; in that case no topic tag is added instead of
        # polluting every item's tags with None.
        topic = response.css('h1.quoteListH1::text').re_first(r'(.*) Quotes')
        topic_tags = [topic] if topic else []

        items = []
        for quote in response.css('#quotesList .boxy'):
            items.append({
                'url': response.url,
                'text': self.extract_prop(quote, '.b-qt'),
                'author': self.extract_prop(quote, '.bq-aut'),
                'tags': quote.css('.kw-box a::text').extract() + topic_tags,
                'source': 'brainyquote',
            })
        return items
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment