Skip to content

Instantly share code, notes, and snippets.

@seagatesoft
Created March 19, 2015 10:42
Show Gist options
  • Save seagatesoft/ce49516edb34c33dea3f to your computer and use it in GitHub Desktop.
Save seagatesoft/ce49516edb34c33dea3f to your computer and use it in GitHub Desktop.
import json
from scrapy.http import Request
from scrapy.selector import Selector
from forumbot.spiders.blogs import BlogSpider
from forumbot.spiders.mixins.livefyre import LivefyreMixin
from bot_engines.utils import error
from forumbot.items import BlogPostLoader, AuthorLoader
class MashableSpider(BlogSpider, LivefyreMixin):
    """Spider for mashable.com stories.

    Pages through the ``stories.json`` feed, requests every story whose date
    passes ``has_valid_date`` and turns each story page into a blog post
    item with an attached author item.

    ``parse_date``, ``has_valid_date``, ``is_next_page_required`` and
    ``make_livefyre_request`` are not defined in this class; they are
    presumably inherited from ``BlogSpider`` / ``LivefyreMixin`` -- verify
    against those classes.
    """

    name = 'mashable.com'
    allowed_domains = ['mashable.com']
    start_urls = ['http://mashable.com/stories.json?new_per_page=20&'
                  'hot_per_page=0&rising_per_page=0']
    # Feed URL for follow-up pages; ``prev_post_key`` is filled with the
    # ``sort_key`` of the last post seen on the current page.
    story_url = ('http://mashable.com/stories.json?new_per_page=20&'
                 'hot_per_page=0&rising_per_page=0&new_after={prev_post_key}')
    date_formats = ['%Y-%m-%dT%H:%M:%S']
    # Not referenced in this class; presumably consumed by LivefyreMixin
    # together with the pair returned by ``get_livefyre_data`` -- confirm.
    livefyre_url = ('http://bootstrap.mashable.fyre.co/bs3/v3.1/'
                    'mashable.fyre.co/%s/%s/init')

    def parse(self, response):
        """Handle a start URL response by delegating to ``parse_post_list``."""
        return self.parse_post_list(response)

    def parse_post_list(self, response):
        """Parse one page of the stories JSON feed.

        Yields a ``Request`` (callback ``parse_item``) for every entry of the
        feed's ``new`` list whose date parses and passes ``has_valid_date``,
        followed by a ``Request`` for the next feed page when
        ``is_next_page_required`` says so and the page supplied a cursor.

        :param response: response whose body is the feed JSON document.
        """
        feed = json.loads(response.body)
        # Only the most recent successfully parsed date is needed afterwards,
        # so keep just that one instead of collecting every entry.
        last_date_data = None
        prev_post_key = ''

        for post in feed['new']:
            # Remember the cursor even when the post itself gets skipped.
            prev_post_key = post['sort_key']
            # First 19 characters match the '%Y-%m-%dT%H:%M:%S' layout;
            # anything after them (e.g. an offset suffix) is dropped.
            date_string = post['post_date'][0:19]
            date_data = self.parse_date(date_string, response.url,
                                        date_formats=self.date_formats)
            post_date = date_data['date_obj']
            if not post_date:
                error("Unknown date format %s in %s" % (repr(date_string),
                                                        response.url))
                continue

            last_date_data = date_data
            if self.has_valid_date(**date_data):
                # Values carried to parse_item through the request meta.
                post_item = dict(
                    item_id=post['_id'],
                    title=post['title'],
                    created_at=post_date
                )
                yield Request(url=post['link'],
                              meta=post_item,
                              callback=self.parse_item)

        needs_next_page = self.is_next_page_required(last_date_data)
        # A page without posts provides no cursor; asking for
        # ``new_after=`` with an empty key cannot advance the pagination.
        if needs_next_page and prev_post_key:
            yield Request(
                self.story_url.format(prev_post_key=prev_post_key),
                callback=self.parse_post_list
            )

    def parse_item(self, response):
        """Build the post and author items from a story page.

        Regular stories take their content from the article markup and are
        followed by the request returned by ``make_livefyre_request``.
        Stories without that markup are rebuilt from the JSON stored in the
        ``data-post`` attribute of ``div#parsec``. A page offering neither
        yields nothing.

        :param response: story page response; its ``meta`` carries the
            ``item_id``, ``title`` and ``created_at`` set by
            ``parse_post_list``.
        """
        pl = BlogPostLoader(response=response)
        pl.add_value('item_id', response.meta['item_id'])
        pl.add_value('title', response.meta['title'])
        pl.add_value('link', response.url)
        pl.add_value('created_at', response.meta['created_at'])
        pl.add_xpath('content',
                     '//article[@id="story"]/section[contains('
                     '@class, "article-content")]//*[not(self::script)]/'
                     'text()')

        al = AuthorLoader(selector=pl.selector.xpath(
            '//div[@class="article-info"]')
        )
        al.add_xpath('author_id', './a/@href', re=r'/people/(.+)/')
        # Use the author name shown in the post rather than the one from
        # the API; the two can differ.
        al.add_xpath('name',
                     './/span[contains(., "By")]/text()',
                     re=r'By (.+)')
        al.add_xpath('link', './a/@href')
        al.add_xpath('avatar', './a/img[@class="author_image"]/@src')

        post = pl.load_item()
        post['author'] = al.load_item()
        if 'content' in post:
            yield post
            yield self.make_livefyre_request(response)
            return

        # No regular article body: fall back to the embedded JSON payload.
        special_post = pl.selector.xpath(
            '//div[@id="parsec"]/@data-post').extract()
        if not special_post:
            # Neither layout matched, so there is nothing to emit.
            return

        json_data = json.loads(special_post[0])
        content = []
        for card in json_data['cards']:
            if card['_type'] != 'Parsec::LongCard':
                continue
            for block in card['blocks']:
                if block['_type'] == 'Parsec::TextBlock':
                    content.append(block['content'])
        pl.add_value('content', content)
        # The payload only fills author fields the page markup left empty.
        al.add_value_if_empty('author_id',
                              str(json_data['wp_author_id']))
        al.add_value_if_empty('name', json_data['author'])

        # Reload both items so the values added above are included.
        post = pl.load_item()
        post['author'] = al.load_item()
        yield post

    def get_livefyre_data(self, response):
        """Return the ``(site_id, article_id)`` pair found on a story page.

        Both values are read from the ``data-site-id`` and
        ``data-article-id`` attributes of ``div#livefyre_comments``; a
        missing attribute yields an empty string.

        :param response: story page response.
        """
        sel = Selector(response)
        site_id = ''.join(
            sel.xpath('//div[@id="livefyre_comments"]/@data-site-id').extract()
        )
        article_id = ''.join(
            sel.xpath(
                '//div[@id="livefyre_comments"]/@data-article-id').extract()
        )
        return site_id, article_id
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment