Created
March 19, 2015 10:42
-
-
Save seagatesoft/ce49516edb34c33dea3f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from scrapy.http import Request | |
from scrapy.selector import Selector | |
from forumbot.spiders.blogs import BlogSpider | |
from forumbot.spiders.mixins.livefyre import LivefyreMixin | |
from bot_engines.utils import error | |
from forumbot.items import BlogPostLoader, AuthorLoader | |
class MashableSpider(BlogSpider, LivefyreMixin):
    """Spider for mashable.com.

    Crawls the site's "stories" JSON feed page by page, follows each
    post with a valid date to its article page, builds a blog-post item
    with its author, and issues a Livefyre bootstrap request (via
    LivefyreMixin) to collect comments.
    """
    name = 'mashable.com'
    allowed_domains = ['mashable.com']
    # First page of the "new" stories feed; hot/rising feeds disabled.
    start_urls = ['http://mashable.com/stories.json?new_per_page=20&'
                  'hot_per_page=0&rising_per_page=0']
    # Template for subsequent feed pages; resumes after the last
    # sort_key seen on the previous page.
    story_url = 'http://mashable.com/stories.json?new_per_page=20&' \
                'hot_per_page=0&rising_per_page=0&new_after={prev_post_key}'
    date_formats = ['%Y-%m-%dT%H:%M:%S']
    # Livefyre bootstrap endpoint, formatted with (site_id, article_id).
    livefyre_url = 'http://bootstrap.mashable.fyre.co/bs3/v3.1/' \
                   'mashable.fyre.co/%s/%s/init'

    def parse(self, response):
        """Entry point: the start URL already returns the JSON feed."""
        return self.parse_post_list(response)

    def parse_post_list(self, response):
        """Parse one page of the stories JSON feed.

        Yields one Request per post whose date parses and passes
        ``has_valid_date``, and a Request for the next feed page while
        ``is_next_page_required`` says more pages are needed.
        """
        json_data = json.loads(response.body)
        date_data_log = []
        prev_post_key = ''
        for post in json_data['new']:
            # Track the last sort key so pagination resumes after the
            # final post of this page.
            prev_post_key = post['sort_key']
            # Keep only 'YYYY-MM-DDTHH:MM:SS'; drop tz offset/fraction.
            date_string = post['post_date'][0:19]
            date_data = self.parse_date(date_string, response.url,
                                        date_formats=self.date_formats)
            if not date_data['date_obj']:
                error("Unknown date format %s in %s" % (repr(date_string),
                                                        response.url))
                continue
            date_data_log.append(date_data)
            if self.has_valid_date(**date_data):
                post_item = dict(
                    item_id=post['_id'],
                    title=post['title'],
                    created_at=date_data['date_obj']
                )
                yield Request(url=post['link'],
                              meta=post_item,
                              callback=self.parse_item)
        # The last parseable post on this page decides whether to page on.
        limit_post_dd = date_data_log[-1] if date_data_log else None
        if self.is_next_page_required(limit_post_dd):
            yield Request(
                self.story_url.format(prev_post_key=prev_post_key),
                callback=self.parse_post_list
            )

    def parse_item(self, response):
        """Parse an article page into a blog-post item plus its author.

        Regular articles carry their text inside the article-content
        section; "parsec" special posts instead embed it as JSON in a
        ``data-post`` attribute, handled by the fallback branch.
        """
        pl = BlogPostLoader(response=response)
        pl.add_value('item_id', response.meta['item_id'])
        pl.add_value('title', response.meta['title'])
        pl.add_value('link', response.url)
        pl.add_value('created_at', response.meta['created_at'])
        # All text nodes of the article body, excluding inline scripts.
        pl.add_xpath('content',
                     '//article[@id="story"]/section[contains('
                     '@class, "article-content")]//*[not(self::script)]/'
                     'text()')
        al = AuthorLoader(selector=pl.selector.xpath(
            '//div[@class="article-info"]')
        )
        al.add_xpath('author_id', './a/@href', re=r'/people/(.+)/')
        # Use the author name shown in the post instead of the API's;
        # they can be different.
        al.add_xpath('name',
                     './/span[contains(., "By")]/text()',
                     re=r'By (.+)')
        al.add_xpath('link', './a/@href')
        al.add_xpath('avatar', './a/img[@class="author_image"]/@src')
        post = pl.load_item()
        post['author'] = al.load_item()
        if 'content' in post:
            yield post
            yield self.make_livefyre_request(response)
        else:
            # Fallback for "parsec" special posts whose content is
            # embedded as JSON rather than in the article markup.
            special_post = pl.selector.xpath(
                '//div[@id="parsec"]/@data-post').extract()
            if special_post:
                json_data = json.loads(special_post[0])
                content = [
                    block['content']
                    for card in json_data['cards']
                    if card['_type'] == 'Parsec::LongCard'
                    for block in card['blocks']
                    if block['_type'] == 'Parsec::TextBlock'
                ]
                pl.add_value('content', content)
                al.add_value_if_empty('author_id',
                                      str(json_data['wp_author_id']))
                al.add_value_if_empty('name', json_data['author'])
                post = pl.load_item()
                post['author'] = al.load_item()
                yield post

    def get_livefyre_data(self, response):
        """Return the (site_id, article_id) pair read from the Livefyre
        comments container of the article page.
        """
        sel = Selector(response)
        site_id = ''.join(
            sel.xpath('//div[@id="livefyre_comments"]/@data-site-id').extract()
        )
        article_id = ''.join(
            sel.xpath(
                '//div[@id="livefyre_comments"]/@data-article-id').extract()
        )
        return site_id, article_id
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment