Last active
December 6, 2020 09:49
-
-
Save nyov/9c70e780ea80204559d6da5525228702 to your computer and use it in GitHub Desktop.
Scrapy RSSSpider using feedparser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
import logging

import feedparser
import scrapy


class RSSSpider(scrapy.Spider):
    """Spider that downloads RSS/Atom feeds and parses them with feedparser.

    Feed URLs can be passed on the command line:
        $ scrapy runspider rssspider.py -a 'urls=http://some.url/,https://some.other.url'
    """

    name = "rss"

    def __init__(self, *args, **kwargs):
        # Accept a comma-separated list of feed URLs via the `urls` argument;
        # pop it so the base class does not see an unexpected kwarg.
        urls = kwargs.pop('urls', None)
        if urls:
            self.start_urls = urls.split(',')
        super().__init__(*args, **kwargs)

    def start_requests(self):
        # One request per configured start URL; responses go to self.parse.
        return [scrapy.Request(url) for url in self.start_urls]

    def parse_feed(self, feed):
        """Parse RSS/Atom feed content using feedparser.

        :param feed: raw feed document (bytes or str), e.g. ``response.body``
        :returns: the parsed ``FeedParserDict``, or ``None`` for a bozo feed
        """
        data = feedparser.parse(feed)
        if data.bozo:
            exc = data.bozo_exception
            logging.error('Bozo feed data. %s: %r', exc.__class__.__name__, exc)
            # SAX parse errors expose the offending line; show it for debugging.
            if hasattr(exc, 'getLineNumber') and hasattr(exc, 'getMessage'):
                line = exc.getLineNumber()
                logging.error('Line %d: %s', line, exc.getMessage())
                # response.body is bytes in Python 3 — decode before splitting,
                # otherwise bytes.split('\n') raises TypeError.
                text = feed.decode('utf-8', 'replace') if isinstance(feed, bytes) else feed
                segment = text.split('\n')[line - 1]
                logging.info('Body segment with error: %r', segment)
            # could still try to return data. not necessarily completely broken
            return None
        return data

    def parse(self, response):
        # Parse the already-downloaded content with feedparser
        # (NOT re-downloading the URL with feedparser itself).
        feed = self.parse_feed(response.body)
        if not feed:
            return
        # Feed-level elements common to RSS and Atom:
        # - https://pythonhosted.org/feedparser/common-rss-elements.html
        # - https://pythonhosted.org/feedparser/common-atom-elements.html
        # Use .get() throughout: feedparser leaves optional elements absent,
        # and attribute access on a missing key raises AttributeError.
        feed_title = feed.feed.get('title')
        feed_link = feed.feed.get('link')
        feed_desc = feed.feed.get('description')
        for entry in feed.entries:
            # Atom-style full content, if present (a list of content dicts).
            content = entry.get('content')
            if content:
                content = content[0]['value']
            yield {
                # global feed data
                'feed_title': feed_title,
                'feed_link': feed_link,
                'feed_description': feed_desc,
                #
                # item entry data
                'url': response.url,
                'link': entry.get('link'),
                'title': entry.get('title'),
                'description': entry.get('description'),
                #'date': entry.get('published'),
                #'date': entry.get('published_parsed'),
                'date': entry.get('updated_parsed'),
                # optional
                'content': content,
                'type': entry.get('dc_type'),
            }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment