deviantArt RSS scraper
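Given a feed URL and a destination folder, the script walks the feed (following its paginated "next" links), saves each entry's image, and records the check time per feed in a pickled feeds.db, so a later "update" run only fetches entries it has not seen. A sketch of the two invocations, assuming the file is saved as scraper.py; the gallery URL is the historical deviantART RSS endpoint and "someartist" is a hypothetical username:

    python scraper.py "http://backend.deviantart.com/rss.xml?q=gallery%3Asomeartist" ./images
    python scraper.py update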
#! /usr/bin/env python3
import sys
import os
import time
import datetime
import pickle
import urllib.request

import feedparser


class Scraper:
    """Abstract base class for feed scrapers."""

    def update(self):
        raise NotImplementedError

    def fetch_all(self):
        raise NotImplementedError

    def fetch_new(self, date):
        raise NotImplementedError


class DeviantArtScraper(Scraper):
    # Pickled per-feed state, keyed by feed URL:
    # {feed_url: {'path': <download dir>, 'updated': <time.struct_time>}}
    FEEDS_HISTORY = os.path.join(os.path.abspath('.'), "feeds.db")

    def __init__(self):
        self.__feed_config = self.__read_config()

    def update(self):
        """Re-check every feed seen before and fetch only new entries."""
        for url in self.__feed_config:
            # __download keys its state lookups off the original feed URL.
            self.__feed_url = url
            print("Checking feed " + url)
            print("Feed last checked on " + self.__format_time(self.__feed_config[url]['updated']))
            self.__download(url)
            self.__update_config(url)
            print("")
        self.__write_config(self.__feed_config)

    def fetch_all(self, feed_url, destination_folder):
        """Download every image in a feed into destination_folder."""
        self.__feed_url = feed_url
        self.__output = destination_folder
        if feed_url not in self.__feed_config:
            self.__feed_config[feed_url] = {'path': os.path.abspath(self.__output)}
        print("Checking feed " + feed_url)
        if "updated" in self.__feed_config[feed_url]:
            print("Feed last checked on " + self.__format_time(self.__feed_config[feed_url]['updated']))
        self.__download(self.__feed_url)
        self.__update_config(self.__feed_url)
        self.__write_config(self.__feed_config)
        print("done!")

    def __download(self, page_url):
        # page_url is either the feed itself or one of its paginated "next"
        # links; saved state is always keyed by the original feed URL.
        rss = feedparser.parse(page_url)
        if not rss['entries']:
            return
        print("Latest item in feed updated " + self.__format_time(rss['entries'][0]['published_parsed']))
        state = self.__feed_config[self.__feed_url]
        for entry in rss['entries']:
            # Entries arrive newest-first, so stop at the first one already seen.
            if 'updated' in state and entry['published_parsed'] <= state['updated']:
                print("Done processing new entries")
                return
            if 'media_content' not in entry:
                print("No image URL for entry " + entry['title'] + ", skipping")
                continue
            url = entry['media_content'][0]['url']
            filename = os.path.join(state['path'], url.split('/')[-1])
            if not os.path.isfile(filename):
                print("Saving " + url + " to " + filename)
                try:
                    urllib.request.urlretrieve(url, filename)
                except Exception as e:
                    print("Error downloading " + url + ": " + str(e))
            else:
                print("File " + filename + " already exists, skipping")
        # deviantART paginates its feeds; recurse into the "next" page, if any.
        next_links = [link['href'] for link in rss['feed']['links'] if link['rel'] == "next"]
        if next_links:
            self.__download(next_links[0])

    def __update_config(self, feed):
        self.__feed_config[feed]['updated'] = time.gmtime()

    def __read_config(self):
        config = {}
        if os.path.isfile(self.FEEDS_HISTORY):
            with open(self.FEEDS_HISTORY, 'rb') as f:
                config = pickle.load(f)
        return config

    def __write_config(self, config):
        with open(self.FEEDS_HISTORY, 'wb') as f:
            pickle.dump(config, f)

    @staticmethod
    def __format_time(struct):
        return datetime.datetime.fromtimestamp(time.mktime(struct)).strftime("%c")


if __name__ == "__main__":
    if len(sys.argv) < 2 or (sys.argv[1] != "update" and len(sys.argv) < 3):
        print("usage: %s update | %s <feed_url> <destination_folder>" % (sys.argv[0], sys.argv[0]))
        sys.exit(1)
    da = DeviantArtScraper()
    if sys.argv[1] == "update":
        da.update()
    else:
        da.fetch_all(sys.argv[1], sys.argv[2])
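The per-feed state lives in feeds.db in the directory the script is run from. A minimal sketch for inspecting it, assuming only the pickled dict format the scraper itself writes ({feed_url: {'path': ..., 'updated': ...}}):

    import pickle
    import time

    with open("feeds.db", "rb") as f:
        config = pickle.load(f)

    for url, state in config.items():
        last = time.strftime("%c", state["updated"]) if "updated" in state else "never"
        print(url, "->", state.get("path"), "| last checked:", last)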