Last active
December 23, 2015 02:38
-
-
Save crosbymichael/6567864 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
from redis import Redis | |
import rethinkdb as r | |
from rq import Queue | |
import os | |
from rss import parse_feed | |
from werkzeug.contrib.atom import AtomFeed | |
import tornado.ioloop | |
import tornado.web | |
from datetime import datetime as dt | |
from hashlib import md5 | |
import json | |
# Feed metadata shared with the request handlers; web() overwrites these
# from the URL, FEED_URL and TITLE environment variables before serving.
url = feed_url = title = ''
def get_conns():
    """Create the RethinkDB connection and Redis client named by the environment.

    Reads ``REDIS_IP``/``REDIS_PORT`` and ``RETHINK_IP``/``RETHINK_PORT`` from
    ``os.environ``; a missing variable raises ``KeyError`` and a non-numeric
    port raises ``ValueError``.

    Returns a ``(rethinkdb_connection, redis_client)`` tuple.
    """
    env = os.environ
    redis_client = Redis(host=env['REDIS_IP'], port=int(env['REDIS_PORT']))
    rethink_host = env['RETHINK_IP']
    rethink_port = int(env['RETHINK_PORT'])
    return r.connect(rethink_host, rethink_port), redis_client
def cron():
    """Enqueue one ``parse_feed`` job for every row of the ``rss.feed`` table.

    Each job receives the feed's id together with its stored ``etag`` and
    ``modified`` values (``None`` when the row does not have them).
    """
    rethink_conn, redis_client = get_conns()
    queue = Queue(connection=redis_client)
    all_feeds = r.db('rss').table('feed').run(rethink_conn)
    for row in all_feeds:
        queue.enqueue(
            parse_feed,
            row['id'],
            row.get('etag'),
            row.get('modified'),
        )
def web():
    """Serve the Atom feed (``/``) and the info endpoint (``/info``) over HTTP.

    Fills the module-level ``feed_url``, ``url`` and ``title`` from the
    ``FEED_URL``, ``URL`` and ``TITLE`` environment variables, then listens on
    port 8888 and starts the Tornado IO loop.
    """
    global url, title, feed_url
    env = os.environ
    feed_url = env['FEED_URL']
    url = env['URL']
    title = env['TITLE']

    rethink_conn, redis_client = get_conns()
    # Both handlers receive the same pair of connections through initialize().
    handler_kwargs = {'conn': rethink_conn, 'redis_conn': redis_client}
    routes = [
        (r"/", AtomHandler, dict(handler_kwargs)),
        (r"/info", InfoHandler, dict(handler_kwargs)),
    ]
    app = tornado.web.Application(routes)
    app.listen(8888)
    tornado.ioloop.IOLoop.instance().start()
def create_feeds(conn, url, feed_url):
    """Build the Atom document for the stored entries and return it as a string.

    Up to 25 rows of ``rss.entries`` are read, ordered by descending
    ``modified``. The feed title comes from the module-level ``title``.

    conn     -- RethinkDB connection used to run the query
    url      -- site URL passed to ``AtomFeed``
    feed_url -- feed URL passed to ``AtomFeed``
    """
    atom = AtomFeed(title, feed_url=feed_url, url=url)
    # NOTE(review): entries written by get_entry() carry an 'updated' field
    # but no 'modified' field -- confirm this sort key is the intended one.
    query = r.db('rss').table('entries').order_by(r.desc('modified')).limit(25)
    for row in query.run(conn):
        atom.add(
            row['title'],
            row['body'],
            content_type='html',
            author=row['author'],
            url=row['id'],
            id=row['uid'],
            updated=dt.fromtimestamp(row['updated']),
        )
    return atom.to_string()
class BaseHandler(tornado.web.RequestHandler):
    """Request handler base that keeps the connections shared by all routes."""

    def initialize(self, conn, redis_conn):
        """Store the connections passed in the route's keyword arguments.

        conn       -- kept as ``self.conn``
        redis_conn -- kept as ``self.redis``
        """
        self.conn, self.redis = conn, redis_conn
class AtomHandler(BaseHandler):
    """Serve the combined Atom feed, cached in the Redis hash ``feed``."""

    def get(self):
        """Write the Atom document with ``ETag`` and ``Last-Modified`` headers.

        On a cache miss the feed is rebuilt with ``create_feeds`` and stored
        in Redis as the fields ``blob``, ``modified`` and ``etag``.
        """
        # Local import: the file's import block does not provide formatdate.
        from email.utils import formatdate

        data = self.redis.hgetall('feed')
        if not data:
            blob = create_feeds(self.conn, url, feed_url)
            # The ETag is derived from the generation time, as before; the
            # text is encoded because md5 hashes bytes.
            seed = str(dt.now()).encode('utf-8')
            data = {
                'blob': blob,
                # Store a ready-made HTTP-date string. A datetime object would
                # come back from Redis as its str() form, so cache hits and
                # cache misses would emit different Last-Modified formats.
                'modified': formatdate(usegmt=True),
                'etag': md5(seed).hexdigest(),
            }
            self.redis.hmset('feed', data)
        self.set_header('Content-Type', 'application/xml')
        self.set_header('ETag', data['etag'])
        self.set_header('Last-Modified', data['modified'])
        self.write(data['blob'])
class InfoHandler(BaseHandler):
    """Report basic statistics about the stored feeds as JSON."""

    def get(self):
        """Write ``{"entry_count": <int>, "feeds": [<feed id>, ...]}``."""
        db = r.db('rss')
        entry_count = db.table('entries').count().run(self.conn)
        feed_rows = db.table('feed').with_fields('id').run(self.conn)
        data = {
            'entry_count': entry_count,
            # A real list (not a lazy map object) so json.dumps can always
            # serialise it.
            'feeds': [row['id'] for row in feed_rows],
        }
        self.set_header('Content-Type', 'application/json')
        self.write(json.dumps(data))
def main(args):
    """Dispatch on the last command-line argument.

    ``cron`` enqueues the feed refresh jobs and ``web`` starts the HTTP
    server. Anything else is reported on stderr and the process exits with
    status 1.
    """
    command = args[-1]
    if command == 'cron':
        cron()
        return
    if command == 'web':
        web()
        return
    sys.stderr.write('Unknown command: %s\n' % command)
    sys.exit(1)
if __name__ == '__main__':
    # Script entry point: hand the raw argument vector to main().
    main(sys.argv)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import feedparser | |
from time import mktime | |
from datetime import datetime as dt | |
import rethinkdb as r | |
import os, sys | |
from hashlib import md5 | |
from time import mktime | |
from redis import Redis | |
import calendar | |
def parse_feed(url, etag=None, modified=None):
    """Fetch a feed URL and save its entries to the database.

    url      -- feed address; also the key of its row in ``rss.feed``
    etag     -- ETag from the previous fetch, sent for a conditional request
    modified -- time of the previous fetch; an epoch number (the form this
                function stores) is converted to a datetime before it is
                given to feedparser

    On HTTP status 200 or 301 the entries are inserted into ``rss.entries``,
    the feed row's ``etag``/``modified`` are updated and the Redis hash
    ``feed`` is deleted. Any other outcome prints the status and the number
    of parsed entries.
    """
    # Local import: the file's import block does not provide numbers.
    import numbers

    # feedparser accepts a date string, time tuple or datetime here, but the
    # value stored below is an epoch number.
    if isinstance(modified, numbers.Real):
        modified = dt.utcfromtimestamp(modified)

    feeds = feedparser.parse(url, etag=etag, modified=modified)
    if not (feeds and feeds.get('status', -1) in (200, 301)):
        # should do logging here, sentry to the rescue
        print(feeds.get('status', 'no status'))
        print(len(feeds.entries))
        return

    ip = os.environ['RETHINK_IP']
    port = int(os.environ['RETHINK_PORT'])
    conn = r.connect(ip, port)
    try:
        redis = Redis(host=os.environ['REDIS_IP'],
                      port=int(os.environ['REDIS_PORT']))
        entries = r.db('rss').table('entries')
        meta = r.db('rss').table('feed')

        # Fall back to the URL when the feed has no title, rather than raising.
        feed_title = feeds.feed.get('title', url)
        data = [get_entry(feed_title, entry) for entry in feeds.entries]
        entries.insert(data).run(conn)

        # unix_time() reads a naive datetime as UTC, so pass UTC "now".
        meta.get(url).update({
            'etag': feeds.get('etag', None),
            'modified': unix_time(dt.utcnow()),
        }).run(conn)

        # remove the hash to reset the cache
        redis.delete('feed')
    finally:
        # Close the connection on every path so repeated jobs do not leak it.
        conn.close()
def get_entry(title, feed):
    """Convert one parsed feed entry into the document stored in ``rss.entries``.

    title -- title of the feed the entry belongs to; also the fallback author
    feed  -- the parsed entry; must provide ``link`` and ``title`` and may
             provide ``updated_parsed``, ``author`` and ``summary_detail``

    Returns a dict with the keys ``id``, ``uid``, ``author``, ``feed``,
    ``title``, ``updated`` (epoch seconds) and ``body``.
    """
    link = feed.link
    # md5 hashes bytes: encode text links so non-ASCII URLs do not fail.
    link_bytes = link if isinstance(link, bytes) else link.encode('utf-8')

    parsed = feed.get('updated_parsed', None)
    if parsed is not None:
        # NOTE(review): assumes updated_parsed is a UTC struct_time, as
        # feedparser documents. Convert it directly instead of routing it
        # through local time with mktime()/fromtimestamp().
        updated = calendar.timegm(parsed)
    else:
        # No date on the entry: stamp it with the current UTC time.
        updated = calendar.timegm(dt.utcnow().utctimetuple())

    return {
        'id': link,
        'uid': md5(link_bytes).hexdigest(),
        'author': feed.get('author', title),
        'feed': title,
        'title': feed.title,
        'updated': updated,
        'body': feed.get('summary_detail', {}).get('value', ''),
    }
def unix_time(d):
    """Return the datetime *d* as whole seconds since the Unix epoch.

    The value goes through ``utctimetuple()``, so a naive datetime is read
    as if it were already in UTC; microseconds are dropped.
    """
    utc_fields = d.utctimetuple()
    return calendar.timegm(utc_fields)
if __name__ == '__main__':
    # Run as a script: the final command-line argument is the feed URL.
    parse_feed(sys.argv[-1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment