Skip to content

Instantly share code, notes, and snippets.

@zthomae
Created November 20, 2012 18:17
Show Gist options
  • Select an option

  • Save zthomae/4119784 to your computer and use it in GitHub Desktop.

Select an option

Save zthomae/4119784 to your computer and use it in GitHub Desktop.
RSS feed scraper for Frank Rich at NYMag
#!/usr/bin/env python
from pyquery import PyQuery as pq
import urllib2
from datetime import datetime
from dateutil.parser import parse
import PyRSS2Gen as rss
from pytz import timezone
import re
import sys
# Feed timestamps are normalized to GMT before being handed to PyRSS2Gen.
GMT = timezone('GMT')
# Fetch and parse the author landing page (network I/O happens at this line).
d = pq(url='http://nymag.com/frank-rich/')
# Accumulates rss.RSSItem objects: the featured article plus the result list.
items = []
# Scrape the currently-featured article, which uses different markup than the
# regular results list.  Failures here are best-effort: the rest of the feed
# is still built, but the error is reported instead of silently swallowed
# (the original bare "except: pass" also hid KeyboardInterrupt/SystemExit).
feature_link = d('ul#package-frich a')[0].get('href')
try:
    feature_article = pq(urllib2.urlopen(feature_link).read())
    # Timestamp renders as "Published <date>" with no time component;
    # assume noon US Eastern (-0500), then normalize to GMT.
    timestamp_raw = feature_article('li.date')[0].text
    date = re.search(r"Published (.*)", timestamp_raw).group(1) + " 12:00:00 -0500"
    article_pubtime = parse(date).astimezone(GMT)
    title = feature_article('h2.primary')[0].text
    description = feature_article('h3.deck')[0].text
    # Lead image becomes the item enclosure; fetch it for size/type headers.
    img_url = feature_article('div#story img')[0].get('src')
    img = urllib2.urlopen(img_url)
    items.append(rss.RSSItem(
        title = title,
        link = feature_link,
        description = description,
        guid = rss.Guid(feature_link),
        enclosure = rss.Enclosure(
            url = img_url,
            length = img.headers['Content-Length'],
            type = img.headers['Content-Type']
        ),
        pubDate = article_pubtime
    ))
except Exception as e:
    # Skip the feature on any fetch/parse failure, but say why.
    print >> sys.stderr, "Could not scrape featured article: %s" % e
for i, post in enumerate(d('div#results-wrap article')):
print "On article %d" % i
article = pq(post)
article_url_elem = article('header h3 a')[0]
article_url = article_url_elem.get('href')
try:
article_raw = pq(urllib2.urlopen(article_url).read())
date = article_raw('li.timestamp')[0].text + " -0500"
article_pubtime = parse(date).astimezone(GMT)
except:
try:
date = article_raw('li.date')[0].text + " -0500"
article_pubtime = parse(date).astimezone(GMT)
except:
date = article('header span')[0].text + " 12:00 -0500"
article_pubtime = parse(date).astimezone(GMT)
print date
print article_pubtime
title = article_url_elem.text
description = article('header p')[0].text
if type(description) == unicode: # Hack
description = description.encode('ascii', 'ignore')
img_url = article('div.image img')[0].get('src')
img = urllib2.urlopen(img_url)
items.append(rss.RSSItem(
title = title,
link = article_url,
description = description,
guid = rss.Guid(article_url),
enclosure = rss.Enclosure(
url = img_url,
length = img.headers['Content-Length'],
type = img.headers['Content-Type']
),
pubDate = article_pubtime
))
items = sorted(items, key=lambda item: item.pubDate)[::-1][0:20]
feed = rss.RSS2(
title = "Frank Rich - All posts",
link = 'http://nymag.com/frank-rich/',
description = "Frank Rich's posts, scraped from his author page",
lastBuildDate = datetime.now(),
items = items
)
try:
feed.write_xml(open('rich-feed.xml', 'w'))
except IOError, e:
print "There was an error!"
print e
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment