Created
November 20, 2012 18:17
-
-
Save zthomae/4119784 to your computer and use it in GitHub Desktop.
RSS feed scraper for Frank Rich at NYMag
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| from pyquery import PyQuery as pq | |
| import urllib2 | |
| from datetime import datetime | |
| from dateutil.parser import parse | |
| import PyRSS2Gen as rss | |
| from pytz import timezone | |
| import re | |
| import sys | |
# Feed timestamps are normalized to GMT before being written out.
GMT = timezone('GMT')
# Author landing page; pyquery fetches and parses it in one step.
d = pq(url='http://nymag.com/frank-rich/')
# Accumulates rss.RSSItem objects from both scrape passes below.
items = []
| feature_link = d('ul#package-frich a')[0].get('href') | |
| try: | |
| feature_article = pq(urllib2.urlopen(feature_link).read()) | |
| timestamp_raw = feature_article('li.date')[0].text | |
| date = re.search(r"Published (.*)", timestamp_raw).group(1) + " 12:00:00 -0500" | |
| article_pubtime = parse(date).astimezone(GMT) | |
| title = feature_article('h2.primary')[0].text | |
| description = feature_article('h3.deck')[0].text | |
| img_url = feature_article('div#story img')[0].get('src') | |
| img = urllib2.urlopen(img_url) | |
| items.append(rss.RSSItem( | |
| title = title, | |
| link = feature_link, | |
| description = description, | |
| guid = rss.Guid(feature_link), | |
| enclosure = rss.Enclosure( | |
| url = img_url, | |
| length = img.headers['Content-Length'], | |
| type = img.headers['Content-Type'] | |
| ), | |
| pubDate = article_pubtime | |
| )) | |
| except: | |
| pass | |
| for i, post in enumerate(d('div#results-wrap article')): | |
| print "On article %d" % i | |
| article = pq(post) | |
| article_url_elem = article('header h3 a')[0] | |
| article_url = article_url_elem.get('href') | |
| try: | |
| article_raw = pq(urllib2.urlopen(article_url).read()) | |
| date = article_raw('li.timestamp')[0].text + " -0500" | |
| article_pubtime = parse(date).astimezone(GMT) | |
| except: | |
| try: | |
| date = article_raw('li.date')[0].text + " -0500" | |
| article_pubtime = parse(date).astimezone(GMT) | |
| except: | |
| date = article('header span')[0].text + " 12:00 -0500" | |
| article_pubtime = parse(date).astimezone(GMT) | |
| print date | |
| print article_pubtime | |
| title = article_url_elem.text | |
| description = article('header p')[0].text | |
| if type(description) == unicode: # Hack | |
| description = description.encode('ascii', 'ignore') | |
| img_url = article('div.image img')[0].get('src') | |
| img = urllib2.urlopen(img_url) | |
| items.append(rss.RSSItem( | |
| title = title, | |
| link = article_url, | |
| description = description, | |
| guid = rss.Guid(article_url), | |
| enclosure = rss.Enclosure( | |
| url = img_url, | |
| length = img.headers['Content-Length'], | |
| type = img.headers['Content-Type'] | |
| ), | |
| pubDate = article_pubtime | |
| )) | |
| items = sorted(items, key=lambda item: item.pubDate)[::-1][0:20] | |
| feed = rss.RSS2( | |
| title = "Frank Rich - All posts", | |
| link = 'http://nymag.com/frank-rich/', | |
| description = "Frank Rich's posts, scraped from his author page", | |
| lastBuildDate = datetime.now(), | |
| items = items | |
| ) | |
| try: | |
| feed.write_xml(open('rich-feed.xml', 'w')) | |
| except IOError, e: | |
| print "There was an error!" | |
| print e |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment