Created
November 20, 2012 18:17
-
-
Save zthomae/4119784 to your computer and use it in GitHub Desktop.
RSS feed scraper for Frank Rich at NYMag
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| from pyquery import PyQuery as pq | |
| import urllib2 | |
| from datetime import datetime | |
| from dateutil.parser import parse | |
| import PyRSS2Gen as rss | |
| from pytz import timezone | |
| import re | |
| import sys | |
# Feed timestamps are normalized to GMT before being written out.
GMT = timezone('GMT')
# Author landing page; pyquery fetches and parses it in one step.
d = pq(url='http://nymag.com/frank-rich/')
# Accumulates rss.RSSItem objects from both scrape passes below.
items = []
| feature_link = d('ul#package-frich a')[0].get('href') | |
| try: | |
| feature_article = pq(urllib2.urlopen(feature_link).read()) | |
| timestamp_raw = feature_article('li.date')[0].text | |
| date = re.search(r"Published (.*)", timestamp_raw).group(1) + " 12:00:00 -0500" | |
| article_pubtime = parse(date).astimezone(GMT) | |
| title = feature_article('h2.primary')[0].text | |
| description = feature_article('h3.deck')[0].text | |
| img_url = feature_article('div#story img')[0].get('src') | |
| img = urllib2.urlopen(img_url) | |
| items.append(rss.RSSItem( | |
| title = title, | |
| link = feature_link, | |
| description = description, | |
| guid = rss.Guid(feature_link), | |
| enclosure = rss.Enclosure( | |
| url = img_url, | |
| length = img.headers['Content-Length'], | |
| type = img.headers['Content-Type'] | |
| ), | |
| pubDate = article_pubtime | |
| )) | |
| except: | |
| pass | |
| for i, post in enumerate(d('div#results-wrap article')): | |
| print "On article %d" % i | |
| article = pq(post) | |
| article_url_elem = article('header h3 a')[0] | |
| article_url = article_url_elem.get('href') | |
| try: | |
| article_raw = pq(urllib2.urlopen(article_url).read()) | |
| date = article_raw('li.timestamp')[0].text + " -0500" | |
| article_pubtime = parse(date).astimezone(GMT) | |
| except: | |
| try: | |
| date = article_raw('li.date')[0].text + " -0500" | |
| article_pubtime = parse(date).astimezone(GMT) | |
| except: | |
| date = article('header span')[0].text + " 12:00 -0500" | |
| article_pubtime = parse(date).astimezone(GMT) | |
| print date | |
| print article_pubtime | |
| title = article_url_elem.text | |
| description = article('header p')[0].text | |
| if type(description) == unicode: # Hack | |
| description = description.encode('ascii', 'ignore') | |
| img_url = article('div.image img')[0].get('src') | |
| img = urllib2.urlopen(img_url) | |
| items.append(rss.RSSItem( | |
| title = title, | |
| link = article_url, | |
| description = description, | |
| guid = rss.Guid(article_url), | |
| enclosure = rss.Enclosure( | |
| url = img_url, | |
| length = img.headers['Content-Length'], | |
| type = img.headers['Content-Type'] | |
| ), | |
| pubDate = article_pubtime | |
| )) | |
| items = sorted(items, key=lambda item: item.pubDate)[::-1][0:20] | |
| feed = rss.RSS2( | |
| title = "Frank Rich - All posts", | |
| link = 'http://nymag.com/frank-rich/', | |
| description = "Frank Rich's posts, scraped from his author page", | |
| lastBuildDate = datetime.now(), | |
| items = items | |
| ) | |
| try: | |
| feed.write_xml(open('rich-feed.xml', 'w')) | |
| except IOError, e: | |
| print "There was an error!" | |
| print e |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment