Created December 22, 2011 11:50
Fetch RSS Feeds Using Multiple Processes
#!/usr/bin/env python
import Queue
import multiprocessing
import urllib2
import feedparser
import socket

feeds = ['http://today.reuters.com/rss/topNews',
         'http://today.reuters.com/rss/domesticNews',
         'http://today.reuters.com/rss/worldNews',
         'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml',
         'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml',
         'http://www.nytimes.com/services/xml/rss/nyt/International.xml',
         'http://news.google.com/?output=rss',
         'http://feeds.salon.com/salon/news',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss',
         'http://rss.cnn.com/rss/edition.rss',
         'http://rss.cnn.com/rss/edition_world.rss',
         'http://rss.cnn.com/rss/edition_us.rss']

# timeout for each feed fetch (in seconds)
FEED_TIMEOUT = 20
def fetch_urls(work_queue, results_queue):
    '''worker function - gets feed urls from the queue and parses the feeds'''
    while True:
        # grab a feed url from the queue
        try:
            feed_url = work_queue.get(block=False)
        except Queue.Empty:
            # the queue is empty - this ends the worker process
            break
        # download the feed
        try:
            feed = urllib2.urlopen(feed_url, timeout=FEED_TIMEOUT).read()
        except (urllib2.URLError, socket.timeout):
            continue  # ignore urls that fail or time out
        # parse the feed
        parsed_feed = feedparser.parse(feed)
        for entry in parsed_feed.entries:
            # push each entry link into the results queue
            if 'link' in entry:
                results_queue.put(entry.link)
def main():
    # create and populate the work queue with all the feed urls
    work_queue = multiprocessing.Queue()
    for feed in feeds:
        work_queue.put(feed)
    # create the results queue for the links extracted from the feeds
    results_queue = multiprocessing.Queue()
    # spawn one fetch worker per feed and pass both queues to each
    workers = []
    for _ in range(len(feeds)):
        worker = multiprocessing.Process(target=fetch_urls,
                                         args=(work_queue, results_queue))
        worker.start()
        workers.append(worker)
    # drain the results queue while the workers run - joining a process
    # that still has items buffered in a multiprocessing.Queue can deadlock
    links = []
    while any(w.is_alive() for w in workers) or not results_queue.empty():
        try:
            links.append(results_queue.get(timeout=1))
        except Queue.Empty:
            pass
    # wait for all the workers to finish
    for worker in workers:
        worker.join()
    print '%d links collected' % len(links)

if __name__ == '__main__':
    main()
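
The script above targets Python 2 (Queue, urllib2, the print statement). As a minimal sketch of the same fetch-and-parse pattern on Python 3, the standard library's concurrent.futures can replace the hand-rolled queues and worker processes; the shortened feed list and the fetch_links helper below are illustrative rather than part of the original gist, and feedparser is assumed to be installed:

#!/usr/bin/env python3
import concurrent.futures
import urllib.request

import feedparser

FEED_TIMEOUT = 20  # per-feed fetch timeout in seconds

def fetch_links(feed_url):
    '''Fetch one feed and return the entry links it contains.'''
    try:
        with urllib.request.urlopen(feed_url, timeout=FEED_TIMEOUT) as resp:
            raw = resp.read()
    except OSError:
        # URLError and socket.timeout are both OSError subclasses in Python 3
        return []
    parsed = feedparser.parse(raw)
    return [entry.link for entry in parsed.entries if 'link' in entry]

if __name__ == '__main__':
    feeds = ['http://rss.cnn.com/rss/edition.rss',
             'http://rss.cnn.com/rss/edition_world.rss']
    with concurrent.futures.ProcessPoolExecutor() as pool:
        for links in pool.map(fetch_links, feeds):
            for link in links:
                print(link)

ProcessPoolExecutor.map hands each url to a worker process and collects the return values, so no explicit work or results queues are needed and the drain-before-join pitfall disappears.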