Script to fetch a CBC RSS feed and count the words
#!/usr/bin/env python
import threading
from lxml import etree, html
import urllib
from StringIO import StringIO

# mapper function: emits a (word, 1) pair for every word in the document
def mapper(document):
    words = document.split()
    for w in words:
        # strip quote characters from the word
        w = w.replace("'", "").replace('"', '')
        yield w, 1

# the reduce function is just a sum of the emitted counts
reduce = sum

# global word counter, shared by all threads
_words_count = dict()
# a single module-level lock protecting _words_count; allocating a fresh lock
# inside each thread would not give mutual exclusion
_words_lock = threading.Lock()
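# Illustrative example, not part of the original gist: for a toy document,
#   list(mapper('to be or not to be'))
# yields [('to', 1), ('be', 1), ('or', 1), ('not', 1), ('to', 1), ('be', 1)],
# and the reduce step (sum) later collapses repeated keys, e.g. reduce([1, 1]) == 2 for 'to'.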
class MapReduceThread(threading.Thread):
    def __init__(self, link):
        threading.Thread.__init__(self)
        self.link = link

    def run(self):
        words_count = dict()
        page = html.parse(self.link)
        # xpath to get the text of each paragraph inside the div with id=storybody
        content = page.xpath("//div[@id='storybody']/p/text()")
        for doc in content:
            # map each document (one mapper per document, so the map phase can be
            # threaded and the reduce triggered afterwards)
            for word, weight in mapper(doc):
                # setdefault returns the list for that word, creating an empty list() if the key is missing
                words_count.setdefault(word, list()).append(weight)
        with _words_lock:
            # reduce: merge this thread's local counts into the global counter
            for word, group in words_count.items():
                # get the current global count for that word, defaulting to 0
                n = _words_count.setdefault(word, 0)
                _words_count[word] = n + reduce(group)
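# Illustrative example, not part of the original gist: if one thread's local words_count
# ends up as {'the': [1, 1]} and another's as {'the': [1]}, the merge step sums each
# group, so the shared _words_count holds {'the': 3} once both critical sections have run.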
def main():
    # feed URL
    URL = 'http://rss.cbc.ca/lineup/topstories.xml'
    # open the feed
    feed = urllib.urlopen(URL)
    sio = StringIO(feed.read())
    # parse the content into an XML tree
    tree = etree.parse(sio)
    # xpath to find all article links in the feed
    arr = tree.xpath('/rss/channel/item/link')
    threads = []
    for rr in arr:
        try:
            thread1 = MapReduceThread(rr.text)
            thread1.start()
            threads.append(thread1)
        except:
            print "Error: unable to start thread"
    # wait for MapReduce to finish on all documents
    for t in threads:
        t.join()
    # sort by count and keep the top 50 words
    words_arr = sorted(_words_count, key=_words_count.__getitem__, reverse=True)[:50]
    # display them: rank, count, word
    count = 1
    for w in words_arr:
        print '%d %d %s' % (count, _words_count[w], w)
        count += 1

if __name__ == '__main__':
    main()
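The script targets Python 2 (urllib.urlopen, StringIO and print statements) and needs lxml installed. A hypothetical run, assuming the file is saved as cbc_wordcount.py (the actual filename is not shown here), prints one line per word in the format rank, count, word:

$ python cbc_wordcount.py
1 <count> <most frequent word>
2 <count> <second most frequent word>
...
50 <count> <fiftieth word>

The exact words and counts depend on whatever the CBC top-stories feed contains at the time of the run.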