Script to fetch a CBC RSS feed and count the words
#!/usr/bin/env python
import threading
from lxml import etree, html
import urllib
from StringIO import StringIO

# mapper function: emits a (word, 1) pair for every word in the document
def mapper(document):
    words = document.split()
    for w in words:
        # strip quote characters from the word
        w = w.replace("'", "").replace('"', '')
        yield w, 1

# the reduce function is just a sum of the emitted counts
reduce = sum

# global word counter, shared by all threads
_words_count = dict()
# a single module-level lock protecting _words_count; allocating a fresh lock
# inside each thread would not give mutual exclusion
_words_lock = threading.Lock()
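# Illustrative example, not part of the original gist: for a toy document,
#   list(mapper('to be or not to be'))
# yields [('to', 1), ('be', 1), ('or', 1), ('not', 1), ('to', 1), ('be', 1)],
# and the reduce step (sum) later collapses repeated keys, e.g. reduce([1, 1]) == 2 for 'to'.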
class MapReduceThread(threading.Thread):
    def __init__(self, link):
        threading.Thread.__init__(self)
        self.link = link

    def run(self):
        words_count = dict()
        page = html.parse(self.link)
        # xpath to get the text of each paragraph inside the div with id=storybody
        content = page.xpath("//div[@id='storybody']/p/text()")
        for doc in content:
            # map each document (one mapper per document, so the map phase can be
            # threaded and the reduce triggered afterwards)
            for word, weight in mapper(doc):
                # setdefault returns the list for that word, creating an empty list() if the key is missing
                words_count.setdefault(word, list()).append(weight)
        with _words_lock:
            # reduce: merge this thread's local counts into the global counter
            for word, group in words_count.items():
                # get the current global count for that word, defaulting to 0
                n = _words_count.setdefault(word, 0)
                _words_count[word] = n + reduce(group)
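# Illustrative example, not part of the original gist: if one thread's local words_count
# ends up as {'the': [1, 1]} and another's as {'the': [1]}, the merge step sums each
# group, so the shared _words_count holds {'the': 3} once both critical sections have run.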
def main():
    # feed URL
    URL = 'http://rss.cbc.ca/lineup/topstories.xml'
    # open the feed
    feed = urllib.urlopen(URL)
    sio = StringIO(feed.read())
    # parse the content into an XML tree
    tree = etree.parse(sio)
    # xpath to find all article links in the feed
    arr = tree.xpath('/rss/channel/item/link')
    threads = []
    for rr in arr:
        try:
            thread1 = MapReduceThread(rr.text)
            thread1.start()
            threads.append(thread1)
        except:
            print "Error: unable to start thread"
    # wait for MapReduce to finish on all documents
    for t in threads:
        t.join()
    # sort by count and keep the top 50 words
    words_arr = sorted(_words_count, key=_words_count.__getitem__, reverse=True)[:50]
    # display them: rank, count, word
    count = 1
    for w in words_arr:
        print '%d %d %s' % (count, _words_count[w], w)
        count += 1

if __name__ == '__main__':
    main()
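The script targets Python 2 (urllib.urlopen, StringIO and print statements) and needs lxml installed. A hypothetical run, assuming the file is saved as cbc_wordcount.py (the actual filename is not shown here), prints one line per word in the format rank, count, word:

$ python cbc_wordcount.py
1 <count> <most frequent word>
2 <count> <second most frequent word>
...
50 <count> <fiftieth word>

The exact words and counts depend on whatever the CBC top-stories feed contains at the time of the run.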