Skip to content

Instantly share code, notes, and snippets.

@alvesjnr
Created June 21, 2011 20:28
Show Gist options
  • Save alvesjnr/1038810 to your computer and use it in GitHub Desktop.
Save alvesjnr/1038810 to your computer and use it in GitHub Desktop.
Second mapreduce for counting: separated blocks
#coding: utf-8
from multiprocessing.pool import ThreadPool
def mapping(value):
    """Map step: pair every item of *value* with an initial count of 1.

    Returns a list of ``(item, 1)`` tuples, in the order the items are
    produced by iterating over *value*.
    """
    pairs = []
    for token in value:
        pairs.append((token, 1))
    return pairs
def reducing(list_maps):
    """Reduce step: add up the counts of pairs that share the same key.

    *list_maps* is an iterable of indexable pairs; element 0 is used as the
    key and element 1 as the amount to add for that key.

    Returns a list of ``(key, total)`` tuples, one per distinct key, in the
    iteration order of the accumulating dictionary.
    """
    totals = {}
    for pair in list_maps:
        key = pair[0]
        amount = pair[1]
        if key not in totals:
            # First time this key is seen: start from its own amount.
            totals[key] = amount
            continue
        totals[key] += amount
    return list(totals.items())
def cut_text(text, n):
    """Split the sequence *text* into at most *n* contiguous, non-empty chunks.

    Every item of *text* ends up in exactly one chunk and the original order
    is preserved. When ``len(text)`` is not a multiple of *n*, the leftover
    items are spread one by one over the leading chunks, so chunk sizes differ
    by at most one. When ``len(text) < n`` fewer than *n* chunks are returned
    (one item each), because empty chunks are never emitted.

    Parameters:
        text: any sliceable sequence with a length (list of words, str, ...).
        n: requested number of chunks.

    Returns:
        A list of slices of *text*. The list is empty when *text* is empty or
        when *n* is not positive.
    """
    total = len(text)
    if n <= 0 or total == 0:
        return []

    # Integer arithmetic only: `base` items per chunk, and the first `extra`
    # chunks take one additional item so that nothing is dropped.
    base, extra = divmod(total, n)

    chunks = []
    start = 0
    for index in range(n):
        end = start + base
        if index < extra:
            end += 1
        if end > start:
            chunks.append(text[start:end])
        start = end
    return chunks
if __name__ == '__main__':
    # Attention: the word count is case sensitive!

    # Read the whole input up front; the context manager closes the file
    # as soon as its contents are in memory.
    with open('longtext.txt') as source:
        words = source.read().split()  # split on any run of whitespace

    processes = 12
    pool = ThreadPool(processes=processes)
    try:
        # One chunk per worker: map each chunk to (word, 1) pairs, then
        # reduce each chunk's pairs to per-chunk totals.
        chunks = cut_text(words, processes)
        mapped = pool.map(mapping, chunks)
        reduced = pool.map(reducing, mapped)
    finally:
        # Always release the worker threads, even if a step above raised.
        pool.close()
        pool.join()

    # Flatten the per-chunk totals and merge them in a final reduce pass.
    partial_counts = [pair for partial in reduced for pair in partial]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(reducing(partial_counts))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment