Skip to content

Instantly share code, notes, and snippets.

@shaldengeki
Created September 14, 2012 20:37
Show Gist options
  • Save shaldengeki/3724630 to your computer and use it in GitHub Desktop.
Save shaldengeki/3724630 to your computer and use it in GitHub Desktop.
User-similarities MapReduce job. This doesn't work!
from mrjob.job import MRJob
# Put similarity metrics here.
# Add new metrics to the function below.
def calculate_metrics(v1, v2):
    '''
    Calculates similarity metrics between two unfilled (sparse) vectors.

    v1, v2 are dicts mapping a key (e.g. a topic ID) to a numeric count;
    a key missing from a dict is treated as a zero entry.

    Returns [cosine_similarity, jaccard_index]:
      * cosine_similarity -- dot product over the shared keys divided by
        the product of the two Euclidean norms (0.0 if either norm is 0).
      * jaccard_index -- number of shared keys divided by the number of
        distinct keys across both vectors (0.0 if both are empty).

    To add a metric, compute it below and append it to the returned list.
    '''
    shared = set(v1) & set(v2)

    # Only keys present in both vectors contribute to the dot product.
    dot = sum(v1[key] * v2[key] for key in shared)
    norm1 = sum(count * count for count in v1.values()) ** 0.5
    norm2 = sum(count * count for count in v2.values()) ** 0.5
    # Guard against division by zero for empty / all-zero vectors.
    cosine = dot / (norm1 * norm2) if norm1 and norm2 else 0.0

    union_size = len(v1) + len(v2) - len(shared)
    # float() keeps this true division on Python 2 as well.
    jaccard = float(len(shared)) / union_size if union_size else 0.0

    return [cosine, jaccard]
class MRUserSimilarities(MRJob):
    '''
    Two-step mrjob job that computes similarity metrics between users.

    Step 1 parses "userID topicID postCount" lines and collects, per task,
    a userID -> {topicID: postCount} table in self.postCounts.
    Step 2 compares every user with each higher-numbered user via
    calculate_metrics() and emits (userID, [userID2, metric1, ...]).

    NOTE(review): self.postCounts and self.metrics are plain instance
    attributes, so each task only knows about the records it processed
    itself. Two users are therefore only compared when the same step-1
    reducer task handled both of them -- confirm the runner configuration
    (e.g. a single reducer) if complete pairwise output is required.
    '''

    def __init__(self, *args, **kwargs):
        super(MRUserSimilarities, self).__init__(*args, **kwargs)
        # userID -> {topicID: postCount}, filled by assemble_user_postcounts.
        self.postCounts = {}
        # userID -> {userID2: [metric, ...]}, filled by get_similarities.
        self.metrics = {}

    def map_identity(self, key, value):
        '''Pass-through mapper (not referenced by steps()).'''
        yield key, value

    def return_userIDs(self):
        '''
        reducer_final of step 1.
        Yields (userID, whole userID:topicID:postCount table) per known user
        so that get_similarities can look up the other users' vectors.
        '''
        for userID in self.postCounts:
            yield (userID, self.postCounts)

    def return_metric_userIDs(self):
        '''
        mapper_final of step 2.
        Yields (userID, {userID2: metrics}) for every user with results.
        '''
        for userID in self.metrics:
            yield (userID, self.metrics[userID])

    def get_postcounts(self, key, line):
        '''
        Takes file input of the form userID<TAB>topicID<TAB>postCount
        (any whitespace separates the fields).
        Returns (userID, (topicID, postCount)); blank lines are skipped.
        '''
        splitLine = line.split()
        if not splitLine:
            # A trailing or stray blank line carries no record.
            return
        yield int(splitLine[0]), (int(splitLine[1]), int(splitLine[2]))

    def final_get_postcounts(self):
        '''
        mapper_final of step 1.
        Yields whatever is in self.postCounts; that table is only written
        by the reducer, so this presumably emits nothing in a mapper task.
        '''
        # .items() instead of .iteritems() so this runs on Python 2 and 3.
        for userID, postDict in self.postCounts.items():
            yield userID, postDict

    def assemble_user_postcounts(self, userID, values):
        '''
        Takes userID as key and an iterable of (topicID, postCount) pairs
        as values.
        Sets the per-task userID:topicID:postCount dict values.
        Emits nothing; the table is output later by return_userIDs.
        '''
        # Later pairs for the same topic overwrite earlier ones.
        self.postCounts.setdefault(userID, {}).update(dict(values))

    def get_similarities(self, userID, postCounts):
        '''
        Takes userID and the userID:topicID:postCount table emitted by
        return_userIDs.
        Records similarity metrics for each (userID, userID2) pair with
        userID2 > userID in self.metrics; return_metric_userIDs emits them.
        '''
        # The table has passed through mrjob's (by default JSON) protocol
        # between steps, which turns integer dict keys into strings;
        # restore them so lookups and ordering use integers again.
        countsByUser = {int(uid): counts for uid, counts in postCounts.items()}
        ownCounts = countsByUser[userID]
        # Comparing only against larger IDs visits each pair once.
        for userID2 in sorted(uid for uid in countsByUser if uid > userID):
            metrics = calculate_metrics(ownCounts, countsByUser[userID2])
            self.metrics.setdefault(userID, {})[userID2] = metrics

    def output_similarities(self, userID, userMetrics):
        '''
        Takes a userID and the reducer's values: an iterable of
        {userID2: metrics} dicts (a single such dict is also accepted).
        Returns outputtable values: (userID, [userID2, metric1, ...]).
        '''
        if isinstance(userMetrics, dict):
            userMetrics = [userMetrics]
        for metricsByUser in userMetrics:
            # Keys may have come back from the protocol as strings.
            pairs = sorted((int(uid), metrics)
                           for uid, metrics in metricsByUser.items())
            for userID2, metrics in pairs:
                yield userID, [userID2] + list(metrics)

    def steps(self):
        '''Returns the two steps described in the class docstring.'''
        # Newer mrjob releases replace MRJob.mr() with MRStep; fall back
        # to self.mr on releases that predate mrjob.step.MRStep.
        try:
            from mrjob.step import MRStep as make_step
        except ImportError:
            make_step = self.mr
        return [make_step(mapper=self.get_postcounts,
                          mapper_final=self.final_get_postcounts,
                          reducer=self.assemble_user_postcounts,
                          reducer_final=self.return_userIDs),
                make_step(mapper=self.get_similarities,
                          mapper_final=self.return_metric_userIDs,
                          reducer=self.output_similarities)]
# Script entry point: hand control to mrjob's command-line runner.
if __name__ == '__main__':
    MRUserSimilarities.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment