Created
September 14, 2012 20:37
-
-
Save shaldengeki/3724630 to your computer and use it in GitHub Desktop.
User-similarities MapReduce job. This doesn't work!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from mrjob.job import MRJob | |
# Similarity metrics live in calculate_metrics() below;
# add new metrics to that function.
def calculate_metrics(v1, v2):
    '''
    Calculates similarity metrics between two unfilled (sparse) vectors.

    v1 and v2 are dicts mapping a dimension (e.g. a topicID) to a numeric
    weight (e.g. a post count); a dimension missing from a dict counts as 0.

    Returns [cosine_similarity, jaccard_index]:
      cosine_similarity - dot(v1, v2) / (|v1| * |v2|), or 0.0 when either
                          vector has zero length.
      jaccard_index     - shared dimensions / all dimensions present in
                          either vector, or 0.0 when both vectors are empty.
    To add a metric, compute it here and append it to the returned list.
    '''
    shared = set(v1) & set(v2)
    union_size = len(v1) + len(v2) - len(shared)

    # Only dimensions present in both vectors contribute to the dot product.
    dot = sum(v1[dim] * v2[dim] for dim in shared)
    norm1 = sum(weight * weight for weight in v1.values()) ** 0.5
    norm2 = sum(weight * weight for weight in v2.values()) ** 0.5
    denominator = norm1 * norm2

    # Guard both ratios so empty or all-zero vectors give 0.0, not an error.
    cosine = dot / denominator if denominator else 0.0
    jaccard = float(len(shared)) / union_size if union_size else 0.0
    return [cosine, jaccard]
class MRUserSimilarities(MRJob):
    '''
    Two-step mrjob job that turns userID/topicID/postCount lines into
    pairwise user similarity metrics.

    Step 1 parses the input and gathers a topicID:postCount dict per user.
    Step 2 runs calculate_metrics() on every pair of users and emits
    (userID, [userID2, metric1, metric2, ...]) with userID < userID2.

    NOTE(review): data is handed between phases through the in-memory dicts
    self.postCounts and self.metrics, so each task presumably only sees the
    users it processed itself. Output is complete only if every phase runs
    as a single task - confirm against the runner in use.
    '''

    def __init__(self, *args, **kwargs):
        super(MRUserSimilarities, self).__init__(*args, **kwargs)
        # userID -> {topicID: postCount}, filled by assemble_user_postcounts.
        self.postCounts = {}
        # userID -> {userID2: [metric, ...]}, filled by get_similarities.
        self.metrics = {}

    def map_identity(self, key, value):
        '''
        Pass-through mapper. Not referenced by steps().
        '''
        yield key, value

    def return_userIDs(self):
        '''
        Final reducer of step 1.
        Emits (userID, full userID:topicID:postCount dict) once per user, so
        every get_similarities call receives all users' counts.
        '''
        for userID in self.postCounts:
            yield (userID, self.postCounts)

    def return_metric_userIDs(self):
        '''
        Final mapper of step 2.
        Emits (userID, userID2:metrics dict) for every user that has metrics.
        '''
        for userID in self.metrics:
            yield (userID, self.metrics[userID])

    def get_postcounts(self, key, line):
        '''
        Takes file input of the form userID\ttopicID\tpostCount
        Returns (userID, (topicID, postCount))
        Blank lines are skipped.
        '''
        splitLine = line.split()
        if not splitLine:
            # e.g. a trailing empty line at the end of the input file
            return
        yield int(splitLine[0]), (int(splitLine[1]), int(splitLine[2]))

    def final_get_postcounts(self):
        '''
        Final mapper of step 1.
        Emits (userID, topicID:postCount dict) for whatever is already in
        self.postCounts. No mapper in this class fills that dict.
        '''
        # items() works on both Python 2 and 3; iteritems() is Python 2 only.
        for userID, postDict in self.postCounts.items():
            yield userID, postDict

    def assemble_user_postcounts(self, userID, values):
        '''
        Takes userID as key and a list of (topicID, postCount) tuples as values
        Sets the global userID:topicID:postCount dict values.
        Emits nothing itself; return_userIDs emits the assembled dicts.
        '''
        postDict = dict(values)
        # Merge rather than replace in case the same userID is seen twice.
        self.postCounts.setdefault(userID, {}).update(postDict)

    def get_similarities(self, userID, postCounts):
        '''
        Takes userID and the full userID:topicID:postCount dict emitted by
        return_userIDs.
        Stores metrics for every pair (userID, userID2) with userID2 > userID
        in self.metrics; return_metric_userIDs emits them.
        '''
        # NOTE(review): mrjob's default JSON protocol turns dict keys into
        # strings between steps while the record key stays an int, so the
        # user IDs are converted back before comparing or indexing.
        countsByUser = dict(
            (int(otherID), topicCounts)
            for otherID, topicCounts in postCounts.items())
        ownCounts = countsByUser[userID]
        for userID2, otherCounts in countsByUser.items():
            # Compare each unordered pair once, from the smaller ID's side.
            if userID2 > userID:
                pairMetrics = calculate_metrics(ownCounts, otherCounts)
                self.metrics.setdefault(userID, {})[userID2] = pairMetrics

    def output_similarities(self, userID, userMetrics):
        '''
        Takes a userID and the userID2:metrics dicts emitted for it (an
        iterable of dicts, as a reducer receives them; a single dict is
        also accepted).
        Returns outputtable values: (userID, [userID2, metric1, ...]).
        '''
        if isinstance(userMetrics, dict):
            userMetrics = [userMetrics]
        for metricsByUser in userMetrics:
            for userID2, pairMetrics in metricsByUser.items():
                # int() undoes the string keys produced by JSON serialization.
                yield userID, [int(userID2)] + list(pairMetrics)

    def steps(self):
        '''
        Declares the two map/reduce steps of the job.
        '''
        # NOTE(review): newer mrjob releases build steps with
        # mrjob.step.MRStep instead of self.mr() - confirm against the
        # installed mrjob version.
        return [self.mr(mapper=self.get_postcounts,
                        mapper_final=self.final_get_postcounts,
                        reducer=self.assemble_user_postcounts,
                        reducer_final=self.return_userIDs),
                self.mr(mapper=self.get_similarities,
                        mapper_final=self.return_metric_userIDs,
                        reducer=self.output_similarities)]
# Run the job only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    MRUserSimilarities.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment