Skip to content

Instantly share code, notes, and snippets.

@marcelcaraciolo
Created August 22, 2012 20:25
Show Gist options
  • Save marcelcaraciolo/3429048 to your computer and use it in GitHub Desktop.
Save marcelcaraciolo/3429048 to your computer and use it in GitHub Desktop.
VectorSimilarities
class VectorSimilarities(MRJob):
    '''
    MRJob subclass that wires up a four-step pipeline and its options.

    This class is a base: ``input`` raises ``NotImplementedError`` and must
    be overridden by a subclass before the job can run.
    '''

    def steps(self):
        '''
        Return the list of steps for this job, in execution order.

        Every step is created with ``self.mr(first, second)``. The two
        positional arguments are presumably the mapper and the reducer
        (NOTE(review): confirm against the mrjob version in use); the
        second step passes ``None`` in the first position.
        '''
        stages = (
            (self.input, self.group_by_user_rating),
            (None, self.count_ratings_users_freq),
            (self.pairwise_items, self.calculate_similarity),
            (self.calculate_ranking, self.top_similar_items),
        )
        return [self.mr(first, second) for first, second in stages]

    def configure_options(self):
        '''
        Register the command-line options understood by this job.

        The parent class's ``configure_options`` runs first; afterwards five
        pass-through options, all declared with ``type='int'``, are added in
        the order listed below.
        '''
        super(VectorSimilarities, self).configure_options()

        # (flag, dest, default, help) -- one row per pass-through option.
        int_options = (
            ('--priorcount', 'prior_count', 10,
             'PRIOR_COUNT: Parameter to regularize correlation'),
            ('--priorcorrelation', 'prior_correlation', 0,
             'PRIOR_CORRELATION: Parameter to regularize correlation'),
            ('--minraters', 'min_num_raters', 3,
             'the minimum number of raters'),
            ('--maxraters', 'max_num_raters', 10000,
             'the maximum number of raters'),
            ('--minintersec', 'min_intersection', 0,
             'the minimum intersection'),
        )
        for flag, dest_name, default_value, help_text in int_options:
            self.add_passthrough_option(
                flag, dest=dest_name, default=default_value, type='int',
                help=help_text)

    def input(self, key, line):
        '''
        Input hook used as the first argument of the first step in ``steps``.

        The base implementation does nothing useful: it always raises
        ``NotImplementedError`` so that subclasses are forced to provide
        their own version.
        '''
        message = 'Implement this in the subclass'
        raise NotImplementedError(message)
...
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment