marcelcaraciolo · August 16, 2012 20:15
diff --git a/mapreduce_job2.py b/mapreduce_job2.py
    def pairwise_items(self, user_id, values):
        '''
        The output drops the user from the key entirely, instead it emits
        the pair of items as the key:

        19,21  2,1
        19,70  2,4
        21,70  1,4
        19,21  1,2

        This mapper is the main performance bottleneck.  One improvement
        would be to create a java Combiner to aggregate the
        outputs by key before writing to hdfs, another would be to use
        a vector format and SequenceFiles instead of streaming text
        for the matrix data.
        '''
        item_count, item_sum, ratings = values
        #print item_count, item_sum, [r for r in combinations(ratings, 2)]
        #bottleneck at combinations
        for item1, item2 in combinations(ratings, 2):
            yield (item1[0], item2[0]), \
                    (item1[1], item2[1])

    def calculate_similarity(self, pair_key, lines):
        '''
        Reduce all co-rating pairs of one item pair to a similarity record.

        ``pair_key`` must unpack into the two item names.  ``lines`` is
        iterated exactly once (so a one-shot iterator is fine); every
        element must unpack into the two ratings (x, y) for the pair.  The
        running sums of x, y, x*x, y*y and x*y plus the number of pairs
        are handed to ``normalized_correlation``, and a single record is
        yielded for the key as it was received:

            (item_x, item_y), (similarity, n)

        Per the original author's note, the similarity is a pairwise
        pearson similarity normalized to the [0,1] scale because a
        numerical sort is done on it; that scaling happens inside
        ``normalized_correlation``, which is not defined in this method.

        Sample output format from the original documentation:

        19,21   0.4,2
        21,19   0.4,2
        19,70   0.6,1
        70,19   0.6,1
        21,70   0.1,1
        70,21   0.1,1
        '''
        name_x, name_y = pair_key
        count = 0
        total_x = total_y = 0.0
        total_xx = total_yy = total_xy = 0.0
        for rating_x, rating_y in lines:
            count += 1
            total_x += rating_x
            total_y += rating_y
            total_xx += rating_x * rating_x
            total_xy += rating_x * rating_y
            total_yy += rating_y * rating_y
        score = normalized_correlation(
            count, total_xy, total_x, total_y, total_xx, total_yy)
        yield (name_x, name_y), (score, count)
	def pairwise_items(self, user_id, values):
	'''
	The output drops the user from the key entirely, instead it emits
	the pair of items as the key:

	19,21 2,1
	19,70 2,4
	21,70 1,4
	19,21 1,2

	This mapper is the main performance bottleneck. One improvement
	would be to create a java Combiner to aggregate the
	outputs by key before writing to hdfs, another would be to use
	a vector format and SequenceFiles instead of streaming text
	for the matrix data.
	'''
	item_count, item_sum, ratings = values
	#print item_count, item_sum, [r for r in combinations(ratings, 2)]
	#bottleneck at combinations
	for item1, item2 in combinations(ratings, 2):
	yield (item1[0], item2[0]), \
	(item1[1], item2[1])

	def calculate_similarity(self, pair_key, lines):
		'''
		Sum components of each corating pair across all users who rated both
		item x and item y, then calculate pairwise pearson similarity and
		corating counts. The similarities are normalized to the [0,1] scale
		because we do a numerical sort (the scaling itself is done by
		``normalized_correlation``, which is defined outside this method).

		``pair_key`` must unpack into the two item names and every element
		of ``lines`` into the two ratings for that pair.  ``lines`` is read
		in a single pass.  Exactly one record is yielded, keyed as received:
		``(item_x, item_y), (similarity, n)``.

		Sample output format:

		19,21 0.4,2
		21,19 0.4,2
		19,70 0.6,1
		70,19 0.6,1
		21,70 0.1,1
		70,21 0.1,1
		'''
		sum_xx, sum_xy, sum_yy, sum_x, sum_y, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)
		item_xname, item_yname = pair_key
		for item_x, item_y in lines:
			sum_xx += item_x * item_x
			sum_yy += item_y * item_y
			sum_xy += item_x * item_y
			sum_y += item_y
			sum_x += item_x
			n += 1
		similarity = normalized_correlation(
			n, sum_xy, sum_x, sum_y, sum_xx, sum_yy)
		yield (item_xname, item_yname), (similarity, n)