Skip to content

Instantly share code, notes, and snippets.

View marcelcaraciolo's full-sized avatar
💭
Coding !

Marcel Caraciolo marcelcaraciolo

💭
Coding !
View GitHub Profile
@marcelcaraciolo
marcelcaraciolo / friends_recommender.py
Created August 26, 2012 13:08
friends recommender
class FriendsRecommender(MRJob):
def steps(self):
return [self.mr(self.map_input, self.count_number_of_friends),
self.mr(self.count_max_of_mutual_friends,
self.top_recommendations)]
def map_input(self, key, line):
'''
Compute a cartesian product using nested loops
@marcelcaraciolo
marcelcaraciolo / bookSimilarities.py
Created August 22, 2012 20:30
books similarities
from vectorSimilarities import VectorSimilarities
class BookSimilarities(VectorSimilarities):
    '''
    Subclass of VectorSimilarities that reads rating lines of the form
    user_id;item_id;rating
    '''

    def input(self, key, line):
        '''
        Parse one ';'-separated rating line.

        key  -- unused here
        line -- text of the form user_id;item_id;rating

        Yields a single pair: item_id, (user_id, rating) with the rating
        converted to float.

        Raises ValueError when the line has fewer than three fields or
        the rating is not a valid float.
        '''
        # Take user_id from the left and rating from the right so that an
        # item_id which itself contains ';' is kept whole instead of
        # breaking the three-way unpacking.
        user_id, remainder = line.split(';', 1)
        item_id, rating = remainder.rsplit(';', 1)
        yield item_id, (user_id, float(rating))
@marcelcaraciolo
marcelcaraciolo / bookSimilarities.py
Created August 22, 2012 20:30
books similarities
from vectorSimilarities import VectorSimilarities
class BookSimilarities(VectorSimilarities):
    '''
    Subclass of VectorSimilarities that reads rating lines of the form
    user_id;item_id;rating
    '''

    def input(self, key, line):
        '''
        Parse one ';'-separated rating line.

        key  -- unused here
        line -- text of the form user_id;item_id;rating

        Yields a single pair: item_id, (user_id, rating) with the rating
        converted to float.

        Raises ValueError when the line has fewer than three fields or
        the rating is not a valid float.
        '''
        # Take user_id from the left and rating from the right so that an
        # item_id which itself contains ';' is kept whole instead of
        # breaking the three-way unpacking.
        user_id, remainder = line.split(';', 1)
        item_id, rating = remainder.rsplit(';', 1)
        yield item_id, (user_id, float(rating))
@marcelcaraciolo
marcelcaraciolo / moviesSimilarities.py
Created August 22, 2012 20:29
moviesSimilarities
from vectorSimilarities import VectorSimilarities
class MovieSimilarities(VectorSimilarities):
    '''
    Subclass of VectorSimilarities that reads rating lines of the form
    user_id|item_id|rating
    '''

    def input(self, key, line):
        '''
        Parse one '|'-separated rating line.

        key  -- unused here
        line -- text of the form user_id|item_id|rating

        Yields a single pair: item_id, (user_id, rating) with the rating
        converted to float.

        Raises ValueError when the line has fewer than three fields or
        the rating is not a valid float.
        '''
        # Take user_id from the left and rating from the right so that an
        # item_id which itself contains '|' is kept whole instead of
        # breaking the three-way unpacking.
        user_id, remainder = line.split('|', 1)
        item_id, rating = remainder.rsplit('|', 1)
        yield item_id, (user_id, float(rating))
@marcelcaraciolo
marcelcaraciolo / vectorSimilarities.py
Created August 22, 2012 20:25
VectorSimilarities
class VectorSimilarities(MRJob):
    '''
    MRJob subclass whose processing pipeline is declared in steps().

    NOTE(review): the mapper/reducer methods referenced by steps() are not
    part of this excerpt; they must be defined on this class or on a
    subclass (e.g. an `input` method) -- confirm against the full file.
    '''

    def steps(self):
        '''
        Return the ordered list of steps for this job.

        Each entry is built with self.mr(a, b); presumably the first
        argument is the mapper and the second the reducer (verify against
        the mrjob version in use). The second step passes None as its
        first argument.
        '''
        return [
            self.mr(self.input, self.group_by_user_rating),
            self.mr(None, self.count_ratings_users_freq),
            self.mr(self.pairwise_items, self.calculate_similarity),
            self.mr(self.calculate_ranking, self.top_similar_items),
        ]
272263;Harry Potter and the Goblet of Fire (Book 4);8
272786;Harry Potter and the Chamber of Secrets (Book 2);8
272786;Harry Potter and the Prisoner of Azkaban (Book 3);9
272786;Harry Potter and the Goblet of Fire (Book 4);9
272786;Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback));9
272810;Harry Potter and the Chamber of Secrets (Book 2);8
272810;Harry Potter and the Prisoner of Azkaban (Book 3);9
272810;Harry Potter and the Goblet of Fire (Book 4);10
272810;Harry Potter and the Order of the Phoenix (Book 5);10
272810;Harry Potter and the Sorcerer's Stone (Book 1);8
@marcelcaraciolo
marcelcaraciolo / moviesSimilarities.py
Created August 21, 2012 19:15
moviesSimilarities
#-*-coding: utf-8 -*-
'''
Given a dataset of movies and their ratings by different
users, how can we compute the similarity between pairs of
movies?
This module computes similarities between movies
by representing each movie as a vector of ratings and
computing similarity scores over these vectors.
#-*-coding: utf-8 -*-
'''
This module computes the number of movies rated by each
user.
'''
__author__ = 'Marcel Caraciolo <[email protected]>'
def jaccard(users_in_common, total_users1, total_users2):
    '''
    The Jaccard Similarity between two vectors:
    |Intersection(A, B)| / |Union(A, B)|

    users_in_common -- size of the intersection
    total_users1    -- size of the first operand
    total_users2    -- size of the second operand

    Returns the similarity as a float, or 0.0 when the union is empty.
    '''
    # Inclusion-exclusion: the overlap is counted once in each total, so
    # subtract it once to get the size of the union.
    union = total_users1 + total_users2 - users_in_common
    # float() forces true division for integer inputs; the guard avoids a
    # ZeroDivisionError when the union is empty.
    return (users_in_common / float(union)) if union else 0.0
@marcelcaraciolo
marcelcaraciolo / regularized_correlation.py
Created August 21, 2012 19:11
regularized correlation
def regularized_correlation(size, dot_product, rating_sum, \
rating2sum, rating_norm_squared, rating2_norm_squared,
virtual_cont, prior_correlation):
'''
The Regularized Correlation between two vectors A, B
RegularizedCorrelation = w * ActualCorrelation + (1 - w) * PriorCorrelation
where w = # actualPairs / (# actualPairs + # virtualPairs).
'''
unregularizedCorrelation = correlation(size, dot_product, rating_sum, \