Created
January 12, 2011 04:45
-
-
Save raliste/775704 to your computer and use it in GitHub Desktop.
Distancia euclidiana y factor de correlación de Pearson. ¡Similitudes en sets pequeños!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from math import sqrt
# Sample preference data: maps each person's name to a dict of
# {movie title: rating}. Not everyone has rated every movie ('Rod' has
# only three ratings), which is what the recommendation code relies on.
critics = {
    'Rod': {
        'Superman': 1.0,
        'Forest Gump': 7.0,
        'Transformers': 2.5,
    },
    'Novia': {
        'Superman': 0.8,
        'Forest Gump': 6.5,
        'Transformers': 2.3,
        'XXY': 1.0,
        'Munich': 3.0,
    },
    'Jose': {
        'Superman': 3.5,
        'Forest Gump': 3.0,
        'Transformers': 2.5,
        'XXY': 4.0,
        'Munich': 5.0,
    },
    'Andres': {
        'Superman': 1.0,
        'Forest Gump': 2.0,
        'Transformers': 7.0,
        'XXY': 1.0,
        'Munich': 4.5,
    },
}
def sim_distance(prefs, person1, person2):
    """Return a Euclidean-distance-based similarity score for two people.

    Args:
        prefs: dict mapping person -> {item: numeric rating}.
        person1: key of the first person in ``prefs``.
        person2: key of the second person in ``prefs``.

    Returns:
        ``1 / (1 + d)`` where ``d`` is the Euclidean distance between the
        two people's ratings over the items both have rated; this is 1.0
        for identical ratings and approaches 0 as they diverge. Returns 0
        when the two people have no rated item in common.

    Raises:
        KeyError: if ``person1`` or ``person2`` is not a key of ``prefs``.
    """
    ratings1 = prefs[person1]
    ratings2 = prefs[person2]

    # Only items rated by both people contribute to the distance.
    shared = [item for item in ratings1 if item in ratings2]
    if not shared:
        return 0

    sum_of_squares = sum(
        [pow(ratings1[item] - ratings2[item], 2) for item in shared]
    )
    return 1 / (1 + sqrt(sum_of_squares))
def sim_pearson(prefs, person1, person2):
    """Return the Pearson correlation score for two people.

    The correlation is computed over the items both people have rated.

    Args:
        prefs: dict mapping person -> {item: numeric rating}.
        person1: key of the first person in ``prefs``.
        person2: key of the second person in ``prefs``.

    Returns:
        A value in [-1, 1]. Returns 0 when the two people share no rated
        item, or when the denominator is zero (at least one of them gave
        the same rating to every shared item, so no correlation is
        defined).

    Raises:
        KeyError: if ``person1`` or ``person2`` is not a key of ``prefs``.
    """
    ratings1 = prefs[person1]
    ratings2 = prefs[person2]

    # Items rated by both people, in person1's iteration order.
    shared = [item for item in ratings1 if item in ratings2]
    if not shared:
        return 0

    # A float count keeps the divisions below true divisions even when
    # every rating is an int on an interpreter where int / int truncates.
    n = float(len(shared))

    xs = [ratings1[item] for item in shared]
    ys = [ratings2[item] for item in shared]

    # Add up all the prefs
    sum1 = sum(xs)
    sum2 = sum(ys)

    # Sum up the squares
    sum1_sq = sum([pow(x, 2) for x in xs])
    sum2_sq = sum([pow(y, 2) for y in ys])

    # Sum up the products
    p_sum = sum([x * y for x, y in zip(xs, ys)])

    # Calculate pearson
    num = p_sum - (sum1 * sum2 / n)

    # Mathematically these terms are non-negative, but floating-point
    # rounding can leave them slightly below zero for (near-)constant
    # ratings, which would make sqrt() raise ValueError. Clamp to zero.
    spread1 = max(sum1_sq - pow(sum1, 2) / n, 0.0)
    spread2 = max(sum2_sq - pow(sum2, 2) / n, 0.0)
    den = sqrt(spread1 * spread2)
    if den == 0:
        return 0

    # Rounding can also push the ratio marginally outside [-1, 1].
    return max(-1.0, min(1.0, num / den))
def topMatches(prefs, person, n=10, sim_algorithm=sim_pearson):
    """Return the best matches for ``person`` from the ``prefs`` dict.

    Computes the similarity score of ``person`` against every other key
    in ``prefs`` and returns the highest-scoring entries.

    Args:
        prefs: dict mapping person -> {item: numeric rating}.
        person: key in ``prefs`` to find matches for.
        n: maximum number of matches to return.
        sim_algorithm: callable ``(prefs, person, other) -> score`` used
            to compare two people.

    Returns:
        A list of at most ``n`` ``(score, other_person)`` tuples sorted in
        descending order. Tuples compare element-wise, so equal scores are
        ordered by the person key, also descending.
    """
    scores = [
        (sim_algorithm(prefs, person, other), other)
        for other in prefs
        if other != person
    ]
    # Highest similarity first.
    scores.sort(reverse=True)
    return scores[0:n]
def getRecommendations(prefs, person, sim_algorithm=sim_pearson):
    """Rank items ``person`` has not rated by similarity-weighted score.

    For every other user whose similarity to ``person`` is positive, each
    of that user's ratings for an item ``person`` has not rated (or has
    rated 0) is weighted by the similarity. The weighted total per item is
    then divided by the sum of the similarities that contributed to it.

    Args:
        prefs: dict mapping person -> {item: numeric rating}.
        person: key in ``prefs`` to produce recommendations for.
        sim_algorithm: callable ``(prefs, person, other) -> score`` used
            to compare two people.

    Returns:
        A list of ``(predicted_rating, item)`` tuples sorted in descending
        order. The list is empty when no user with positive similarity
        has rated an item that ``person`` is missing.
    """
    totals = {}
    sim_sums = {}
    for user in prefs:
        if user == person:
            continue
        sim = sim_algorithm(prefs, person, user)
        # Users with zero or negative similarity contribute nothing.
        if sim <= 0:
            continue
        for item in prefs[user]:
            # Only score items the person hasn't rated yet (a rating of 0
            # is treated as "not rated").
            if item in prefs[person] and prefs[person][item] != 0:
                continue
            # Similarity * rating: the closer a user is to the person,
            # the more that user's rating counts.
            totals[item] = totals.get(item, 0) + prefs[user][item] * sim
            # Sum of the similarities that contributed to this item.
            sim_sums[item] = sim_sums.get(item, 0) + sim

    # Normalise so items rated by many users are not favoured just for
    # having more contributions. Every sim_sums entry is > 0 because only
    # positive similarities are accumulated.
    rankings = [
        (total / sim_sums[item], item) for item, total in totals.items()
    ]
    rankings.sort(reverse=True)
    return rankings
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment