Last active
November 1, 2023 13:12
-
-
Save maximdanilchenko/050003cf3aad6cab6cb835c61c5e9ae1 to your computer and use it in GitHub Desktop.
Computes the PCC, CPCC, SPCC, Jaccard, MSD, JMSD, COS and ACOS similarity metrics for two rating dicts a and b of the form {'item1': rating1, 'item2': rating2, ...}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| from math import * | |
# Sum of x*y over the 'both' common elements
def scal(x,y,both):
    """Return the sum of ``x[k]*y[k]`` for every key ``k`` yielded by ``both``.

    ``both`` is only iterated; each key it yields must be present in ``x``
    and ``y``. An empty ``both`` gives 0.
    """
    products = [x[key]*y[key] for key in both]
    return sum(products)
# Sum of x over the 'both' common elements
def one(x,both):
    """Return the sum of ``x[k]`` for every key ``k`` yielded by ``both``.

    An empty ``both`` gives 0.
    """
    values = [x[key] for key in both]
    return sum(values)
# Collect the elements common to a and b
def bothcalc(a,b):
    """Return a dict mapping each key of ``a`` that is also in ``b`` to 1.

    Keys appear in the iteration order of ``a``; the result is an empty
    dict when nothing is shared.
    """
    return {key: 1 for key in a if key in b}
# PCC - Pearson correlation coefficient
def PCC(a,b):
    """Return the Pearson correlation between rating dicts ``a`` and ``b``.

    Each mean is taken over all ratings in the respective dict, while the
    numerator and the deviation sums run only over the items present in
    both dicts.

    Returns 0 when the dicts share no items or when the denominator is
    zero (at least one side has no deviation from its mean on the shared
    items).
    """
    common = [i for i in a if i in b]
    if not common:
        return 0
    # A non-empty overlap implies both dicts are non-empty, so the mean
    # divisions below cannot divide by zero.
    mean_a = sum(a[i] for i in a)/len(a)
    mean_b = sum(b[i] for i in b)/len(b)
    up = sum((a[i]-mean_a)*(b[i]-mean_b) for i in common)
    down = sqrt(sum((a[i]-mean_a)**2 for i in common)) * sqrt(sum((b[i]-mean_b)**2 for i in common))
    if down == 0:
        return 0
    return up/down
# CPCC - Constrained Pearson correlation coefficient
def CPCC(a,b,rsm=3):
    """Return the constrained Pearson correlation of ``a`` and ``b``.

    Deviations are measured from ``rsm`` (the rating scale median) rather
    than from a mean, and only items present in both dicts contribute.

    Returns 0 when the dicts share no items or when the denominator is
    zero.
    """
    shared = bothcalc(a,b)
    if not shared:
        return 0
    # Per-item deviations from the scale median, in the order of `shared`.
    dev_a = [a[item]-rsm for item in shared]
    dev_b = [b[item]-rsm for item in shared]
    numerator = sum(da*db for da, db in zip(dev_a, dev_b))
    denominator = sqrt(sum(da**2 for da in dev_a)) * sqrt(sum(db**2 for db in dev_b))
    return numerator/denominator if denominator != 0 else 0
# SPCC - Sigmoid function based Pearson coefficient
def SPCC(a,b):
    """Return ``PCC(a, b)`` divided by ``1 + exp(-n/2)``.

    ``n`` is the number of items present in both dicts, so the Pearson
    value is damped more strongly when the overlap is small. Returns 0
    when the dicts share no items.
    """
    overlap = len(bothcalc(a,b))
    if overlap == 0:
        return 0
    damping = 1+exp(-overlap/2)
    return PCC(a,b)/damping
# COS - Cosine measure
def COS(a,b):
    """Return the cosine similarity between rating dicts ``a`` and ``b``.

    The dot product runs over the items present in both dicts; each norm
    is taken over all ratings of the respective dict.

    Returns 0 when the dicts share no items or when either norm is zero
    (all ratings are 0), instead of dividing by zero.
    """
    common = [i for i in a if i in b]
    if not common:
        return 0
    up = sum(a[i]*b[i] for i in common)
    down = sqrt(sum(a[i]*a[i] for i in a)) * sqrt(sum(b[i]*b[i] for i in b))
    # An all-zero rating vector has zero norm; treat it as "no similarity".
    if down == 0:
        return 0
    return up/down
# ACOS - Adjusted cosine measure
def ACOS(a,b):
    """Return the mean-adjusted cosine similarity of ``a`` and ``b``.

    Each dict's mean (over all of its ratings) is subtracted from its
    ratings, and the cosine of the adjusted values is computed over the
    items present in both dicts.

    Returns 0 when the dicts share no items (including when either dict
    is empty) or when the denominator is zero.
    """
    # NOTE(review): only co-rated items are used, which makes this the same
    # formula as PCC. Confirm whether the union of items (missing ratings
    # counted as 0) was intended instead.
    common = [i for i in a if i in b]
    if not common:
        return 0
    # A non-empty overlap implies both dicts are non-empty, so the mean
    # divisions below cannot divide by zero.
    mean_a = sum(a[i] for i in a)/len(a)
    mean_b = sum(b[i] for i in b)/len(b)
    up = sum((a[i]-mean_a)*(b[i]-mean_b) for i in common)
    down = sqrt(sum((a[i]-mean_a)**2 for i in common)) * sqrt(sum((b[i]-mean_b)**2 for i in common))
    if down == 0:
        return 0
    return up/down
# Jaccard - Jaccard similarity
def Jaccard(a,b):
    """Return the Jaccard similarity of the key sets of ``a`` and ``b``.

    This is the number of items present in both dicts divided by the
    number of distinct items across the two. Returns 0 when the dicts
    share no items.
    """
    shared = len(bothcalc(a,b))
    if shared == 0:
        return 0
    # Inclusion-exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|.
    union = len(a)+len(b)-shared
    return shared/union
# MSD - Mean squared differences
def MSD(a,b,scale=5):
    """Return 1 minus the mean squared scaled rating difference.

    For every item present in both dicts the rating difference is divided
    by ``scale`` and squared; the mean of those values is subtracted
    from 1. Returns 0 when the dicts share no items.
    """
    shared = bothcalc(a,b)
    if not shared:
        return 0
    squared = [((a[item]-b[item])/scale)**2 for item in shared]
    return 1 - sum(squared)/len(shared)
# JMSD - Jaccard and MSD combined
def JMSD(a,b,scale=5):
    """Return the product of ``Jaccard(a, b)`` and ``MSD(a, b, scale)``.

    ``scale`` is forwarded to ``MSD`` so that a caller-supplied rating
    scale actually affects the result.
    """
    return Jaccard(a,b)*MSD(a,b,scale)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is one of the nicest efforts I have seen. Would you please explain how I can use these functions to get a similarity matrix for ML100K or a similar dataset? Thanks in advance.