Created
April 16, 2019 03:40
-
-
Save krishpop/6c7da5e47acccc0037cd7bc2b2806cd7 to your computer and use it in GitHub Desktop.
Similarity metrics for Sparse Matrices
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def jaccard_metric(x, y):
    """
    Weighted Jaccard similarity between two sparse row vectors.

    x: scipy.sparse CSR matrix shape (1, n)
    y: scipy.sparse CSR matrix shape (1, n)
    returns: sum(min(x, y)) / sum(max(x, y)) as a scalar

    Note: when the element-wise maximum sums to 0 (e.g. both inputs are
    entirely zero) the division is 0/0 and the result is nan.
    """
    intersection = x.minimum(y).sum()
    union = x.maximum(y).sum()
    return intersection / union
def l2_metric(x, y):
    """
    Negated Euclidean distance between two sparse row vectors.

    x: scipy.sparse CSR matrix shape (1, n)
    y: scipy.sparse CSR matrix shape (1, n)
    returns: l2 similarity, i.e. -||x - y|| (0 for identical inputs,
        more negative the further apart they are)
    """
    from scipy.sparse import linalg

    # The distance is negated so that larger values mean "more similar".
    return -linalg.norm(x - y)
def cos_metric(x, y):
    """
    Cosine similarity between two sparse row vectors.

    x: scipy.sparse CSR matrix shape (1, n)
    y: scipy.sparse CSR matrix shape (1, n)
    returns: cosine similarity as a scalar

    Note: if either input has zero norm the denominator is 0 and the
    result is nan.
    """
    from scipy.sparse import linalg

    # Both inputs are row vectors, so y must be transposed for the inner
    # product; x.dot(y) would raise a dimension mismatch for n > 1.
    # The product is a 1x1 sparse matrix, hence the [0, 0] to get a scalar.
    inner = x.dot(y.T)[0, 0]
    return inner / (linalg.norm(x) * linalg.norm(y))
def jaccard_pdist(X, Y):
    """
    Pairwise weighted Jaccard similarity between the rows of X and Y.

    X: scipy.sparse CSR matrix, shape (m1, n)
    Y: scipy.sparse CSR matrix, shape (m2, n)
    returns: dense array of shape (m1, m2); entry (i, j) is
        sum(min(X[i], Y[j])) / sum(max(X[i], Y[j]))

    Note: a pair whose element-wise maximum sums to 0 yields nan (0/0).
    """
    import numpy as np

    m1, m2 = X.shape[0], Y.shape[0]
    if m2 == 0:
        # Nothing to compare against; stacking an empty list would raise.
        return np.empty((m1, 0))

    columns = []
    for j in range(m2):
        # Repeat row j of Y once per row of X (m1 times, not m2) so the
        # element-wise minimum/maximum operate on equally shaped matrices.
        y_rep = Y[np.full(m1, j)]
        num = np.asarray(X.minimum(y_rep).sum(axis=1)).reshape(-1)
        denom = np.asarray(X.maximum(y_rep).sum(axis=1)).reshape(-1)
        columns.append(num / denom)
    return np.column_stack(columns)
def l2_pdist(X, Y):
    """
    Pairwise negated Euclidean distance between the rows of X and Y.

    help from https://stackoverflow.com/a/37903795
    X: scipy.sparse CSR matrix, shape (m1, n)
    Y: scipy.sparse CSR matrix, shape (m2, n)
    returns: dense array of shape (m1, m2); entry (i, j) is
        -||X[i] - Y[j]|| (negated so that larger means more similar)
    """
    import numpy as np

    m1, m2 = X.shape[0], Y.shape[0]
    if m2 == 0:
        # Nothing to compare against; stacking an empty list would raise.
        return np.empty((m1, 0))

    columns = []
    for j in range(m2):
        # Repeat row j of Y once per row of X (m1 times, not m2) so the
        # sparse subtraction operates on equally shaped matrices.
        diff = X - Y[np.full(m1, j)]
        sq_dist = np.asarray(diff.power(2).sum(axis=1)).reshape(-1)
        columns.append(-np.sqrt(sq_dist))
    return np.column_stack(columns)
def cos_pdist(X, Y):
    """
    Pairwise cosine similarity between the rows of X and Y.

    help from https://stackoverflow.com/a/43493487
    X: scipy.sparse CSR matrix, shape (m1, n)
    Y: scipy.sparse CSR matrix, shape (m2, n)
    returns: dense array of shape (m1, m2); entry (i, j) is
        X[i] . Y[j] / (||X[i]|| * ||Y[j]||)

    Note: rows with zero norm produce nan in the corresponding entries.
    """
    import numpy as np

    # Row norms are reshaped explicitly to a column (for X) and a row (for Y)
    # so broadcasting against the (m1, m2) dot-product matrix does not depend
    # on what shape the sparse row-sum happens to return.
    x_norms = np.sqrt(np.asarray(X.power(2).sum(axis=1)).reshape(-1, 1))
    y_norms = np.sqrt(np.asarray(Y.power(2).sum(axis=1)).reshape(1, -1))
    dots = X.dot(Y.T).toarray()
    return (dots / x_norms) / y_norms
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment