Last active
December 27, 2022 07:29
-
-
Save ialexpovad/d8d15b03d59ca1bbd822ab66d5c6cc0f to your computer and use it in GitHub Desktop.
Сalculate squared euclidean distance matrices in Python.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Several ways to calculate squared Euclidean Distance (Frobenius norm) matrices in Python | |
| # Import modules first: | |
| ## import numpy module | |
| import numpy as np | |
| ## import numpy linear algebra module | |
| import numpy.linalg as la | |
| ## import scipy spatial module | |
| import scipy.spatial as spt | |
| def compute_ED_npNorm(M): | |
| """ | |
| Method1: Using numpy.linalg.norm to compute the squared Euclidean Distance | |
| as norm which compute square roots, it is comparatively costly, it should be avoided if possible. | |
| """ | |
| # determine demensions of data matrix M | |
| m,n = M.shape | |
| # initialize the squared Euclidean Distanse matrix E | |
| E = np.zeros((n,n)) | |
| # iteration on upper triangle of matrix | |
| for i in range(n): | |
| for j in range(i + 1, n): | |
| E[i,j] = la.norm(M[:,i] - M[:,j])**2 | |
| E[j,i] = E[i,j] | |
| return E | |
| def compute_ED_npDot(M): | |
| """ | |
| Method 2: Using numpy.dot function to compute the squared Euclidean Distance | |
| aiming at efficiency. | |
| """ | |
| # determine demensions of data matrix M | |
| m,n = M.shape | |
| # initialize the squared Euclidean Distanse matrix E | |
| E = np.zeros((n,n)) | |
| # iteration on upper triangle of matrix | |
| for i in range(n): | |
| for j in range(i + 1, n): | |
| diff = M[:,i] - M[:,j] | |
| E[i,j] = np.dot(diff, diff) | |
| E[j,i] = E[i,j] | |
| return E | |
| def compute_ED_GramMatrix(M): | |
| """ | |
| Method 3: By pre-computing the Gram matrix G and calling dot for G = M.T*M instead of calling dot | |
| for each pair if vectors separately by using G_ij = M_i.T*M_j, then E_ij = G_ii - 2*G_ij + G_jj | |
| """ | |
| # determine demensions of data matrix M | |
| m,n = M.shape | |
| # initialize the squared Euclidean Distanse matrix E | |
| E = np.zeros((n,n)) | |
| # compute Gram matrix G | |
| G = np.dot(M.T, M) | |
| for i in range(n): | |
| for j in range(i + 1, n): | |
| # make use of |x-y|^2 = x'x + y'y - 2x'y | |
| E[i,j] = G[i,i] - 2*G[i,j] + G[j,j] | |
| E[j,i] = E[i,j] | |
| return E | |
| def compute_ED_avoidUseLoops(M): | |
| """ | |
| Method 4: Avoid using for loops | |
| Assume E = H + K - 2*G, where G is Gram matrix as in method num. 3. | |
| We find H_ij = G_ii and K_ij = G_jj H is n*n matrix whose each column correspond | |
| to the diagonal of G. K = H.T, then D = H + H.T - 2*G | |
| """ | |
| # determine demensions of data matrix M | |
| m,n = M.shape | |
| # compute Gram matrix G | |
| G = np.dot(M.T, M) | |
| # compute matrix H | |
| H = np.tile(np.diag(G), (n,1)) | |
| return H + H.T - 2*G | |
| def compute_ED_usingSciPy(M): | |
| """ | |
| Method 5: Using SciPy build-in function | |
| using submodule distance under module spatial | |
| """ | |
| V = spt.distance.pdist(M.T, 'sqeuclidean') | |
| return spt.distance.squareform(V) | |
| if __name__ == "__main__": | |
| # Random values of array in a given shape. | |
| M = np.random.rand(100,100) | |
| import time | |
| # region Method 1 | |
| start = time.time() | |
| M1 = compute_ED_npNorm(M) | |
| _M1 = np.linalg.norm(M1, ord = 'fro') | |
| end = time.time() | |
| print(end - start) | |
| # endregion | |
| # region Method 2 | |
| start = time.time() | |
| M2 = compute_ED_npDot(M) | |
| _M2 = np.linalg.norm(M2, ord = 'fro') | |
| end = time.time() | |
| print(end - start) | |
| # endregion | |
| # region Method 3 | |
| start = time.time() | |
| M3 = compute_ED_GramMatrix(M) | |
| _M3 = np.linalg.norm(M3, ord = 'fro') | |
| end = time.time() | |
| print(end - start) | |
| # endregion | |
| # region Method 4 | |
| start = time.time() | |
| M4 = compute_ED_avoidUseLoops(M) | |
| _M4 = np.linalg.norm(M4, ord = 'fro') | |
| end = time.time() | |
| print(end - start) | |
| # endregion | |
| # region Method 5 | |
| start = time.time() | |
| M5 = compute_ED_npNorm(M) | |
| _M5 = np.linalg.norm(M5, ord = 'fro') | |
| end = time.time() | |
| print(end - start) | |
| # endregion | |
| print(" Method 1: ", round(_M1, 3), '\n', | |
| "Method 2: ", round(_M2, 3),'\n', | |
| "Method 3: ", round(_M3, 3),'\n', | |
| "Method 4: ", round(_M4, 3),'\n', | |
| "Method 5: ", round(_M5, 3)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment