Last active
October 16, 2021 20:33
-
-
Save alinazhanguwo/55b9b33d1137c0e63cc794a5de3553bf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scipy.sparse import csr_matrix | |
!pip install sparse_dot_topn | |
import sparse_dot_topn.sparse_dot_topn as ct | |
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Multiply sparse matrices A and B via ``sparse_dot_topn`` and return a CSR result.

    Args:
        A: Sparse matrix of shape (M, K); converted to CSR.
        B: Sparse matrix of shape (K, N); converted to CSR.
        ntop: Number of result slots reserved per row of A (the output
            buffers hold ``M * ntop`` entries). Presumably the kernel keeps
            only the ``ntop`` largest products per row -- confirm against
            the sparse_dot_topn documentation.
        lower_bound: Threshold forwarded unchanged to the kernel; presumably
            products not exceeding it are dropped -- confirm against the
            sparse_dot_topn documentation.

    Returns:
        A ``csr_matrix`` of shape (M, N) built from the buffers filled in by
        ``ct.sparse_dot_topn``.

    Raises:
        ValueError: If the number of columns of A differs from the number of
            rows of B.
    """
    # numpy is needed below but is not among the file's visible imports, so
    # bring it into scope here instead of relying on a global `np`.
    import numpy as np

    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()

    M, inner_a = A.shape
    inner_b, N = B.shape
    # The raw index arrays are handed to native code below; reject
    # incompatible operands here rather than passing them through unchecked.
    if inner_a != inner_b:
        raise ValueError(
            "dimension mismatch: A has %d columns but B has %d rows"
            % (inner_a, inner_b)
        )

    # Every index array is passed to the kernel as int32.
    idx_dtype = np.int32

    # Output buffers are sized for ntop entries in each of the M rows.
    nnz_max = M * ntop
    indptr = np.zeros(M + 1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    # The kernel writes its result into indptr / indices / data in place.
    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data, indices, indptr), shape=(M, N))
import time

# Time the similarity computation. perf_counter() is a monotonic,
# high-resolution clock, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() can.
t1 = time.perf_counter()
# adjust lower bound: 0.8
# keep top 10 similar results
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8)
t = time.perf_counter() - t1
print("finished in:", t)
What is the `ntop` parameter in `awesome_cossim_top(A, B, ntop, lower_bound=0)` for?
hi
i.e. set ntop=0 means that keep top 10 similar results
Make sense! You mean ntop=10
*
Thanks for the clarification!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
What is the `ntop` parameter in `awesome_cossim_top(A, B, ntop, lower_bound=0)` for?