Skip to content

Instantly share code, notes, and snippets.

@arvids
Created March 29, 2014 11:43
Show Gist options
  • Save arvids/9852982 to your computer and use it in GitHub Desktop.
Save arvids/9852982 to your computer and use it in GitHub Desktop.
Positive pointwise mutual information (PPMI)
from scipy import sparse
import numpy
from itertools import izip, repeat
from multiprocessing import Pool
def ppmi(matrix, np, n):
    """Compute positive pointwise mutual information (PPMI) for a sparse matrix.

    For every stored entry ``d`` at position ``(r, c)`` the ratio
    ``d * total / (rowsum[r] * colsum[c])`` is formed, where ``total`` is the
    sum of all entries.  The entries are dealt round-robin into ``n`` chunks,
    each chunk is handed to ``ppmi_pool`` in a pool of worker processes, and
    the partial matrices returned by the workers are added together.

    Args:
        matrix: A scipy sparse matrix (any format providing ``tocoo()``).
        np: Number of worker processes, passed to ``multiprocessing.Pool``.
            Note that this parameter shadows the common ``np`` alias for
            numpy; the module is referred to as ``numpy`` in this function.
        n: Number of chunks the stored entries are split into; must be >= 1.

    Returns:
        The sum of the per-chunk sparse matrices produced by ``ppmi_pool``;
        it has the same shape as ``matrix``.

    Raises:
        ValueError: If ``n`` is smaller than 1 (no chunk would receive any
            entry, so there would be no matrix to return).
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))

    if sparse.isspmatrix_coo(matrix):
        # COO storage may hold several entries for the same (row, col).
        # PMI must be computed on the summed count, so merge duplicates by
        # round-tripping through CSR; this also leaves the caller's matrix
        # untouched.
        matrix = matrix.tocsr().tocoo()
    else:
        matrix = matrix.tocoo()

    M, N = matrix.shape
    data = matrix.data
    row = matrix.row
    col = matrix.col

    matsum = matrix.sum()
    # Flatten the marginals to 1-D float arrays so they can be fancy-indexed
    # regardless of whether ``sum(axis)`` returned a numpy.matrix or ndarray.
    colsum = numpy.asarray(matrix.sum(0), dtype=float).ravel()
    rowsum = numpy.asarray(matrix.sum(1), dtype=float).ravel()
    # S[k] = rowsum[row[k]] * colsum[col[k]] for every stored entry k.
    S = rowsum[row] * colsum[col]

    # Materialise the records: a zip object cannot be sliced on Python 3.
    records = list(zip(data, row, col, S, repeat(matsum)))
    tasks = [(records[i::n], M, N) for i in range(n)]

    pool = Pool(processes=np)
    try:
        matrices = pool.map(ppmi_pool, tasks)
    finally:
        # Always release the worker processes, even if a task failed.
        pool.close()
        pool.join()
    return sum(matrices)
def ppmi_pool(args):
    """Build the PPMI matrix for one chunk of entries.

    Args:
        args: A 3-tuple ``(z, M, N)``.  ``z`` is an iterable of 5-tuples
            ``(d, r, c, s, m)``: the entry value, its row index, its column
            index, the product of the entry's row sum and column sum, and the
            sum of the whole matrix.  ``M`` and ``N`` are the number of rows
            and columns of the result.  A single tuple argument is used
            (rather than three parameters) so the function can be mapped
            directly over a task list by a process pool.

    Returns:
        A CSR matrix of shape ``(M, N)`` holding ``log2(d * m / s)`` at
        ``(r, c)`` for every entry whose ratio ``d * m / s`` is greater than
        1; all other entries are omitted.
    """
    # Explicit unpacking: tuple parameters in the signature are a syntax
    # error on Python 3.
    z, M, N = args

    rows = []
    cols = []
    values = []
    for d, r, c, s, m in z:
        # float() forces true division even if all operands are plain ints.
        ratio = float(d) * m / s
        if ratio > 1.0:
            rows.append(r)
            cols.append(c)
            values.append(numpy.log2(ratio))

    # Explicit dtypes keep the construction well-defined for an empty chunk.
    values = numpy.asarray(values, dtype=float)
    rows = numpy.asarray(rows, dtype=numpy.intp)
    cols = numpy.asarray(cols, dtype=numpy.intp)
    return sparse.coo_matrix((values, (rows, cols)), shape=(M, N)).tocsr()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment