Last active
December 13, 2019 03:39
-
-
Save stas00/416de9b60abf8b936df58596df4735d3 to your computer and use it in GitHub Desktop.
vectorized fast implementation of PMI contextual discounting in pandas+numpy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def pmi_discount(df):
    """Apply the "contextual discounting" extension of PMI to a matrix.

    Turney and Pantel (2010)
    From Frequency to Meaning: Vector Space Models of Semantics
    arXiv:1003.1141 [cs.CL] https://arxiv.org/abs/1003.1141
    p. 158 "contextual discounting" extension of PMI

    For every cell::

        rc_min = min(rowsum, colsum)
        l      = cell / (cell + 1) * rc_min / (rc_min + 1)
        newpmi = pmi * l

    Args:
        df: pandas DataFrame holding the PMI matrix.

    Returns:
        A new DataFrame with the same index and columns as ``df`` that
        holds the discounted values. ``df`` itself is not modified.
    """
    # Work on plain ndarrays: indexing a pandas Series with [:, None] is
    # not supported by recent pandas releases.
    colsum = df.sum(axis=0).to_numpy()
    rowsum = df.sum(axis=1).to_numpy()
    # View rowsum as a column vector and colsum as a row vector; NumPy
    # broadcasting then compares every (row, col) pair without first
    # building the two repeated matrices.
    rowcol_min = np.minimum(rowsum[:, None], colsum[None, :])
    discount = df / (df + 1) * rowcol_min / (rowcol_min + 1)
    return df * discount
# very very slow traversal version, but it's easier to see what's happening | |
def pmi_discount_slow(df):
    """Cell-by-cell reference version of PMI contextual discounting.

    Each cell is replaced by ``cell * l`` where::

        rc_min = min(rowsum, colsum)
        l      = cell / (cell + 1) * rc_min / (rc_min + 1)

    Row and column sums are taken once, before any cell is rewritten.

    Args:
        df: pandas DataFrame holding the PMI matrix. It is modified in
            place.

    Returns:
        The same ``df`` object, now holding the discounted values.
    """
    colsum = df.sum(axis=0)
    rowsum = df.sum(axis=1)
    n_rows, n_cols = df.shape
    for i in range(n_rows):
        for j in range(n_cols):
            cell = df.iloc[i, j]
            # i and j are positions, so the sums must be read by position
            # too; plain [] on a Series looks up by label.
            rc_min = min(rowsum.iloc[i], colsum.iloc[j])
            discount = cell / (cell + 1) * rc_min / (rc_min + 1)
            df.iloc[i, j] = cell * discount
    return df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment