Skip to content

Instantly share code, notes, and snippets.

@stas00
Last active December 13, 2019 03:39
Show Gist options
  • Save stas00/416de9b60abf8b936df58596df4735d3 to your computer and use it in GitHub Desktop.
Save stas00/416de9b60abf8b936df58596df4735d3 to your computer and use it in GitHub Desktop.
vectorized fast implementation of PMI contextual discounting in pandas+numpy
def pmi_discount(df):
    """Apply "contextual discounting" to a PMI matrix (vectorized).

    Reference: Turney and Pantel (2010), "From Frequency to Meaning:
    Vector Space Models of Semantics", arXiv:1003.1141 [cs.CL],
    https://arxiv.org/abs/1003.1141, p. 158 ("contextual discounting"
    extension of PMI).

    For every cell the following is computed::

        rc_min   = min(rowsum, colsum)
        discount = cell / (cell + 1) * rc_min / (rc_min + 1)
        newpmi   = cell * discount

    where ``rowsum`` / ``colsum`` are the sums of the cell's row and column
    in ``df``.

    NOTE(review): the discount factor is derived from ``df`` itself (its
    cells and its row/column sums); the cited paper appears to define it on
    co-occurrence frequencies -- confirm this is the intended input.

    Args:
        df: pandas DataFrame holding the PMI values. It is not modified.

    Returns:
        A new DataFrame with the same index and columns as ``df`` holding
        the discounted values. Cells equal to -1 make the
        ``cell / (cell + 1)`` term divide by zero.
    """
    # Work on plain NumPy arrays: multi-dimensional indexing of a pandas
    # Series (``series[:, None]``) is rejected by current pandas versions.
    colsum = df.sum(axis=0).to_numpy()
    rowsum = df.sum(axis=1).to_numpy()

    # Broadcast rowsum as a column vector against colsum as a row vector so
    # that rowcol_min[i, j] == min(rowsum[i], colsum[j]) without explicitly
    # tiling either vector.
    rowcol_min = np.minimum(rowsum[:, None], colsum[None, :])

    discount = df / (df + 1) * rowcol_min / (rowcol_min + 1)
    return df * discount
# very very slow traversal version, but it's easier to see what's happening
def pmi_discount_slow(df):
    """Apply the PMI "contextual discounting" one cell at a time.

    Loop-based reference version: each cell is replaced by
    ``cell * discount`` where::

        rc_min   = min(rowsum, colsum)
        discount = cell / (cell + 1) * rc_min / (rc_min + 1)

    The row and column sums are taken once, before any cell is rewritten,
    so every discount is based on the original values.

    Args:
        df: pandas DataFrame holding the PMI values. It is MODIFIED IN
            PLACE; pass ``df.copy()`` to keep the original.

    Returns:
        The same ``df`` object, with every cell overwritten.
    """
    # Convert the sums to NumPy arrays so the lookups below are strictly
    # positional, matching the positional ``.iloc`` access on the cells.
    # Integer keys on a label-indexed Series would be resolved as labels.
    colsum = df.sum(axis=0).to_numpy()
    rowsum = df.sum(axis=1).to_numpy()

    n_rows, n_cols = df.shape
    for i in range(n_rows):
        for j in range(n_cols):
            cell = df.iloc[i, j]
            rc_min = min(rowsum[i], colsum[j])
            discount = cell / (cell + 1) * rc_min / (rc_min + 1)
            df.iloc[i, j] = cell * discount
    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment