Skip to content

Instantly share code, notes, and snippets.

@kzinmr
Last active September 10, 2024 10:07
Show Gist options
  • Save kzinmr/441f713ddc8b85559fa79f6703fccf2e to your computer and use it in GitHub Desktop.
Save kzinmr/441f713ddc8b85559fa79f6703fccf2e to your computer and use it in GitHub Desktop.
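PMI calculation
"Calculate a PMI-style score, count(A,B) / (count(A) * count(B)), for every pair of tokens A and B that co-occur in a window. True PMI is log(P(A,B) / (P(A) * P(B))); this code returns the un-logged ratio of raw counts."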
from itertools import tee, combinations
from collections import Counter
def count_bigram(sentence, window=5):
    """Count unordered token pairs that co-occur inside a sliding window.

    The window slides one token at a time over *sentence*; every
    2-combination of the tokens inside a window position is counted once
    for that position, so a pair covered by several overlapping windows is
    counted several times.  Each pair is stored as a sorted tuple, making
    ``(a, b)`` and ``(b, a)`` the same key.

    Example with ``window=4``::

        ['A', 'B', 'C', 'D', 'E', 'F', 'G'] ->
            ['A', 'B', 'C', 'D'],
            ['B', 'C', 'D', 'E'],
            ['C', 'D', 'E', 'F'],
            ['D', 'E', 'F', 'G']

    Args:
        sentence: Iterable of mutually orderable, hashable tokens.
        window: Maximum number of tokens per window.  A sentence shorter
            than the window is treated as one single window.

    Returns:
        A ``Counter`` mapping sorted ``(token, token)`` tuples to the
        number of window positions containing both tokens.
    """
    tokens = list(sentence)
    # Shrink the window to the sentence when the sentence is shorter.
    span = min(window, len(tokens))
    if span < 2:
        # Fewer than two tokens per window can never form a pair.
        return Counter()

    d_bi = Counter()
    # With the clamped span there are exactly len - span + 1 positions;
    # for a short sentence that is one window, not one per suffix.
    for start in range(len(tokens) - span + 1):
        context_window = tokens[start:start + span]
        d_bi.update(
            tuple(sorted(pair)) for pair in combinations(context_window, 2)
        )
    return d_bi
def build_pmi_stats(corpus, window=5):
    """Collect unigram and windowed pair counts over a whole corpus.

    Args:
        corpus: Iterable of sentences, each a sequence of tokens.  It is
            traversed only once, so a one-shot iterator is acceptable.
        window: Window size forwarded to ``count_bigram`` for every
            sentence.

    Returns:
        A ``(uni_counts, bi_counts)`` tuple of ``Counter`` objects:
        token -> occurrences, and pair key -> count as produced by
        ``count_bigram``.
    """
    uni_counts = Counter()
    bi_counts = Counter()
    for sentence in corpus:
        uni_counts.update(sentence)
        # Forward the caller's window instead of count_bigram's default.
        bi_counts.update(count_bigram(sentence, window))
    return uni_counts, bi_counts
class PMI:
    """Score token pairs from unigram and windowed co-occurrence counts."""

    def __init__(self, corpus):
        """Gather the counts needed for scoring.

        Args:
            corpus: Passed unchanged to ``build_pmi_stats``.
        """
        uni_counts, bi_counts = build_pmi_stats(corpus)
        # Counter: token -> number of occurrences.
        self.uni_counts = uni_counts
        # Counter: sorted (token, token) tuple -> co-occurrence count.
        self.bi_counts = bi_counts

    def pmi(self, a, b):
        """Return the co-occurrence ratio for tokens *a* and *b*.

        The value is ``count(a, b) / (count(a) * count(b))`` using raw
        counts; no logarithm or normalisation is applied.  The result is
        symmetric in its arguments because the pair key is sorted.

        Args:
            a: First token.
            b: Second token.

        Returns:
            The ratio as a float, or ``0.0`` when either token has no
            recorded occurrences.
        """
        # Read from the instance: a bare ``uni_counts`` is not in scope here.
        count_a = self.uni_counts.get(a, 0)
        count_b = self.uni_counts.get(b, 0)
        if not count_a or not count_b:
            # Unknown token (or zero count): avoid dividing by zero.
            return 0.
        pair_key = tuple(sorted([a, b]))
        return self.bi_counts[pair_key] / (count_a * count_b)
# Self-check: expected pair counts for a 7-token sentence and a window of 4.
c = Counter({
    ('A', 'B'): 1,
    ('A', 'C'): 1,
    ('A', 'D'): 1,
    ('B', 'C'): 2,
    ('B', 'D'): 2,
    ('C', 'D'): 3,
    ('B', 'E'): 1,
    ('C', 'E'): 2,
    ('D', 'E'): 3,
    ('C', 'F'): 1,
    ('D', 'F'): 2,
    ('E', 'F'): 2,
    ('D', 'G'): 1,
    ('E', 'G'): 1,
    ('F', 'G'): 1,
})
assert count_bigram(['A', 'B', 'C', 'D', 'E', 'F', 'G'], 4) == c
# An empty corpus must produce two empty counters.
assert build_pmi_stats([]) == (Counter(), Counter())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment