Last active
June 9, 2020 19:17
-
-
Save josepdecid/d428f915d4ae1fcd89b90616898fd3b6 to your computer and use it in GitHub Desktop.
N-gram occurences counter with MLE and LAP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import List, Set, Optional | |
from fractions import Fraction | |
from collections import Counter | |
import termtables as tt | |
def count_n_grams(l: List[str], n: int, alpha: float, vocab: Optional[Set[str]] = None): | |
n_grams = [] | |
i = 0 | |
while i + n <= len(l): | |
n_grams.append(''.join(l[i:i + n])) | |
i += 1 | |
total_count = i | |
counter = Counter(n_grams) | |
most_common = counter.most_common() | |
vocab_size = len(set(l)) if vocab is None else len(vocab) | |
nb = total_count + vocab_size ** n | |
n0 = vocab_size ** n - len(counter) | |
for i in range(len(most_common)): | |
mle = Fraction(most_common[i][1], total_count) | |
mle = f'{str(mle)} ({float(mle):.5f})' | |
lap = Fraction(most_common[i][1] + 1, nb) | |
lap = f'{str(lap)} ({float(lap):.5f})' | |
ld = (1 - alpha) * most_common[i][1] / total_count if most_common[i][1] > 0 else alpha / n0 | |
ld = f'{ld:.5f}' | |
most_common[i] = (*most_common[i], mle, lap, ld) | |
most_common.append(('UNOBS', 0, 0, Fraction(1, nb), f'{alpha / n0:.5f}')) | |
tt.print(most_common, header=[f'{n}-gram', 'Occurences', 'MLE', 'LAP', 'LD']) | |
if __name__ == '__main__': | |
sentence = 'pituka tuika tatuk ku pika tuki pikata katuki tukapa' | |
# (Un)comment if case sensitive | |
# sentence = sentence.lower() | |
# Tokenize sentence (word level, char level, etc.) | |
#tokenized = sentence.split() | |
tokenized = list(sentence) | |
count_n_grams(tokenized, 3, 0.05, vocab={'a', 'i', 'u', 'p', 't', 'k', ' '}) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment