Created
November 10, 2020 08:52
-
-
Save FrankGrimm/1ac8cfbb0cd035692f319c492757a885 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict
import numpy as np

# NOTE(review): `choices` is never read anywhere below — candidate for removal.
choices = defaultdict(int)

# Worked example of how a sentence is decomposed into bigram pairs:
# sentence = [["[SOS]", "lorem"], ["lorem", "ipsum"], ["ipsum", "dolor"], ...]
# [SOS] "lorem" [EOS]
#
#  0        1        2        3        4        5       6
# "[SOS]", "lorem", "ipsum", "dolor", "silir", "amet", "[EOS]"
#
# 0_[SOS], 1_lorem, 2_ipsum,
#
# -----
#
#  0        1        2        3        4        5       6        7  ...
# ["[SOS]", "lorem", "ipsum", "dolor", "silir", "amet", "[EOS]", "[PAD]", "[PAD]", "[PAD]", "[PAD]"]
#
#  -3       -2       -1
# ["[SOS]", "lorem", "ipsum", <sample>
#
#  -2       -1     |   i     |   +1      +2
# ["[SOS]", "lorem", "ipsum", "dolor" | "silir" | "amet", "[EOS]", "[PAD]", "[PAD]", "[PAD]", "[PAD]"]
#                              ^--- current
#
# Example closed token set:
# {[SOS], [EOS], [PAD], lorem, ipsum, dolor, silir, amet}
# Count unigram (single-token) frequencies for a small toy sentence.
unigrams = defaultdict(int)
sentence = ["[SOS]", "lorem", "ipsum", "dolor", "silir", "amet", "[EOS]"]
for tok in sentence:
    # defaultdict(int) yields 0 for unseen tokens, so no membership check needed
    unigrams[tok] = unigrams[tok] + 1
print(unigrams)
# NOTE(review): `bigrams` is defined here but never used below — the actual
# counting later in this file uses a separate `bigram_counts` dict.
# Candidate for removal.
bigrams = defaultdict(int)

# Corpus split sketch — build the vocabulary on the training documents,
# then freeze it; unseen tokens in held-out documents map to [UNK]:
# corpus:
# - doc_0
# - doc_1
# - doc_2
# - doc_3
# - doc_4
#
# ------------ freeze and evaluate
# [UNK]
# - doc_5
# - doc_6
class Vocabulary:
    """Bidirectional token<->id mapping that can be frozen for evaluation.

    Ids are assigned densely in insertion order. After freeze() the
    vocabulary becomes read-only; add() raises and unknown lookups
    return None.
    """

    def __init__(self):
        self._frozen = False  # once True, the vocabulary is read-only
        self.id2token = {}    # int id -> token string
        self.token2id = {}    # token string -> int id

    @property
    def frozen(self):
        """True once the vocabulary has been frozen (read-only)."""
        return self._frozen

    def freeze(self):
        """Make the vocabulary read-only; subsequent add() calls raise."""
        self._frozen = True

    def add(self, token):
        """Add `token` to the vocabulary.

        Returns:
            True if the token was newly added, False if already present.
        Raises:
            Exception: if the vocabulary has been frozen.
        """
        if self.frozen:
            raise Exception("cannot modify frozen vocabulary")
        if token in self.token2id:
            return False
        newid = len(self.token2id)  # next dense id
        self.token2id[token] = newid
        self.id2token[newid] = token
        # Fix: originally fell off the end (implicit None) on success,
        # inconsistent with the explicit `return False` duplicate branch.
        return True

    def get(self, token):
        """Return the id for `token`, or None if it is not in the vocabulary."""
        return self.token2id.get(token)
import string

# First pass over the corpus: register every bigram in the vocabulary,
# then freeze it so the counting pass below cannot grow it.
corpus = "./snlp_worksheets/worksheet1/corpus.txt"
vocab = Vocabulary()
with open(corpus, "rt") as infile:
    for raw_line in infile:
        # Normalize: drop punctuation, lowercase, split on single spaces.
        cleaned = "".join(c for c in raw_line.strip() if c not in string.punctuation).lower()
        tokens = cleaned.split(" ")
        # Adjacent-token pairs joined with "_" form the bigram key.
        for left, right in zip(tokens, tokens[1:]):
            vocab.add(left + "_" + right)
vocab.freeze()
# Second pass over the corpus: count occurrences of each vocabulary bigram.
# Fix: removed a redundant duplicate `import numpy as np` that used to be
# here — numpy is already imported as np at the top of the file.
bigram_counts = defaultdict(int)           # bigram string -> occurrence count
counts = np.zeros((len(vocab.id2token),))  # bigram id -> count, aligned with vocab ids
print(counts)
with open(corpus, "rt") as infile:
    for line in infile:
        # Same normalization as the vocabulary-building pass above:
        # strip punctuation, lowercase, split on single spaces.
        line = "".join([c for c in line.strip() if c not in string.punctuation]).lower()
        sentence = line.split(" ")
        for i in range(1, len(sentence)):
            bigram = "_".join(sentence[i - 1:i + 1])
            bigram_index = vocab.get(bigram)
            # The vocabulary is frozen, so unknown bigrams return None and are skipped.
            if bigram_index is not None:
                counts[bigram_index] += 1
                bigram_counts[bigram] += 1
print(counts)
# Number of bigrams occurring exactly once (hapax legomena).
print(counts[counts == 1].sum())
# Show the first ~100 vocabulary entries with their counts.
for bigram_id, bigram in vocab.id2token.items():
    print(bigram_id, bigram, "count:", counts[bigram_id])
    if bigram_id > 100:
        break
print(len(vocab.id2token))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment