Created
November 10, 2020 08:52
-
-
Save FrankGrimm/1ac8cfbb0cd035692f319c492757a885 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict
import numpy as np

# NOTE(review): `choices` is never read anywhere below — candidate for removal.
choices = defaultdict(int)

# Worked example of how a sentence is decomposed into bigram pairs:
# sentence = [["[SOS]", "lorem"], ["lorem", "ipsum"], ["ipsum", "dolor"], ...]
# [SOS] "lorem" [EOS]
#
#  0        1        2        3        4        5       6
# "[SOS]", "lorem", "ipsum", "dolor", "silir", "amet", "[EOS]"
#
# 0_[SOS], 1_lorem, 2_ipsum,
#
# -----
#
#  0        1        2        3        4        5       6        7  ...
# ["[SOS]", "lorem", "ipsum", "dolor", "silir", "amet", "[EOS]", "[PAD]", "[PAD]", "[PAD]", "[PAD]"]
#
#  -3       -2       -1
# ["[SOS]", "lorem", "ipsum", <sample>
#
#  -2       -1     |   i     |   +1      +2
# ["[SOS]", "lorem", "ipsum", "dolor" | "silir" | "amet", "[EOS]", "[PAD]", "[PAD]", "[PAD]", "[PAD]"]
#                              ^--- current
#
# Example closed token set:
# {[SOS], [EOS], [PAD], lorem, ipsum, dolor, silir, amet}
# Count unigram (single-token) frequencies for a small toy sentence.
unigrams = defaultdict(int)
sentence = ["[SOS]", "lorem", "ipsum", "dolor", "silir", "amet", "[EOS]"]
for tok in sentence:
    # defaultdict(int) yields 0 for unseen tokens, so no membership check needed
    unigrams[tok] = unigrams[tok] + 1
print(unigrams)
# NOTE(review): `bigrams` is defined here but never used below — the actual
# counting later in this file uses a separate `bigram_counts` dict.
# Candidate for removal.
bigrams = defaultdict(int)

# Corpus split sketch — build the vocabulary on the training documents,
# then freeze it; unseen tokens in held-out documents map to [UNK]:
# corpus:
# - doc_0
# - doc_1
# - doc_2
# - doc_3
# - doc_4
#
# ------------ freeze and evaluate
# [UNK]
# - doc_5
# - doc_6
class Vocabulary:
    """Bidirectional token<->id mapping that can be frozen for evaluation.

    Ids are assigned densely in insertion order. After freeze() the
    vocabulary becomes read-only; add() raises and unknown lookups
    return None.
    """

    def __init__(self):
        self._frozen = False  # once True, the vocabulary is read-only
        self.id2token = {}    # int id -> token string
        self.token2id = {}    # token string -> int id

    @property
    def frozen(self):
        """True once the vocabulary has been frozen (read-only)."""
        return self._frozen

    def freeze(self):
        """Make the vocabulary read-only; subsequent add() calls raise."""
        self._frozen = True

    def add(self, token):
        """Add `token` to the vocabulary.

        Returns:
            True if the token was newly added, False if already present.
        Raises:
            Exception: if the vocabulary has been frozen.
        """
        if self.frozen:
            raise Exception("cannot modify frozen vocabulary")
        if token in self.token2id:
            return False
        newid = len(self.token2id)  # next dense id
        self.token2id[token] = newid
        self.id2token[newid] = token
        # Fix: originally fell off the end (implicit None) on success,
        # inconsistent with the explicit `return False` duplicate branch.
        return True

    def get(self, token):
        """Return the id for `token`, or None if it is not in the vocabulary."""
        return self.token2id.get(token)
import string

# First pass over the corpus: register every bigram in the vocabulary,
# then freeze it so the counting pass below cannot grow it.
corpus = "./snlp_worksheets/worksheet1/corpus.txt"
vocab = Vocabulary()
with open(corpus, "rt") as infile:
    for raw_line in infile:
        # Normalize: drop punctuation, lowercase, split on single spaces.
        cleaned = "".join(c for c in raw_line.strip() if c not in string.punctuation).lower()
        tokens = cleaned.split(" ")
        # Adjacent-token pairs joined with "_" form the bigram key.
        for left, right in zip(tokens, tokens[1:]):
            vocab.add(left + "_" + right)
vocab.freeze()
# Second pass over the corpus: count occurrences of each vocabulary bigram.
# Fix: removed a redundant duplicate `import numpy as np` that used to be
# here — numpy is already imported as np at the top of the file.
bigram_counts = defaultdict(int)           # bigram string -> occurrence count
counts = np.zeros((len(vocab.id2token),))  # bigram id -> count, aligned with vocab ids
print(counts)
with open(corpus, "rt") as infile:
    for line in infile:
        # Same normalization as the vocabulary-building pass above:
        # strip punctuation, lowercase, split on single spaces.
        line = "".join([c for c in line.strip() if c not in string.punctuation]).lower()
        sentence = line.split(" ")
        for i in range(1, len(sentence)):
            bigram = "_".join(sentence[i - 1:i + 1])
            bigram_index = vocab.get(bigram)
            # The vocabulary is frozen, so unknown bigrams return None and are skipped.
            if bigram_index is not None:
                counts[bigram_index] += 1
                bigram_counts[bigram] += 1
print(counts)
# Number of bigrams occurring exactly once (hapax legomena).
print(counts[counts == 1].sum())
# Show the first ~100 vocabulary entries with their counts.
for bigram_id, bigram in vocab.id2token.items():
    print(bigram_id, bigram, "count:", counts[bigram_id])
    if bigram_id > 100:
        break
print(len(vocab.id2token))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment