Skip to content

Instantly share code, notes, and snippets.

@michalmonday
Last active February 9, 2022 19:02
Show Gist options
  • Save michalmonday/0848ef79e32810654e0959f6852993ce to your computer and use it in GitHub Desktop.
Save michalmonday/0848ef79e32810654e0959f6852993ce to your computer and use it in GitHub Desktop.
Hidden Markov model simple example
##from nltk.corpus import brown
##
###tw = brown.tagged_words(tagset='universal')
##tw = brown.tagged_words()
##
##vbz = [t for t in tw if t[1] == "VBZ"]
##print('Total number of VBZ tokens:', len(vbz))
##
##is_ = [t for t in tw if t[0] == "is"]
##print('Total number of "is" words:', len(is_))
##
##is_vbz = [t for t in is_ if t[1] == "VBZ"]
##print('Total number of "is" words having "VBZ" tag:', len(is_vbz))
##
##print('First 10 "is" tags:', is_[:10])
from collections import defaultdict, Counter
import re
from nltk import bigrams
def tuplefy_corpus(corpus):
    """Convert 'word/TAG word/TAG ...' strings into lists of (word, tag) tuples.

    Each sentence string in `corpus` is replaced in place by a list of
    lowercased (word, tag) pairs; the same list object is also returned
    for convenient chaining.

    Args:
        corpus: list of sentence strings, each a space-separated sequence
            of word/TAG tokens.

    Returns:
        The mutated `corpus` list of lists of (word, tag) tuples.
    """
    for i, sent in enumerate(corpus):
        # BUG FIX: the separator was matched with '.', which accepts ANY
        # character between word and tag (e.g. 'a.b' parsed as a token);
        # the token separator is literally '/', so match it literally.
        tokens = re.findall(r'\s*([^/]+)/(\S+)', sent)
        corpus[i] = [(word.lower(), tag.lower()) for word, tag in tokens]
    return corpus
def get_vocab(corpus):
    """Return the sorted list of unique word forms in a tagged corpus.

    Args:
        corpus: list of sentences, each a list of (word, tag) tuples.

    Returns:
        Alphabetically sorted list of distinct words.
    """
    unique_words = {word for sentence in corpus for word, _ in sentence}
    return sorted(unique_words)
def get_pos_corpus(corpus):
    """Map each tagged document to its tag sequence, wrapped in sentinels.

    Args:
        corpus: list of documents, each a list of (word, tag) tuples.

    Returns:
        List of tag-sequence lists, each starting with '<S>' and ending
        with '<E>' sentence-boundary markers.
    """
    return [['<S>'] + [tag for _, tag in doc] + ['<E>'] for doc in corpus]
# Toy training corpus: each string is one sentence of word/TAG tokens
# (N = noun, M = modal, V = verb).
corpus = tuplefy_corpus([
'Mary/N Jane/N can/M see/V Will/N',
'Spot/N will/M see/V Mary/N',
'Will/M Jane/N spot/V Mary/N',
'Mary/N will/M pat/V Spot/N'
])
# One tag sequence per sentence, each wrapped in <S>/<E> boundary markers.
pos_corpus = get_pos_corpus(corpus)
def get_pos_bigrams(pos_corpus):
    """Collect every adjacent tag pair from all documents into one flat list.

    Args:
        pos_corpus: list of tag sequences (lists of tag strings).

    Returns:
        List of (tag, next_tag) tuples, duplicates preserved so they can
        later be counted for transition probabilities.
    """
    return [pair for tag_seq in pos_corpus for pair in bigrams(tag_seq)]
# All adjacent tag pairs across the corpus (used for transition counts).
pos_bigrams = get_pos_bigrams(pos_corpus)
# Sorted list of the distinct (lowercased) word forms in the corpus.
vocab = get_vocab(corpus)
# Group word occurrences by tag: pos[tag] is the list of every word seen
# with that tag, duplicates kept so len(pos[tag]) is the tag's total count.
pos = defaultdict(list)
for s in corpus:
    for word, tag in s:
        pos[tag].append(word)
# Emission probability table, filled in below:
# emission_probabilities[word][tag] = P(word | tag).
emission_probabilities = {word : {tag : 0.0 for tag in pos} for word in vocab}
print('Emission probabilities:')
print(f'EP(word|tag) = word_having_tag_count / total_tag_count')
for word in vocab:
    for tag in pos:
        # How often this word appears with this tag, relative to how often
        # the tag appears at all.
        word_having_tag_count = pos[tag].count(word)
        total_tag_count = len(pos[tag])
        prob = word_having_tag_count / total_tag_count
        print(f'EP({word}|{tag}) = {word_having_tag_count} / {total_tag_count} = {prob}')
        emission_probabilities[word][tag] = prob
#emission_probabilities = {word : {tag : pos[tag].count(word)/float(len(pos[tag])) for tag in pos} for word in vocab}
# accessing: emission_probabilities[word][tag]
print('\n')
# Transition probability table:
# transition_probabilities[previous_tag][tag] = P(tag | previous_tag).
# Rows are possible previous tags ('<S>' plus the real tags); columns are
# possible following tags (the real tags plus '<E>').
transition_probabilities = {pre : {post : 0 for post in list(pos) + ['<E>']} for pre in ['<S>'] + list(pos)}
print(f'TP(tag|previous_tag) = previous_tag_followed_by_tag_count / previous_tag_count')
# Header row: the column (following-tag) labels of the table.
print(' ' + ' '.join(list(transition_probabilities.values())[0]))
for pre, val in transition_probabilities.items():
    print(pre, end = ' ')
    for post in val:
        previous_tag_followed_by_tag_count = pos_bigrams.count((pre, post))
        # '<S>' occurs exactly once per sentence, so its total count is the
        # number of sentences rather than an entry in pos.
        previous_tag_count = len(pos[pre]) if pre != '<S>' else len(corpus)
        tp = previous_tag_followed_by_tag_count / previous_tag_count
        # BUG FIX: the original computed tp but never wrote it back, leaving
        # the table permanently zero; store it like the emission loop does.
        # (Assigning to an existing key while iterating a dict is safe.)
        val[post] = tp
        print(tp, end = ' ')
    print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment