Skip to content

Instantly share code, notes, and snippets.

@walkingmask
Created February 24, 2017 09:13
Show Gist options
  • Select an option

  • Save walkingmask/bd78710bfe7dc3976f266bda4de5c54e to your computer and use it in GitHub Desktop.

Select an option

Save walkingmask/bd78710bfe7dc3976f266bda4de5c54e to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
from collections import defaultdict
def train_unigram(file_name):
    """Estimate unigram probabilities from a space-tokenised text file.

    Each line is split on single spaces. A line's terminating newline is
    counted as its own token ("\n"), so it acts as an end-of-line marker
    in the model. Empty strings produced by consecutive or trailing spaces
    are not words and are skipped, so they neither get a probability nor
    inflate the total token count.

    Args:
        file_name: Path of the training corpus; read as UTF-8 text.

    Returns:
        A defaultdict mapping each token to its relative frequency
        (count / total number of tokens). Tokens that were never seen
        look up as 0. An empty corpus yields an empty mapping.
    """
    counts = defaultdict(int)
    # Explicit encoding so the result does not depend on the platform's
    # default locale.
    with open(file_name, 'r', encoding='utf-8') as corpus:
        for line in corpus:
            # Detach the newline from the last word so it becomes a
            # separate end-of-line token.
            for token in line.replace("\n", " \n").split(" "):
                if token:  # drop "" artefacts of repeated/trailing spaces
                    counts[token] += 1

    total = sum(counts.values())
    unigram = defaultdict(int)
    # When the corpus is empty `counts` is empty too, so the division
    # below is never reached with total == 0.
    for token, count in counts.items():
        unigram[token] = count / total
    return unigram
if __name__ == '__main__':
    # Script entry point: train on the corpus given as the first
    # command-line argument and print the resulting model.
    if len(sys.argv) < 2:
        # Exit with a usage hint (written to stderr, non-zero status)
        # instead of an IndexError traceback when the path is missing.
        sys.exit("usage: {} TRAINING_FILE".format(sys.argv[0]))
    unigram = train_unigram(sys.argv[1])
    print(unigram)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment