Created
April 2, 2012 05:11
-
-
Save jordanorelli/2280931 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import collections | |
import string | |
from string import ascii_lowercase | |
from sys import stderr | |
# pair_counts has a key for each character that we've seen as the first
# character of a pair. Each value is itself a dictionary keyed by the second
# character, such that pair_counts['a']['k'] is the number of occurrences of
# the pair "ak".
pair_counts = {}

# pair_totals also contains a key for each character that we've seen. The
# value for each one is the total number of pairs we've seen that have that
# character as their first character.
pair_totals = {}

# pair_probabilities is like pair_counts, but holds the probability of seeing
# each pair instead of its raw number of occurrences.
pair_probabilities = {}
def get_probabilities():
    """Rebuild ``pair_probabilities`` from the tabulated pair counts.

    For every first character ``left`` in ``pair_counts`` and every second
    character ``right`` recorded under it, stores
    ``pair_counts[left][right] / pair_totals[left]`` in
    ``pair_probabilities[left][right]``.

    The module-level ``pair_probabilities`` dict is cleared and refilled in
    place, so existing references to it observe the new values.

    Returns:
        The module-level ``pair_probabilities`` dict.

    Raises:
        KeyError: if a character is present in ``pair_counts`` but has no
            entry in ``pair_totals``.
    """
    # Drop stale entries so repeated calls reflect only the current counts.
    pair_probabilities.clear()
    for left, followers in pair_counts.items():
        # float() forces true division even where "/" on two ints truncates.
        total = float(pair_totals[left])
        pair_probabilities[left] = {
            right: count / total for right, count in followers.items()
        }
    return pair_probabilities
def add_pair(left, right):
    """Record one occurrence of the character pair ``left`` + ``right``.

    Updates the module-level ``pair_counts`` and ``pair_totals`` tables:
    the total for ``left`` and the count for the specific pair are each
    incremented by one, creating the entries on first sight.

    Returns:
        The number of times this pair has been seen so far, including this
        call.
    """
    # Fetch (or create) the table of second characters seen after ``left``.
    followers = pair_counts.setdefault(left, {})
    pair_totals[left] = pair_totals.get(left, 0) + 1
    followers[right] = followers.get(right, 0) + 1
    return followers[right]
def tabulate_pairs(word):
    """Record every pair of adjacent letters in ``word`` via ``add_pair``.

    Words with fewer than two characters contain no pairs and leave the
    tables untouched. Always returns ``None``.
    """
    # Pairing the word with itself shifted by one yields each adjacent pair
    # in order; zip() stops at the shorter input, so short words yield none.
    for left, right in zip(word, word[1:]):
        add_pair(left, right)
def add_word(word):
    """Add ``word`` to the statistics by tabulating its adjacent-letter pairs.

    Delegates to ``tabulate_pairs`` and returns ``None``.
    """
    tabulate_pairs(word)
# Read the system word list and tabulate the adjacent-letter pairs of every
# word. The ``with`` block guarantees the file is closed, even if tabulation
# raises part-way through.
with open('/usr/share/dict/words', 'r') as f:
    for line in f:
        # One word per line; normalise away the newline and letter case.
        word = line.strip().lower()
        add_word(word)

# Call get_probabilities() only after the whole word list has been tabulated,
# so that it sees the full pair counts rather than empty tables.
get_probabilities()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment