Created
January 25, 2010 06:37
-
-
Save amundo/285672 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
from collections import defaultdict | |
from operator import itemgetter | |
""" | |
The ingredients of a language identification system | |
The simple system I've built to do language identification | |
is based on counting two-letter sequences, called | |
"bigrams." | |
So for example, the word "house" can be broken into 4 | |
bigrams: ho, ou, us, se. | |
You'll notice that there are four bigrams for that word, | |
which is five letters long, so there are "wordlength minus | |
one" bigrams (or len(word)-1 in Python). | |
More generally, for n-grams (sequences of n letters), a
word contains len(word)-n+1 of them. In the case of the
bigrams of "house", this works out to 5-2+1 = 4.
If we were working with trigrams (sequences of three | |
letters), there would be len(word)-3+1 trigrams per word -- | |
3 for "house". | |
""" | |
def ngrams(word, n):
    """Return every contiguous length-n slice of word, left to right.

    The slices overlap: each one starts one position after the previous.
    When n is larger than len(word) the result is an empty list.

    >>> ngrams(u"house", 2)
    [u'ho', u'ou', u'us', u'se']
    """
    window_count = len(word) - n + 1
    pieces = []
    for start in range(window_count):
        pieces.append(word[start:start + n])
    return pieces
def bigrams(word):
    """Return the overlapping two-item slices of word, via ngrams.

    >>> bigrams(u"house")
    [u'ho', u'ou', u'us', u'se']
    """
    pair_width = 2
    return ngrams(word, pair_width)
""" | |
So given some text we have the functions we need to collect | |
the "raw material" of our model. Next, we'll need to have | |
a way to count up how many of each bigram we have in a | |
given text. | |
""" | |
def frequency(sequence):
    """Count how many times each element occurs in sequence.

    Returns a defaultdict(int) mapping element -> occurrence count, so
    looking up an element that never occurred yields 0.
    """
    counts = defaultdict(int)
    for item in sequence:
        counts[item] = counts[item] + 1
    return counts
""" | |
If a bigram is very uncommon, it doesn't help us to
distinguish one language from another, and it makes the
model take up too much space. So, we remove from the model
all the bigrams which show up fewer than some threshold
number of times.
""" | |
def trim_model(model, threshhold=3):
    """Remove, in place, every entry whose count is below the threshold.

    model      -- dict-like mapping of bigram -> frequency; it is mutated.
    threshhold -- minimum count an entry needs in order to be kept
                  (the parameter name keeps its historical spelling so
                  existing keyword callers continue to work).

    Returns the same model object that was passed in.
    """
    # Collect the keys first: deleting from a dict while iterating over its
    # live items() view raises RuntimeError on Python 3.
    too_rare = [bg for bg, freq in model.items() if freq < threshhold]
    for bg in too_rare:
        del model[bg]
    return model
""" | |
Now, we're ready to model a text: | |
""" | |
def model_text(text):
    """Build a bigram-frequency model of text.

    The text is split into bigrams, the bigrams are counted, and the
    counts are passed through trim_model with its default threshold.
    """
    return trim_model(frequency(bigrams(text)))
def inspect_model(model):
    """Return up to ten (bigram, count) pairs with the highest counts.

    The pairs are ordered by ascending count, so the most frequent
    bigram is the last element of the returned list.
    """
    pairs = list(model.items())
    pairs.sort(key=itemgetter(1))
    return pairs[-10:]
""" | |
Okay, time for a test: | |
""" | |
if __name__ == "__main__":
    # Usage: pass the path of a UTF-8 encoded text file as the first argument.
    import io
    import sys

    # io.open decodes UTF-8 and applies universal-newline translation on both
    # Python 2.6+ and Python 3; the with-block guarantees the file is closed.
    with io.open(sys.argv[1], encoding='utf-8') as source_file:
        text_to_model = source_file.read()

    # Print the most frequent bigrams, one "bigram count" pair per line.
    for bigram, freq in inspect_model(model_text(text_to_model)):
        # A single pre-formatted argument prints identically whether print
        # is a statement (Python 2) or a function (Python 3).
        print("%s %s" % (bigram, freq))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment