amundo · January 25, 2010 06:37
diff --git a/language_identification.py b/language_identification.py
 #!/usr/bin/env python
 # coding: utf-8
 from collections import defaultdict
 from operator import itemgetter
 """
 The ingredients of a language identification system

 The simple system I've built to do language identification
 is based on counting two-letter sequences, called
 "bigrams."

 So for example, the word "house" can be broken into 4
 bigrams: ho, ou, us, se.

 You'll notice that there are four bigrams for that word,
 which is five letters long, so there are "wordlength minus
 one" bigrams (or len(word)-1 in Python).

 More generally, for letter sequences of length n ("n-grams"),
 there are len(word)-n+1 n-grams in a word.  In the case of the
 bigrams of "house", this works out to 5-2+1 = 4.

 If we were working with trigrams (sequences of three
 letters), there would be len(word)-3+1 trigrams per word --
 3 for "house".

 """

 def ngrams(word, n):
     """
     Slice `word` into every run of `n` consecutive items, left to right.

     The result is empty when `word` is shorter than `n`.

     >>> ngrams(u"house", 2)
     [u'ho', u'ou', u'us', u'se']
     """
     window_count = len(word) - n + 1
     pieces = []
     for start in range(window_count):
         pieces.append(word[start:start + n])
     return pieces

 def bigrams(word):
     """
     Return the two-item n-grams of `word`, in order of appearance.

     >>> bigrams(u"house")
     [u'ho', u'ou', u'us', u'se']
     """
     pair_width = 2
     return ngrams(word, pair_width)

 """
 So given some text we have the functions we need to collect
 the "raw material" of our model. Next, we'll need to have
 a way to count up how many of each bigram we have in a
 given text.
 """

 def frequency(sequence):
     """Tally how many times each element occurs in `sequence`.

     The tally is returned as a defaultdict(int), so looking up an
     element that never occurred gives 0.
     """
     tally = defaultdict(int)
     for item in sequence:
         tally[item] = tally[item] + 1
     return tally
  
 """
 If a bigram is very uncommon, it doesn't help us to
 distinguish one language from another, and it makes the
 model take up too much space. So, we remove all the bigrams
 from the model which show up fewer than some threshold
 number of times.
 """

 def trim_model(model, threshhold=3):
     """Drop every entry of `model` whose count is below `threshhold`.

     The model is pruned in place and the same dict object is returned.

     model      -- dict mapping each bigram to its count
     threshhold -- minimum count a bigram needs in order to be kept
     """
     # Iterate over a snapshot of the items: removing keys while walking a
     # live items() view raises RuntimeError on Python 3.
     for bg, freq in list(model.items()):
         if freq < threshhold:
             model.pop(bg)
     return model
  
 """
 Now, we're ready to model a text:
 """

 def model_text(text):
     """Build a bigram-frequency model of `text`, trimmed with the default threshold."""
     return trim_model(frequency(bigrams(text)))
  
 def inspect_model(model):
     """Return up to ten (bigram, count) pairs with the highest counts, lowest count first."""
     ranked = list(model.items())
     ranked.sort(key=itemgetter(1))
     top_ten = ranked[-10:]
     return top_ten
  
 """
 Okay, time for a test:
 """

 if __name__ == "__main__":
     # Usage: <script> <path to a UTF-8 text file>
     # Prints the most frequent bigrams of that file, one "bigram count"
     # pair per line.
     import io
     import sys
     # io.open decodes UTF-8 and applies universal newlines on both
     # Python 2 and 3; the with-block makes sure the file gets closed.
     with io.open(sys.argv[1], encoding='utf-8') as source:
         text_to_model = source.read()
     for bigram, freq in inspect_model(model_text(text_to_model)):
         # A single formatted string prints the same way whether print is
         # a statement (Python 2) or a function (Python 3).
         print("%s %s" % (bigram, freq))
	#!/usr/bin/env python
	# coding: utf-8
	from collections import defaultdict
	from operator import itemgetter
	"""
	The ingredients of a language identification system

	The simple system I've built to do language identification
	is based on counting two-letter sequences, called
	"bigrams."

	So for example, the word "house" can be broken into 4
	bigrams: ho, ou, us, se.

	You'll notice that there are four bigrams for that word,
	which is five letters long, so there are "wordlength minus
	one" bigrams (or len(word)-1 in Python).

	More generally, for letter sequences of length n ("n-grams"),
	there are len(word)-n+1 n-grams in a word. In the case of the
	bigrams of "house", this works out to 5-2+1 = 4.

	If we were working with trigrams (sequences of three
	letters), there would be len(word)-3+1 trigrams per word --
	3 for "house".

	"""

	def ngrams(word, n):
	    """
	    Return every slice of `n` consecutive items of `word`, left to right.

	    word -- a string (or any sliceable sequence)
	    n    -- length of each slice

	    The result is an empty list when `word` is shorter than `n`.

	    >>> ngrams(u"house", 2)
	    [u'ho', u'ou', u'us', u'se']
	    """
	    return [word[i:i+n] for i in range(len(word)-n+1)]

	def bigrams(word):
	    """
	    Return the two-item slices (bigrams) of `word`, in order.

	    >>> bigrams(u"house")
	    [u'ho', u'ou', u'us', u'se']
	    """
	    return ngrams(word, 2)

	"""
	So given some text we have the functions we need to collect
	the "raw material" of our model. Next, we'll need to have
	a way to count up how many of each bigram we have in a
	given text.
	"""

	def frequency(sequence):
	    """Count how many times each element occurs in `sequence`.

	    Returns a defaultdict(int) mapping element to count, so looking up
	    an element that never occurred gives 0.
	    """
	    fq = defaultdict(int)
	    for element in sequence:
	        fq[element] += 1
	    return fq

	"""
	If a bigram is very uncommon, it doesn't help us to
	distinguish one language from another, and it makes the
	model take up too much space. So, we remove all the bigrams
	from the model which show up fewer than some threshold
	number of times.
	"""

	def trim_model(model, threshhold=3):
	    """Remove from `model` every bigram counted fewer than `threshhold` times.

	    The model is pruned in place and the same dict object is returned.

	    model      -- dict mapping each bigram to its count
	    threshhold -- minimum count a bigram needs in order to be kept
	    """
	    # Walk a snapshot of the items; popping keys while iterating a live
	    # items() view raises RuntimeError on Python 3.
	    for bg, freq in list(model.items()):
	        if freq < threshhold:
	            model.pop(bg)
	    return model

	"""
	Now, we're ready to model a text:
	"""

	def model_text(text):
	    """Build a bigram model of `text`.

	    The text is split into bigrams, the bigrams are counted, and the
	    counts are trimmed with trim_model's default threshold.
	    """
	    bigram_list = bigrams(text)
	    bigram_model = frequency(bigram_list)
	    return trim_model(bigram_model)

	def inspect_model(model):
	    """Dump the model to see what bigrams are most frequent.

	    Returns a list of at most ten (bigram, count) pairs: the entries
	    with the highest counts, sorted from lowest count to highest.
	    """
	    by_frequency = sorted(model.items(), key=itemgetter(1))
	    return by_frequency[-10:]

	"""
	Okay, time for a test:
	"""

	if __name__ == "__main__":
	    # Usage: <script> <path to a UTF-8 text file>
	    # Prints the most frequent bigrams of that file, one "bigram count"
	    # pair per line.
	    import io
	    import sys
	    # io.open decodes UTF-8 and applies universal newlines on both
	    # Python 2 and 3; the with-block makes sure the file gets closed.
	    with io.open(sys.argv[1], encoding='utf-8') as source:
	        text_to_model = source.read()
	    for bigram, freq in inspect_model(model_text(text_to_model)):
	        # A single formatted string prints the same way whether print is
	        # a statement (Python 2) or a function (Python 3).
	        print("%s %s" % (bigram, freq))