Skip to content

Instantly share code, notes, and snippets.

@misja
Created April 27, 2013 11:22
Show Gist options
  • Save misja/5472746 to your computer and use it in GitHub Desktop.
Save misja/5472746 to your computer and use it in GitHub Desktop.
Language detection with NLTK. Snippet I had up on my posterous.
from nltk.util import trigrams as nltk_trigrams
from nltk.tokenize import word_tokenize as nltk_word_tokenize
from nltk.probability import FreqDist
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import StreamBackedCorpusView, concat
class LangIdCorpusReader(CorpusReader):
'''
LangID corpus reader
'''
CorpusView = StreamBackedCorpusView
def _get_trigram_weight(self, line):
'''
Split a line in a trigram and its frequency count
'''
data = line.strip().split(' ')
if len(data) == 2:
return (data[1], int(data[0]))
def _read_trigram_block(self, stream):
'''
Read a block of trigram frequencies
'''
freqs = []
for i in range(20): # Read 20 lines at a time.
freqs.append(self._get_trigram_weight(stream.readline()))
return filter(lambda x: x != None, freqs)
def freqs(self, fileids=None):
'''
Return trigram frequencies for a language from the corpus
'''
return concat([self.CorpusView(path, self._read_trigram_block)
for path in self.abspaths(fileids=fileids)])
class LangDetect(object):
language_trigrams = {}
langid = LazyCorpusLoader('langid', LangIdCorpusReader, r'(?!\.).*\.txt')
def __init__(self, languages=['nl', 'en', 'fr', 'de', 'es']):
for lang in languages:
self.language_trigrams[lang] = FreqDist()
for f in self.langid.freqs(fileids=lang+"-3grams.txt"):
self.language_trigrams[lang].inc(f[0], f[1])
def detect(self, text):
'''
Detect the text's language
'''
words = nltk_word_tokenize(text.lower())
trigrams = {}
scores = dict([(lang, 0) for lang in self.language_trigrams.keys()])
for match in words:
for trigram in self.get_word_trigrams(match):
if not trigram in trigrams.keys():
trigrams[trigram] = 0
trigrams[trigram] += 1
total = sum(trigrams.values())
for trigram, count in trigrams.items():
for lang, frequencies in self.language_trigrams.items():
# normalize and add to the total score
scores[lang] += (float(frequencies[trigram]) / float(frequencies.N())) * (float(count) / float(total))
return sorted(scores.items(), key=lambda x: x[1], reverse=True)[0][0]
def get_word_trigrams(self, match):
return [''.join(trigram) for trigram in nltk_trigrams(match) if trigram != None]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment