SuzanaK · March 26, 2013 14:00
diff --git a/language_detection.py b/language_detection.py
 # Language detection for German and English, using stopwords from the python NLTK

 import nltk

 # Words removed from both stopword sets because they were too common
 # (presumably they occur in German and English alike -- TODO confirm).
 COMMON_STOPWORDS = {'am', 'an', 'so', 'was', 'will'}
 # Per-language stopword sets from the NLTK corpus, minus the common words.
 GERMAN_STOPWORDS = set(nltk.corpus.stopwords.words('german')) - COMMON_STOPWORDS
 ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words('english')) - COMMON_STOPWORDS
 
 # returns "english", "german" or "other"; sentences must be an iterable of strings
 # nltk can handle unicode but not encoded utf-8 strings
 
 def detect_language(sentences):
     """Guess whether the given sentences are German or English.

     Each sentence is tokenized with nltk.word_tokenize, every token is
     lower-cased and looked up in GERMAN_STOPWORDS and ENGLISH_STOPWORDS.
     The language with more stopword hits over all sentences wins.

     Args:
         sentences: iterable of strings, one sentence (or text) per item.

     Returns:
         "other" if no stopword of either language was found, "german" if
         there are at least as many German hits as English hits (a tie
         therefore counts as German), and "english" otherwise.
     """
     counter_german = 0
     counter_english = 0
     for s in sentences:
         # Membership in the stopword sets is case-sensitive and NLTK's
         # stopword lists are lower-case, so fold the tokens first; otherwise
         # capitalised stopwords such as "The" or "Der" are never counted.
         words = [w.lower() for w in nltk.word_tokenize(s)]
         counter_german += sum(1 for w in words if w in GERMAN_STOPWORDS)
         counter_english += sum(1 for w in words if w in ENGLISH_STOPWORDS)
     if counter_german == 0 and counter_english == 0:
         return "other"
     # Ties deliberately keep the original behaviour and resolve to German.
     if counter_german >= counter_english:
         return "german"
     return "english"
diff --git a/naive_bayes_tagger.py b/naive_bayes_tagger.py
 import nltk
 
 # reader is a NLTK corpus reader; it must already be defined when this runs.
 # Fetch the tagged sentences once instead of querying the reader three times.
 tagged_sents = reader.tagged_sents()
 # 90/10 split: the first 90% of the sentences train the tagger.
 cut = int(0.9 * len(tagged_sents))
 trains = tagged_sents[:cut]
 # Start the test slice at cut (not cut+1) so the sentence at index cut is
 # not silently dropped from both sets.
 tests = tagged_sents[cut:]
 tagger = nltk.tag.ClassifierBasedPOSTagger(train=trains, cutoff_prob=0.95, verbose=True)
 # Evaluates the tagger on the held-out sentences; the result is not stored.
 tagger.evaluate(tests)
diff --git a/NLTK code snippets b/NLTK code snippets
 # various NLTK code snippets
diff --git a/stanford_tagger_german.py b/stanford_tagger_german.py
 import nltk
 
 # Stanford POS tagger built from the German "fast" model file and the
 # tagger jar (both given as relative paths).
 tagger = nltk.tag.stanford.POSTagger(
     '../../tagger/stanford/models/german-fast.tagger',
     '../../tagger/stanford/stanford-postagger.jar',
 )
 # This tagger model needs UTF-8 encoded strings, so every sentence is
 # encoded before it is tokenized and tagged. `sentences` must be defined
 # before this snippet runs.
 tagged_sentences = [
     tagger.tag(nltk.word_tokenize(s.encode('utf-8')))
     for s in sentences
 ]
	# Language detection for German and English, using stopwords from the python NLTK

	import nltk

	# Words removed from both stopword sets because they were too common
	# (presumably they occur in German and English alike -- TODO confirm).
	COMMON_STOPWORDS = {'am', 'an', 'so', 'was', 'will'}
	# Per-language stopword sets from the NLTK corpus, minus the common words.
	GERMAN_STOPWORDS = set(nltk.corpus.stopwords.words('german')) - COMMON_STOPWORDS
	ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words('english')) - COMMON_STOPWORDS

	# returns "english", "german" or "other"; sentences must be an iterable of strings
	# nltk can handle unicode but not encoded utf-8 strings

	def detect_language(sentences):
	    """Guess whether the given sentences are German or English.

	    Each sentence is tokenized with nltk.word_tokenize, every token is
	    lower-cased and looked up in GERMAN_STOPWORDS and ENGLISH_STOPWORDS.
	    The language with more stopword hits over all sentences wins.

	    Args:
	        sentences: iterable of strings, one sentence (or text) per item.

	    Returns:
	        "other" if no stopword of either language was found, "german" if
	        there are at least as many German hits as English hits (a tie
	        therefore counts as German), and "english" otherwise.
	    """
	    counter_german = 0
	    counter_english = 0
	    for s in sentences:
	        # Membership in the stopword sets is case-sensitive and NLTK's
	        # stopword lists are lower-case, so fold the tokens first; otherwise
	        # capitalised stopwords such as "The" or "Der" are never counted.
	        words = [w.lower() for w in nltk.word_tokenize(s)]
	        counter_german += sum(1 for w in words if w in GERMAN_STOPWORDS)
	        counter_english += sum(1 for w in words if w in ENGLISH_STOPWORDS)
	    if counter_german == 0 and counter_english == 0:
	        return "other"
	    # Ties deliberately keep the original behaviour and resolve to German.
	    if counter_german >= counter_english:
	        return "german"
	    return "english"
	import nltk

	# reader is a NLTK corpus reader; it must already be defined when this runs.
	# Fetch the tagged sentences once instead of querying the reader three times.
	tagged_sents = reader.tagged_sents()
	# 90/10 split: the first 90% of the sentences train the tagger.
	cut = int(0.9 * len(tagged_sents))
	trains = tagged_sents[:cut]
	# Start the test slice at cut (not cut+1) so the sentence at index cut is
	# not silently dropped from both sets.
	tests = tagged_sents[cut:]
	tagger = nltk.tag.ClassifierBasedPOSTagger(train=trains, cutoff_prob=0.95, verbose=True)
	# Evaluates the tagger on the held-out sentences; the result is not stored.
	tagger.evaluate(tests)
	import nltk

	# Stanford POS tagger built from the German "fast" model file and the
	# tagger jar (both given as relative paths).
	tagger = nltk.tag.stanford.POSTagger(
	    '../../tagger/stanford/models/german-fast.tagger',
	    '../../tagger/stanford/stanford-postagger.jar',
	)
	# This tagger model needs UTF-8 encoded strings, so every sentence is
	# encoded before it is tokenized and tagged. `sentences` must be defined
	# before this snippet runs.
	tagged_sentences = [
	    tagger.tag(nltk.word_tokenize(s.encode('utf-8')))
	    for s in sentences
	]