Created
March 26, 2013 14:00
-
-
Save SuzanaK/5245578 to your computer and use it in GitHub Desktop.
NLTK Code Snippets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Language detection for German and English, using the stopword lists from the Python NLTK library
import nltk | |
# Words excluded from both stopword sets because they were too common
# (presumably because they occur in both languages -- verify).
COMMON_STOPWORDS = {'am', 'an', 'so', 'was', 'will'}

# Per-language stopword sets with the shared words above taken out.
GERMAN_STOPWORDS = set(nltk.corpus.stopwords.words('german')) - COMMON_STOPWORDS
ENGLISH_STOPWORDS = set(nltk.corpus.stopwords.words('english')) - COMMON_STOPWORDS
# Returns "english", "german" or "other".
# NLTK can handle unicode text but not encoded utf-8 byte strings.
def detect_language(sentences):
    """Guess whether the given text is German or English by counting stopwords.

    Args:
        sentences: An iterable of sentence strings. A single string is also
            accepted and is treated as one sentence.

    Returns:
        "other" if no German or English stopword was found at all,
        "german" if at least as many German as English stopwords were found
        (ties resolve to German), otherwise "english".
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(sentences, str):
        sentences = [sentences]

    counter_german = 0
    counter_english = 0
    for s in sentences:
        # The stopword sets are lowercase, so tokens are lowercased before the
        # lookup; otherwise capitalised words such as "The" or "Der" at the
        # start of a sentence would never be counted.
        words = [w.lower() for w in nltk.word_tokenize(s)]
        counter_german += sum(1 for w in words if w in GERMAN_STOPWORDS)
        counter_english += sum(1 for w in words if w in ENGLISH_STOPWORDS)

    if counter_german == 0 and counter_english == 0:
        return "other"
    if counter_german >= counter_english:
        return "german"
    return "english"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
# reader is a NLTK corpus reader
# Fetch the tagged sentences once and reuse them for both slices.
tagged = reader.tagged_sents()

# Use the first 90% of the sentences for training and the rest for testing.
cut = int(0.9 * len(tagged))
trains = tagged[:cut]
# Start the test set at `cut` (not `cut + 1`) so that the sentence at index
# `cut` is not silently dropped from both sets.
tests = tagged[cut:]

tagger = nltk.tag.ClassifierBasedPOSTagger(train=trains, cutoff_prob=0.95, verbose=True)
# Score the trained tagger on the held-out sentences.
tagger.evaluate(tests)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# various NLTK code snippets |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
# Build the Stanford POS tagger from the German model file and the tagger jar
# (both given as relative paths).
tagger = nltk.tag.stanford.POSTagger(
    '../../tagger/stanford/models/german-fast.tagger',
    '../../tagger/stanford/stanford-postagger.jar',
)

# Each sentence is utf-8 encoded before tokenizing, as this tagger model
# needs utf-8 encoded strings.
tagged_sentences = []
for s in sentences:
    tokens = nltk.word_tokenize(s.encode('utf-8'))
    tagged_sentences.append(tagger.tag(tokens))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment