Created
June 24, 2010 19:49
-
-
Save anonymous/451880 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk
import random
class Categorizer(object):
    """Naive Bayes text categorizer built from per-category seed articles.

    ``categories`` is stored as given.  ``doc_prep`` expects every category
    to expose ``name`` and ``seedarticle_set`` (a Django-style related
    manager whose items carry the raw text in ``seed``).
    """

    def __init__(self, categories):
        self.categories = categories
        # Vocabulary used by document_features(); populated by
        # test_accuracy().  Starting empty avoids an AttributeError when
        # document_features() is called first.
        self.word_features = []

    def prep_seed(self, content):
        """Convert seed content into nltk.Text"""
        # NOTE(review): nltk.clean_html is only functional in NLTK 2.x; later
        # releases are reported to raise NotImplementedError and point to
        # BeautifulSoup instead -- confirm against the installed version.
        raw = nltk.clean_html(content)
        tokens = nltk.word_tokenize(raw)
        return nltk.Text(tokens)

    def doc_prep(self, categories, output_list=None):
        """Collect ``(nltk.Text, category name)`` pairs for all seed articles.

        categories is a Django Queryset of categories.  ``output_list`` is
        extended in place and returned; a fresh list is created when it is
        omitted.
        """
        if output_list is None:
            output_list = []
        for c in categories:
            seeds = c.seedarticle_set.all().iterator()
            output_list.extend(
                [(self.prep_seed(s.seed), c.name) for s in seeds])
        return output_list

    def document_features(self, document):
        """Return a ``{'contains(word)': bool}`` dict for ``document``.

        ``document`` is any iterable of string tokens.  One entry is produced
        per word in ``self.word_features``.
        """
        # The vocabulary is built from lowercased tokens (see
        # test_accuracy), so the document must be lowercased as well or
        # capitalised occurrences would never match.
        document_words = set(w.lower() for w in document)
        features = {}
        for word in self.word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    def test_accuracy(self, test_size=500, num_features=2000):
        """Train on the seed articles and report hold-out accuracy.

        The shuffled documents are split into the first ``test_size`` items
        (test set) and the remainder (training set).  The ``num_features``
        most frequent lowercased words become ``self.word_features``.

        The accuracy is printed, as before, and also returned.

        Raises:
            ValueError: if ``test_size`` is less than 1, or if there are not
                more than ``test_size`` documents (the training slice would
                be empty).
        """
        if test_size < 1:
            raise ValueError("test_size must be at least 1")
        documents = self.doc_prep(self.categories, [])
        if len(documents) <= test_size:
            raise ValueError(
                "need more than %d documents to hold out %d for testing, "
                "got %d" % (test_size, test_size, len(documents)))
        random.shuffle(documents)

        all_words = nltk.FreqDist(
            w.lower() for (text, _) in documents for w in text.tokens)
        # Rank explicitly by count instead of relying on keys() being
        # frequency-ordered (and sliceable), which depends on the NLTK and
        # Python version in use.
        ranked = sorted(all_words, key=all_words.get, reverse=True)
        self.word_features = ranked[:num_features]

        featuresets = [(self.document_features(d), c) for (d, c) in documents]
        train_set, test_set = featuresets[test_size:], featuresets[:test_size]
        classifier = nltk.NaiveBayesClassifier.train(train_set)
        accuracy = nltk.classify.accuracy(classifier, test_set)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(accuracy)
        return accuracy
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment