dcollien · August 22, 2018 20:47
diff --git a/classify_text.py b/classify_text.py
 import nltk
 from summa.keywords import keywords

 def get_features(text, ratio=0.8):
     """Build a feature dict for ``text`` from its scored key phrases.

     Args:
         text: The document to extract key phrases from.
         ratio: Forwarded to ``keywords`` as its ``ratio`` argument. The
             default of 0.8 is the value that was previously hard-coded,
             so existing callers behave exactly as before.

     Returns:
         A dict built from the pairs that ``keywords`` returns when called
         with ``split=True`` and ``scores=True`` (phrase -> relevance
         score, per the original comment on this function).
     """
     return dict(keywords(text, ratio=ratio, split=True, scores=True))

 def train_texts(classified_texts):
     """Train a Naive Bayes classifier on labelled documents.

     Args:
         classified_texts: Iterable of ``(classification, text)`` pairs.

     Returns:
         The object returned by ``nltk.NaiveBayesClassifier.train`` when
         given one ``(features, classification)`` pair per input document.
     """
     # Each document is turned into its feature dict and paired with its label.
     labelled_features = [
         (get_features(document), label)
         for label, document in classified_texts
     ]
     return nltk.NaiveBayesClassifier.train(labelled_features)

 def classify(classifier, text):
     """Return the label ``classifier`` assigns to ``text``.

     Args:
         classifier: Object exposing a ``classify(features)`` method, such
             as the value returned by ``train_texts``.
         text: The document to classify.

     Returns:
         Whatever ``classifier.classify`` returns for the document's
         feature dict.
     """
     document_features = get_features(text)
     return classifier.classify(document_features)

 # Example usage. ``spam_text`` and ``ham_text`` are not defined in this
 # snippet; they stand for one sample document per class.
 classifier = train_texts([('spam', spam_text), ('ham', ham_text)])

 # True when the trained classifier labels ``ham_text`` as 'spam'.
 is_spam = classify(classifier, ham_text) == 'spam'
	import nltk
	from summa.keywords import keywords

	def get_features(text, ratio=0.8):
	    """Build a feature dict for ``text`` from its scored key phrases.

	    Args:
	        text: The document to extract key phrases from.
	        ratio: Forwarded to ``keywords`` as its ``ratio`` argument. The
	            default of 0.8 is the value that was previously hard-coded,
	            so existing callers behave exactly as before.

	    Returns:
	        A dict built from the pairs that ``keywords`` returns when called
	        with ``split=True`` and ``scores=True`` (phrase -> relevance
	        score, per the original comment on this function).
	    """
	    return dict(keywords(text, ratio=ratio, split=True, scores=True))

	def train_texts(classified_texts):
	    """Train a Naive Bayes classifier on labelled documents.

	    Args:
	        classified_texts: Iterable of ``(classification, text)`` pairs.

	    Returns:
	        The object returned by ``nltk.NaiveBayesClassifier.train`` when
	        given one ``(features, classification)`` pair per input document.
	    """
	    # Each document is turned into its feature dict and paired with its label.
	    labelled_features = [
	        (get_features(document), label)
	        for label, document in classified_texts
	    ]
	    return nltk.NaiveBayesClassifier.train(labelled_features)

	def classify(classifier, text):
	    """Return the label ``classifier`` assigns to ``text``.

	    Args:
	        classifier: Object exposing a ``classify(features)`` method, such
	            as the value returned by ``train_texts``.
	        text: The document to classify.

	    Returns:
	        Whatever ``classifier.classify`` returns for the document's
	        feature dict.
	    """
	    document_features = get_features(text)
	    return classifier.classify(document_features)

	# Example usage. ``spam_text`` and ``ham_text`` are not defined in this
	# snippet; they stand for one sample document per class.
	classifier = train_texts([('spam', spam_text), ('ham', ham_text)])

	# True when the trained classifier labels ``ham_text`` as 'spam'.
	is_spam = classify(classifier, ham_text) == 'spam'