Skip to content

Instantly share code, notes, and snippets.

@hopped
Last active August 29, 2015 14:01
Show Gist options
  • Save hopped/1cc2451d336e860d8456 to your computer and use it in GitHub Desktop.
Save hopped/1cc2451d336e860d8456 to your computer and use it in GitHub Desktop.
Filtering mobile spam messages with Naive Bayes (includes text mining transformations)
# Download data set via:
# http://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
import numpy as np
import pandas as pd
import string
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# dataset
# The SMSSpamCollection file as distributed by UCI is tab-separated and has
# no header row, so the column names are supplied explicitly and header=None
# keeps the first message from being consumed as a header line.
df = pd.read_csv("SMSSpamCollection", sep="\t", names=["type", "text"], header=None)
# define training and test set
classes = df["type"]
corpus = df["text"]
# Positional split: the first n_train messages are used for training, the
# remainder is held out for evaluation. .iloc replaces the removed .ix
# indexer; [:4170] selects the same rows as the old inclusive 0:4169 slice.
n_train = 4170
df_train = corpus.iloc[:n_train]
df_test = corpus.iloc[n_train:]
target_train = classes.iloc[:n_train]
target_test = classes.iloc[n_train:]
# some preprocessing via nltk
stemmer = PorterStemmer()
class SimpleTokenizer(object):
    """Callable tokenizer that splits, lower-cases and Porter-stems a document.

    An instance is intended to be handed to ``CountVectorizer`` as its
    ``tokenizer`` argument.
    """

    def __init__(self):
        # One stemmer per tokenizer instance, reused for every document.
        self.stemmer = PorterStemmer()

    def __call__(self, doc):
        """Return the stemmed, lower-cased tokens of ``doc`` in order.

        ``doc`` is a single text document; it is split with
        ``nltk.word_tokenize`` and each token is lower-cased before stemming.
        """
        return [self.stemmer.stem(token.lower()) for token in word_tokenize(doc)]
# compute vector model
# NOTE(review): the built-in English stop-word list is combined with a
# tokenizer that stems its output -- confirm the stop words still match.
vectorizer = CountVectorizer(tokenizer=SimpleTokenizer(), stop_words="english")
# The vocabulary is learned from the training messages only; the test
# messages are then mapped onto that same vocabulary.
X_train = vectorizer.fit_transform(df_train)
X_test = vectorizer.transform(df_test)
# Class labels as plain strings: dtype=str avoids the byte-string labels
# that the fixed-width "|S6" dtype produces on Python 3.
y_train = np.asarray(target_train.values, dtype=str)
y_test = np.asarray(target_test.values, dtype=str)
# Fit the binarizer once, on the training labels only; a second fit on the
# test labels would simply overwrite this one.
lb = preprocessing.LabelBinarizer()
lb.fit(y_train)
# classification
clf = MultinomialNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
# metrics
# Keep and print the results: written as bare expressions they are silently
# discarded when this file is run as a script.
conf_matrix = confusion_matrix(y_test, pred)
accuracy = accuracy_score(y_test, pred)
print(conf_matrix)
print(accuracy)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment