Skip to content

Instantly share code, notes, and snippets.

View hernamesbarbara's full-sized avatar

austin hernamesbarbara

View GitHub Profile
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
# Load the tab-separated SMS data file into a two-column frame:
# "cat" (the label) and "message" (the text).
df = pd.read_table("./model/data/smsdata.txt", sep="\t", names=["cat", "message"])
# Regex: a word boundary followed by three or more ASCII letters.
pattern ='(?u)\\b[A-Za-z]{3,}'
# NOTE(review): the call below is cut off in this excerpt (no closing
# parenthesis or remaining arguments are visible here).
tfidf = TfidfVectorizer(sublinear_tf=True,
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from yhat import Yhat, BaseModel
# Regex: a word boundary followed by three or more ASCII letters.
pattern ='(?u)\\b[A-Za-z]{3,}'
# Read the tab-separated SMS data file; the file has no header row, so the
# two columns are named explicitly: "cat" (label) and "message" (text).
df = pd.read_table(
    "./model/data/smsdata.txt",
    sep="\t",
    names=["cat", "message"],
)
from yhat import Yhat
# Connect to the yhat service with a username and API key.
# NOTE(review): the API key is hard-coded in source; if this is a real
# credential it should be loaded from configuration instead -- confirm.
yh = Yhat("myusername", "7swYZ2XaiP35dzSUaas08chzsewSUaasma8BgyDFU1E")

# show_models() returns a mapping; the deployed models are under 'models'.
models = yh.show_models()['models']
# Single-argument print(...) produces the same output on Python 2 and 3.
print(models)
# [{u'name': u'MySMSClassifier', u'username': u'myusername', u'version': 1},
# {u'name': u'testModel', u'username': u'myusername', u'version': 1}]

# NOTE(review): the backslash continuation inside the literal joins the two
# lines with no space, so the message reads "...remaining callor text...".
# Left unchanged because it is the input sent to the model -- confirm intent.
text_message = "Free ticket giveaway! Limted time only!! 300 remaining call\
or text (888) 555-1231 for info. Standard txt message rates apply (#12312)"

# yhat.predict takes
# <model name> and <version number>, followed by the data to score.
# NOTE(review): the sample listing above shows version 1 while version 3 is
# requested here -- presumably a later deployment; confirm.
v3 = 3
print(yh.predict('MySMSClassifier', v3, text_message))
$ = require('jquery');
serialize = function(obj) {
var str = [];
for(var p in obj)
str.push(encodeURIComponent(p) + "=" + encodeURIComponent(obj[p]));
return str.join("&");
}
// Prediction endpoint URL; it ends in "?", presumably so an encoded query
// string can be appended directly (the rest of this snippet is not visible).
url = 'http://api.yhathq.com/predict?'
import numpy as np
import pandas as pd
# converts collection of raw documents to a matrix of TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer
# Naive Bayes works well for spam detection
from sklearn.naive_bayes import MultinomialNB
# Load the tab-separated SMS data file; the columns are named explicitly:
# "cat" (the label, "ham" or "spam") and "message" (the text).
# NOTE(review): default quoting treats '"' as a quote character; if messages
# contain unbalanced quotes, rows may be merged -- confirm against the data.
df = pd.read_table("./model/data/smsdata.txt",
                   sep="\t", names=["cat", "message"])

# Count each class once and reuse the counts in the summary below.
n_total = len(df)
n_ham = int((df["cat"] == "ham").sum())
n_spam = int((df["cat"] == "spam").sum())

# print("") / print(<one string>) emit the same text on Python 2 and 3.
print("")
print(" %d Not Spam" % n_ham)
print("+ %d Spam" % n_spam)
print(" ----")
print(" %d Total" % n_total)
print("")
# %d truncates the percentage toward zero; float() keeps true division on
# Python 2.
print("Proportion spam: %d/100" % (100 * n_spam / float(n_total)))

# Tokens are runs of three or more ASCII letters starting at a word boundary.
pattern ='(?u)\\b[A-Za-z]{3,}'
tfidf = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                        stop_words=None, token_pattern=pattern,
                        ngram_range=(1, 3))

# Calculate features using tf-idf and create a training set.
X_train = tfidf.fit_transform(df.message)
print("")
print("X_train is a sparse matrix with shape: %s" % str(X_train.shape))
print("")

# Create a list of training labels: 0 for "ham", 1 for anything else
# (i.e. spam).
y_train = [0 if item == "ham" else 1 for item in df.cat]
# Only the first 70 characters of the list's repr are shown.
print("y_train is a list of categories: %s ..." % str(y_train)[:70])

# Sanity check: the three counts below should all be equal.
print("X_train has %d feature vectors" % (X_train.shape[0]))
print("y_train has %d target classes" % (len(y_train)))
print(" df has %d rows" % (len(df)))
print("")
# create a Naive Bayes classifier