This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Load the tab-separated SMS corpus into two columns: "cat" and "message".
# NOTE(review): "cat" presumably holds the ham/spam label - confirm against the data file.
df = pd.read_table("./model/data/smsdata.txt", sep="\t", names=["cat", "message"])

# Token regex: starting at a word boundary, a run of at least three ASCII
# letters, so digits, punctuation and 1-2 letter words never become tokens.
# Written as a raw string; the value is identical to the escaped form.
pattern = r'(?u)\b[A-Za-z]{3,}'
tfidf = TfidfVectorizer(sublinear_tf=True, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from yhat import Yhat, BaseModel

# Load the tab-separated SMS corpus into two columns: "cat" and "message".
# NOTE(review): "cat" presumably holds the ham/spam label - confirm against the data file.
df = pd.read_table("./model/data/smsdata.txt", sep="\t", names=["cat", "message"])

# Token regex: starting at a word boundary, a run of at least three ASCII
# letters, so digits, punctuation and 1-2 letter words never become tokens.
# Written as a raw string; the value is identical to the escaped form.
pattern = r'(?u)\b[A-Za-z]{3,}'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from yhat import Yhat

# Create the Yhat client from a username and an API key.
# NOTE(review): both values here are inline literals. Before using a real
# key, load it from configuration or the environment rather than committing
# it to source control.
yh = Yhat("myusername", "7swYZ2XaiP35dzSUaas08chzsewSUaasma8BgyDFU1E")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ask the service for the models visible to this account; the response is
# indexed by its 'models' key to get the list itself.
models = yh.show_models()
models = models['models']
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(models)
# Example output:
# [{u'name': u'MySMSClassifier', u'username': u'myusername', u'version': 1},
#  {u'name': u'testModel', u'username': u'myusername', u'version': 1}]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample message to score. Adjacent string literals are used instead of a
# backslash continuation inside the literal: the continuation dropped the
# line break without inserting a space, fusing "call" and "or" into "callor".
text_message = ("Free ticket giveaway! Limted time only!! 300 remaining call "
                "or text (888) 555-1231 for info. Standard txt message rates apply (#12312)")

# yh.predict is called with the <model name>, the <version number>, and the
# data to score.
v3 = 3
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(yh.predict('MySMSClassifier', v3, text_message))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ = require('jquery'); | |
serialize = function(obj) { | |
var str = []; | |
for(var p in obj) | |
str.push(encodeURIComponent(p) + "=" + encodeURIComponent(obj[p])); | |
return str.join("&"); | |
} | |
url = 'http://api.yhathq.com/predict?' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
# converts a collection of raw documents to a matrix of TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer
# Naive Bayes works well for spam detection
from sklearn.naive_bayes import MultinomialNB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the tab-separated SMS corpus: one label column ("cat", compared
# against "ham" and "spam" below) and one text column ("message").
df = pd.read_table("./model/data/smsdata.txt",
                   sep="\t", names=["cat", "message"])

# Count each class once and reuse the counts in the summary.
n_total = len(df)
n_ham = len(df[df["cat"] == "ham"])
n_spam = len(df[df["cat"] == "spam"])

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(" %d Not Spam" % n_ham)
print("+ %d Spam" % n_spam)
print(" ----")
print(" %d Total" % n_total)
# float() forces true division; %d then truncates the percentage to an integer.
print("Proportion spam: %d/100" % (100 * n_spam / float(n_total)))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Token regex: starting at a word boundary, a run of at least three ASCII
# letters. Written as a raw string; the value is identical to the escaped form.
pattern = r'(?u)\b[A-Za-z]{3,}'

# TF-IDF vectorizer settings (see the scikit-learn TfidfVectorizer docs):
#   sublinear_tf=True    - use 1 + log(tf) instead of the raw term frequency
#   max_df=0.5           - drop terms appearing in more than half the documents
#   stop_words=None      - no stop-word list is applied
#   ngram_range=(1, 3)   - features are unigrams, bigrams and trigrams
tfidf = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                        stop_words=None, token_pattern=pattern,
                        ngram_range=(1, 3))

# calculate features using tf-idf and create a training set
X_train = tfidf.fit_transform(df.message)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("X_train is a sparse matrix with shape: %s" % str(X_train.shape))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create the list of training labels: 0 for "ham", 1 for anything else
# (i.e. spam). The column is read as df["cat"], matching how it is indexed
# when the data is summarized.
y_train = [0 if label == "ham" else 1 for label in df["cat"]]

# Single-argument print(...) behaves the same on Python 2 and Python 3.
# Only the first 70 characters of the list's repr are shown.
print("y_train is a list of categories: %s ..." % str(y_train)[:70])
print("X_train has %d feature vectors" % (X_train.shape[0]))
# NOTE(review): len(y_train) is the number of labels (one per row), not the
# number of distinct classes; the message text is kept as-is.
print("y_train has %d target classes" % (len(y_train)))
print(" df has %d rows" % (len(df)))
# create a Naive Bayes classifier