austin hernamesbarbara

209 followers · 112 following

Brooklyn, NY

View GitHub Profile

Recently created

Least recently created

Recently updated

Least recently updated

hernamesbarbara / sample2.py

Created February 16, 2013 16:47

	import numpy as np
	import pandas as pd
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.naive_bayes import MultinomialNB

	# Read the tab-separated SMS file into a DataFrame whose two columns are
	# named "cat" and "message".
	df = pd.read_table(
		"./model/data/smsdata.txt",
		sep="\t",
		names=["cat", "message"],
	)

	# Token regex: a word boundary followed by three or more ASCII letters.
	pattern = r'(?u)\b[A-Za-z]{3,}'

	tfidf = TfidfVectorizer(sublinear_tf=True,

hernamesbarbara / sample3.py

Last active December 13, 2015 20:08

	import numpy as np
	import pandas as pd
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.naive_bayes import MultinomialNB
	from yhat import Yhat, BaseModel

	# Load the SMS data (tab-delimited) and label its columns "cat" and "message".
	df = pd.read_table(
		"./model/data/smsdata.txt",
		names=["cat", "message"],
		sep="\t",
	)

	# Regex used for tokenising: word boundary, then at least three ASCII letters.
	pattern = r'(?u)\b[A-Za-z]{3,}'

hernamesbarbara / sample4.py

Created February 16, 2013 17:24

	from yhat import Yhat
	# Create the Yhat client used by the later snippets (bound to `yh`).
	# NOTE(review): both arguments are hard-coded string literals; the first is
	# presumably the account username and the second looks like an API key.
	# Presumably placeholders -- confirm, and never commit a real key.
	yh = Yhat("myusername", "7swYZ2XaiP35dzSUaas08chzsewSUaasma8BgyDFU1E")

hernamesbarbara / sample5.py

Last active December 13, 2015 20:08

	# Fetch the model listing from the service and keep the value stored under
	# its 'models' key.
	models = yh.show_models()['models']
	# Parenthesised single-argument print: identical output on Python 2 and
	# valid syntax on Python 3 (the bare print statement is not).
	print(models)

	# Example output:
	# [{u'name': u'MySMSClassifier', u'username': u'myusername', u'version': 1},
	# {u'name': u'testModel', u'username': u'myusername', u'version': 1}]

hernamesbarbara / sample6.py

Last active December 13, 2015 20:08

	# Adjacent string literals are concatenated by the parser, so the message
	# is one string with exactly one space between "call" and "or". A
	# backslash-newline inside the quotes would instead splice the next source
	# line's leading whitespace (or nothing at all) into the text.
	text_message = ("Free ticket giveaway! Limted time only!! 300 remaining call "
		"or text (888) 555-1231 for info. Standard txt message rates apply (#12312)")

	# yh.predict is called with <model name>, <version number> and the text
	# to classify.
	v3 = 3
	# Parenthesised single-argument print: same output on Python 2, valid on
	# Python 3.
	print(yh.predict('MySMSClassifier', v3, text_message))

hernamesbarbara / yhat_sms_predictor.js

Last active December 13, 2015 20:08

	$ = require('jquery');

	serialize = function(obj) {
	var str = [];
	for(var p in obj)
	str.push(encodeURIComponent(p) + "=" + encodeURIComponent(obj[p]));
	return str.join("&");
	}

	url = 'http://api.yhathq.com/predict?'

hernamesbarbara / sample7.py

Created February 16, 2013 19:07

	import numpy as np
	import pandas as pd

	# converts collection of raw documents to a matrix of TF-IDF features
	from sklearn.feature_extraction.text import TfidfVectorizer

	# Naive Bayes works well for spam detection
	from sklearn.naive_bayes import MultinomialNB

hernamesbarbara / sample8.py

Last active December 13, 2015 20:09

	# Read the tab-separated SMS file; the two columns are named "cat" and
	# "message".
	df = pd.read_table("./model/data/smsdata.txt",
		sep="\t", names=["cat", "message"])

	# Count each label once and reuse the numbers below, instead of filtering
	# the frame again for every line of the report.
	n_ham = len(df[df["cat"] == "ham"])
	n_spam = len(df[df["cat"] == "spam"])
	n_total = len(df)

	# Parenthesised single-argument prints produce the same output on Python 2
	# and are valid on Python 3; print("") stands in for the bare `print` that
	# emitted an empty line.
	print("")
	print(" %d Not Spam" % n_ham)
	print("+ %d Spam" % n_spam)
	print(" ----")
	print(" %d Total" % n_total)
	print("")
	# float() forces true division; %d then truncates the percentage.
	print("Proportion spam: %d/100" % (100 * n_spam / float(n_total)))

hernamesbarbara / sample9.py

Last active December 13, 2015 20:09

	# Token regex: a word boundary followed by three or more ASCII letters.
	# Raw string, so the backslash needs no doubling (same pattern value).
	pattern = r'(?u)\b[A-Za-z]{3,}'
	# Configure the TF-IDF vectorizer with the token pattern above and
	# n-grams of length 1 to 3.
	tfidf = TfidfVectorizer(
		sublinear_tf=True,
		max_df=0.5,
		stop_words=None,
		token_pattern=pattern,
		ngram_range=(1, 3),
	)

	# Calculate features using TF-IDF and create the training set from the
	# "message" column.
	X_train = tfidf.fit_transform(df.message)
	# Parenthesised single-argument prints: same output on Python 2, valid on
	# Python 3. str() keeps the shape tuple from being unpacked by `%`.
	print("")
	print("X_train is a sparse matrix with shape: %s" % str(X_train.shape))
	print("")

hernamesbarbara / sample10.py

Created February 16, 2013 19:33

	# Create the list of training labels: 0 for "ham", 1 for any other value
	# of the "cat" column (i.e. spam).
	y_train = [0 if item == "ham" else 1 for item in df.cat]

	# Parenthesised single-argument prints: same output on Python 2, valid on
	# Python 3. Only the first 70 characters of the label list are shown.
	print("y_train is a list of categories: %s ..." % str(y_train)[:70])
	print("X_train has %d feature vectors" % X_train.shape[0])
	# len(y_train) is the number of labels, one per element of df.cat.
	print("y_train has %d target classes" % len(y_train))
	print(" df has %d rows" % len(df))
	print("")

	# create a Naive Bayes classifier