Last active
January 16, 2019 01:37
-
-
Save boates/5127281 to your computer and use it in GitHub Desktop.
Helpers for easily running sklearn classification algorithms on a pandas DataFrame, and for testing the accuracy of the resulting models.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def splitData(df, trainPerc=0.6, cvPerc=0.2, testPerc=0.2):
    """
    Randomly partition a dataframe into training, cross validation and test sets.

    return: training, cv, test
            (as pandas dataframes)
    params:
            df: pandas dataframe
            trainPerc: float | fraction of data for training set (default=0.6)
            cvPerc: float | fraction of data for cross validation set (default=0.2)
            testPerc: float | fraction of data for test set (default=0.2)
            (trainPerc + cvPerc + testPerc must equal 1.0)
    raises:
            AssertionError if the three fractions do not sum to 1.0

    The training and cv sizes are truncated to whole rows; the test set
    receives every remaining row, so no row is ever dropped.
    """
    from random import shuffle

    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly
    # 1.0 in floating point. Raised explicitly (rather than via `assert`) so
    # the check still runs under `python -O`, with the same exception type.
    total = trainPerc + cvPerc + testPerc
    if abs(total - 1.0) > 1e-9:
        raise AssertionError(
            "trainPerc + cvPerc + testPerc must equal 1.0 (got %r)" % (total,)
        )

    # create random list of row positions; list() is required because
    # shuffle() needs a mutable sequence
    rowCount = len(df)
    positions = list(range(rowCount))
    shuffle(positions)

    # get splitting indices
    trainLen = int(rowCount * trainPerc)
    cvLen = int(rowCount * cvPerc)
    cvEnd = trainLen + cvLen

    # get training, cv, and test sets; iloc selects by position, so this
    # works regardless of what labels the dataframe's index carries
    training = df.iloc[positions[:trainLen]]
    cv = df.iloc[positions[trainLen:cvEnd]]
    test = df.iloc[positions[cvEnd:]]

    return training, cv, test
def getScore(df, classifier, classTitle, trainPerc=0.80, testPerc=0.20):
    """
    Fit the classifier on a random training split of df and score it on
    the remaining test split.

    return: float | value of classifier.score on the test split
    params:
            df: pandas dataframe
            classifier: sklearn classifier (must provide fit and score)
            classTitle: string | title of class column in df
            trainPerc: fraction of data to train on (default=0.80)
            testPerc: fraction of data to test on (default=0.20)
            (trainPerc + testPerc = 1.0)
    raises:
            AssertionError if trainPerc + testPerc does not sum to 1.0

    Note: the classifier passed in is fitted in place.
    """
    # Tolerance-based check; exact float equality rejects valid pairs whose
    # sum is off by a rounding error. Raised explicitly so it survives -O.
    total = trainPerc + testPerc
    if abs(total - 1.0) > 1e-9:
        raise AssertionError(
            "trainPerc + testPerc must equal 1.0 (got %r)" % (total,)
        )

    # split the dataset (no cross validation set is needed here)
    training, _, test = splitData(df, trainPerc=trainPerc, cvPerc=0.00, testPerc=testPerc)

    # every column except the class column is a feature
    featureNames = [col for col in df.columns if col != classTitle]

    # map each class to a number using the FULL class column, so that a class
    # which happens to land only in the test split still has a mapping
    classToNumber = {}
    for number, label in enumerate(sorted(set(df[classTitle].values))):
        classToNumber[label] = number

    # fit the model on the training set
    trainFeatures = training[featureNames].values
    trainClasses = [classToNumber[c] for c in training[classTitle].values]
    classifier.fit(trainFeatures, trainClasses)

    # format the test set
    testFeatures = test[featureNames].values
    testClasses = [classToNumber[c] for c in test[classTitle].values]

    # compute the score on the test set
    return classifier.score(testFeatures, testClasses)
def testModel(df, classifier, classTitle, N=1, trainPerc=0.80, testPerc=0.20):
    """
    Score the classifier N times by calling getScore once per run.

    return: list[float] | one getScore result per run, in run order
    params:
            df: pandas dataframe
            classifier: sklearn classifier
            classTitle: string | title of class column in df
            N: int | number of tests to run (default=1)
            trainPerc: fraction of data to train on (default=0.80)
            testPerc: fraction of data to test on (default=0.20)
            (trainPerc + testPerc = 1.0)
    """
    # the same classifier object is handed to getScore on every run
    return [
        getScore(
            df=df,
            classifier=classifier,
            classTitle=classTitle,
            trainPerc=trainPerc,
            testPerc=testPerc,
        )
        for _ in range(N)
    ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment