Repeated cross validation
from sklearn import cross_validation
from sklearn import metrics
from sklearn.pipeline import Pipeline

import numpy as np
import collections


def repeated_cross_fold_validation(models, X, y, n=10, k=5, cv=None,
                                   score_func=metrics.accuracy_score):
    """Run k-fold cross validation on a set of models n times.

    All models are tested using the same cross validation splits
    at each iteration.

    Args:
        models: list of pipelines or a single estimator
        X: feature matrix
        y: label vector
        n: number of times to repeat cross validation (default 10)
        k: number of folds to use at each iteration (default 5)
        cv: cross validation object to use. If None, a new shuffled
            KFold is created for each repetition. (default None)
        score_func: score function to use (default
            sklearn.metrics.accuracy_score)

    Returns:
        Dictionary of n by k matrices, one for each model passed. If a
        single estimator was passed then the dictionary will have a
        single entry named 'model'.
    """
    # wrap a single estimator in a one-step pipeline named 'model'
    if not isinstance(models, collections.Iterable):
        models = [Pipeline(steps=[('model', models)])]

    # init dict to store results in, keyed by the final pipeline step name
    results = {model.steps[-1][0]: np.empty((n, k)) for model in models}

    for i in range(n):
        # create a fresh shuffled split for each repetition so the folds
        # actually differ between iterations; reuse cv if one was supplied
        cv_iter = cv if cv is not None else cross_validation.KFold(
            y.shape[0], n_folds=k, shuffle=True)

        # cross validate each of the models on the same splits
        for model in models:
            model_name = model.steps[-1][0]
            scores = cross_validation.cross_val_score(
                model, X, y=y, cv=cv_iter,
                scoring=metrics.make_scorer(score_func))
            results[model_name][i, :] = scores
    return results
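# Usage sketch for repeated_cross_fold_validation (illustrative, not part
# of the original gist): the iris dataset, pipeline step names, and
# estimators below are assumptions.
def _demo_repeated_cv():
    from sklearn import datasets
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC

    iris = datasets.load_iris()

    # two candidate pipelines; the final step name keys the results dict
    models = [
        Pipeline(steps=[('scale', StandardScaler()),
                        ('logistic', LogisticRegression())]),
        Pipeline(steps=[('scale', StandardScaler()),
                        ('svm', SVC())]),
    ]

    # 10 repetitions of 5-fold CV -> one 10x5 score matrix per model
    results = repeated_cross_fold_validation(models, iris.data, iris.target,
                                             n=10, k=5)
    for name, scores in results.items():
        print('%s: %.3f +/- %.3f' % (name, scores.mean(), scores.std()))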
def monte_carlo_validation(models, X, y, n=10,
                           splitter_func=cross_validation.train_test_split,
                           score_func=metrics.accuracy_score,
                           needs_proba=False):
    """Run Monte Carlo cross validation on a set of models n times.

    This randomly splits the data into training and validation sets
    n times and evaluates the performance of each model on each split.

    Args:
        models: list of pipelines or a single estimator
        X: feature matrix
        y: label vector
        n: number of random splits to evaluate (default 10)
        splitter_func: function that creates a random train/test split
            (default sklearn.cross_validation.train_test_split)
        score_func: score function to use (default
            sklearn.metrics.accuracy_score)
        needs_proba: whether score_func expects class probabilities
            (e.g. log_loss) rather than hard predictions (default False)

    Returns:
        Dictionary of arrays of size n, one for each model passed. If a
        single estimator was passed then the dictionary will have a
        single entry named 'model'.
    """
    # wrap a single estimator in a one-step pipeline named 'model'
    if not isinstance(models, collections.Iterable):
        models = [Pipeline(steps=[('model', models)])]

    # init dict to store results in, keyed by the final pipeline step name
    results = {model.steps[-1][0]: np.empty(n) for model in models}

    for i in range(n):
        x_train, x_valid, y_train, y_valid = splitter_func(X, y)
        for model in models:
            model_name = model.steps[-1][0]
            model.fit(x_train, y_train)

            # use predicted probabilities when the metric requires them
            if needs_proba:
                y_hat = model.predict_proba(x_valid)
            else:
                y_hat = model.predict(x_valid)

            results[model_name][i] = score_func(y_valid, y_hat)
    return results
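# Usage sketch for monte_carlo_validation (illustrative, not part of the
# original gist): the iris data, estimator, metric, and 25% holdout
# fraction below are assumptions.
if __name__ == '__main__':
    from functools import partial
    from sklearn import datasets
    from sklearn.linear_model import LogisticRegression

    iris = datasets.load_iris()

    # hold out 25% of the data on each of 20 random splits
    splitter = partial(cross_validation.train_test_split, test_size=0.25)

    # log_loss scores probabilities, so needs_proba is set
    mc_results = monte_carlo_validation(LogisticRegression(), iris.data,
                                        iris.target, n=20,
                                        splitter_func=splitter,
                                        score_func=metrics.log_loss,
                                        needs_proba=True)
    print('model: %.3f +/- %.3f' % (mc_results['model'].mean(),
                                    mc_results['model'].std()))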