@samueljackson92
Last active June 9, 2016 15:37
Repeated cross validation
from sklearn import cross_validation
from sklearn import metrics
from sklearn.pipeline import Pipeline
import numpy as np
import collections

def repeated_cross_fold_validation(models, X, y, n=10, k=5, cv=None,
                                   score_function=metrics.accuracy_score):
    """Run cross validation on a set of models n times.

    All models are tested using the same cross validation splits
    at each iteration.

    Args:
        models: list of pipelines or a single estimator
        X: matrix of features
        y: vector of labels
        n: number of times to repeat cross validation (default 10)
        k: number of folds to use at each iteration (default 5)
        cv: cross validation object to use. If None, a shuffled KFold
            is created at each iteration. (default None)
        score_function: score function to use (default
            sklearn.metrics.accuracy_score)

    Returns:
        Dictionary of n by k matrices, one per model. If a single
        estimator was passed then the dictionary will have a single
        entry named 'model'.
    """
    if not isinstance(models, collections.Iterable):
        models = [Pipeline(steps=[('model', models)])]

    # init dict to store the results in
    results = {model.steps[-1][0]: np.empty((n, k)) for model in models}

    # wrap the plain score function so cross_val_score can use it
    scorer = metrics.make_scorer(score_function)

    for i in range(n):
        # create a new (shuffled) cross validation split for each iteration,
        # unless the caller supplied their own cv object
        cv_iter = cv if cv is not None else cross_validation.KFold(
            y.shape[0], n_folds=k, shuffle=True)

        # cross validate each of the models on the same splits
        for model in models:
            model_name = model.steps[-1][0]
            scores = cross_validation.cross_val_score(model, X, y=y,
                                                      cv=cv_iter,
                                                      scoring=scorer)
            results[model_name][i, :] = scores

    return results
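
For illustration, a minimal sketch of how repeated_cross_fold_validation might be called. The iris dataset and the pipeline names ('logistic', 'forest') are illustrative assumptions, not part of the gist:

# Sketch only: assumes the iris dataset and two example pipelines.
from sklearn import datasets, linear_model, ensemble
from sklearn.pipeline import Pipeline

iris = datasets.load_iris()
pipelines = [
    Pipeline(steps=[('logistic', linear_model.LogisticRegression())]),
    Pipeline(steps=[('forest', ensemble.RandomForestClassifier())]),
]

scores = repeated_cross_fold_validation(pipelines, iris.data, iris.target,
                                        n=10, k=5)
print(scores['logistic'].shape)  # (10, 5): one row of fold scores per repeat
print(scores['forest'].mean())   # mean accuracy over all repeats and folds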

def monte_carlo_validation(models, X, y, n=10,
                           splitter_func=cross_validation.train_test_split,
                           score_func=metrics.accuracy_score):
    """Run Monte Carlo cross validation on a set of models n times.

    This will randomly split the data into training and test sets n times
    and evaluate the performance of each model on each split.

    Args:
        models: list of pipelines or a single estimator
        X: matrix of features
        y: vector of labels
        n: number of random splits to evaluate (default 10)
        splitter_func: function that creates a random training/test split
            (default sklearn.cross_validation.train_test_split)
        score_func: score function to use (default
            sklearn.metrics.accuracy_score)

    Returns:
        Dictionary of arrays of size n, one per model. If a single
        estimator was passed then the dictionary will have a single
        entry named 'model'.
    """
    if not isinstance(models, collections.Iterable):
        models = [Pipeline(steps=[('model', models)])]

    # init dict to store the results in
    results = {model.steps[-1][0]: np.empty(n) for model in models}

    for i in range(n):
        x_train, x_valid, y_train, y_valid = splitter_func(X, y)

        for model in models:
            model_name = model.steps[-1][0]
            model.fit(x_train, y_train)

            # scorers built with needs_proba=True expect class probabilities;
            # plain score functions (e.g. accuracy_score) expect predictions
            if isinstance(score_func, metrics.scorer._ProbaScorer):
                y_hat = model.predict_proba(x_valid)
            else:
                y_hat = model.predict(x_valid)

            score = score_func(y_valid, y_hat)
            results[model_name][i] = score

    return results
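
Similarly, a minimal sketch of calling monte_carlo_validation with a single estimator; the digits dataset and the linear SVC are illustrative choices. A bare estimator is wrapped in a one-step pipeline, so its results appear under the key 'model':

# Sketch only: assumes the digits dataset and a linear SVM.
from sklearn import datasets, svm

digits = datasets.load_digits()
clf = svm.SVC(kernel='linear')

scores = monte_carlo_validation(clf, digits.data, digits.target, n=10)
print(scores['model'])         # 10 accuracy scores, one per random split
print(scores['model'].mean())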