Repeated cross validation
from sklearn import cross_validation
from sklearn import metrics
from sklearn.pipeline import Pipeline

import numpy as np
import collections


def repeated_cross_fold_validation(models, X, y, n=10, k=5, cv=None,
                                   score_func=metrics.accuracy_score):
    """Run k-fold cross validation on a set of models n times.

    All models are tested using the same cross validation splits
    at each iteration.

    Args:
        models: list of pipelines or a single estimator
        X: feature matrix
        y: label vector
        n: number of times to repeat cross validation (default 10)
        k: number of folds to use at each iteration (default 5)
        cv: cross validation object to use. If None, a new shuffled
            KFold is created for each repetition. (default None)
        score_func: score function to use (default
            sklearn.metrics.accuracy_score)

    Returns:
        Dictionary of n by k matrices, one for each model passed. If a
        single estimator was passed then the dictionary will have a
        single entry named 'model'.
    """
    # wrap a single estimator in a one-step pipeline named 'model'
    if not isinstance(models, collections.Iterable):
        models = [Pipeline(steps=[('model', models)])]

    # init dict to store results in, keyed by the final pipeline step name
    results = {model.steps[-1][0]: np.empty((n, k)) for model in models}

    for i in range(n):
        # create a fresh shuffled split for each repetition so the folds
        # actually differ between iterations; reuse cv if one was supplied
        cv_iter = cv if cv is not None else cross_validation.KFold(
            y.shape[0], n_folds=k, shuffle=True)

        # cross validate each of the models on the same splits
        for model in models:
            model_name = model.steps[-1][0]
            scores = cross_validation.cross_val_score(
                model, X, y=y, cv=cv_iter,
                scoring=metrics.make_scorer(score_func))
            results[model_name][i, :] = scores
    return results
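# Usage sketch for repeated_cross_fold_validation (illustrative, not part
# of the original gist): the iris dataset, pipeline step names, and
# estimators below are assumptions.
def _demo_repeated_cv():
    from sklearn import datasets
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC

    iris = datasets.load_iris()

    # two candidate pipelines; the final step name keys the results dict
    models = [
        Pipeline(steps=[('scale', StandardScaler()),
                        ('logistic', LogisticRegression())]),
        Pipeline(steps=[('scale', StandardScaler()),
                        ('svm', SVC())]),
    ]

    # 10 repetitions of 5-fold CV -> one 10x5 score matrix per model
    results = repeated_cross_fold_validation(models, iris.data, iris.target,
                                             n=10, k=5)
    for name, scores in results.items():
        print('%s: %.3f +/- %.3f' % (name, scores.mean(), scores.std()))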
def monte_carlo_validation(models, X, y, n=10,
                           splitter_func=cross_validation.train_test_split,
                           score_func=metrics.accuracy_score,
                           needs_proba=False):
    """Run Monte Carlo cross validation on a set of models n times.

    This randomly splits the data into training and validation sets
    n times and evaluates the performance of each model on each split.

    Args:
        models: list of pipelines or a single estimator
        X: feature matrix
        y: label vector
        n: number of random splits to evaluate (default 10)
        splitter_func: function that creates a random train/test split
            (default sklearn.cross_validation.train_test_split)
        score_func: score function to use (default
            sklearn.metrics.accuracy_score)
        needs_proba: whether score_func expects class probabilities
            (e.g. log_loss) rather than hard predictions (default False)

    Returns:
        Dictionary of arrays of size n, one for each model passed. If a
        single estimator was passed then the dictionary will have a
        single entry named 'model'.
    """
    # wrap a single estimator in a one-step pipeline named 'model'
    if not isinstance(models, collections.Iterable):
        models = [Pipeline(steps=[('model', models)])]

    # init dict to store results in, keyed by the final pipeline step name
    results = {model.steps[-1][0]: np.empty(n) for model in models}

    for i in range(n):
        x_train, x_valid, y_train, y_valid = splitter_func(X, y)
        for model in models:
            model_name = model.steps[-1][0]
            model.fit(x_train, y_train)

            # use predicted probabilities when the metric requires them
            if needs_proba:
                y_hat = model.predict_proba(x_valid)
            else:
                y_hat = model.predict(x_valid)

            results[model_name][i] = score_func(y_valid, y_hat)
    return results
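# Usage sketch for monte_carlo_validation (illustrative, not part of the
# original gist): the iris data, estimator, metric, and 25% holdout
# fraction below are assumptions.
if __name__ == '__main__':
    from functools import partial
    from sklearn import datasets
    from sklearn.linear_model import LogisticRegression

    iris = datasets.load_iris()

    # hold out 25% of the data on each of 20 random splits
    splitter = partial(cross_validation.train_test_split, test_size=0.25)

    # log_loss scores probabilities, so needs_proba is set
    mc_results = monte_carlo_validation(LogisticRegression(), iris.data,
                                        iris.target, n=20,
                                        splitter_func=splitter,
                                        score_func=metrics.log_loss,
                                        needs_proba=True)
    print('model: %.3f +/- %.3f' % (mc_results['model'].mean(),
                                    mc_results['model'].std()))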