"""Plot error curves for various dataset sizes | |
This plot emphasizes the fact that adding more data might help: by | |
following the trend, one can expect test score to further increase and | |
help reduce the overfit a bit as the slope of the test score curve is | |
still not null on the right hand side of the plot, especially when the prior / | |
regularizer parameter alpha is low. | |
Furthermore as the train error seems to remain pretty close to the perfect | |
score when the dataset size grows one can estimate that the model does not | |
underfit too much (is not too simplistic / biased). | |
See the following blog post for more details on how to interpret them: | |
http://digitheadslabnotebook.blogspot.com/2011/12/practical-advice-for-applying-machine.html | |
""" | |
import numpy as np

from time import time

from sklearn.cross_validation import ShuffleSplit
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.naive_bayes import MultinomialNB
from sklearn.base import clone


def score_curves(clf_orig, X, y, n_runs=5, test_fraction=0.1,
                 train_fraction_range=np.linspace(0.1, 0.9, 10)):
    """Compute train / test scores and training times for growing train sets."""
    n_samples = X.shape[0]
    n_datasets = train_fraction_range.shape[0]
    training_score = np.zeros((n_datasets, n_runs))
    test_score = np.zeros((n_datasets, n_runs))
    training_time = np.zeros((n_datasets, n_runs))

    for i, train_fraction in enumerate(train_fraction_range):
        print "Train fraction: %0.2f" % train_fraction
        # n_runs random splits for each training set size
        cv = ShuffleSplit(n_samples, n_iterations=n_runs,
                          test_fraction=test_fraction,
                          train_fraction=train_fraction)
        for j, (train, test) in enumerate(cv):
            # Fit a fresh, unfitted copy of the estimator on each split.
            clf = clone(clf_orig)
            t0 = time()
            clf.fit(X[train], y[train])
            training_time[i, j] = time() - t0
            training_score[i, j] = clf.score(X[train], y[train])
            test_score[i, j] = clf.score(X[test], y[test])

    return training_score, test_score, training_time
if __name__ == "__main__":
    data = fetch_20newsgroups_vectorized(subset='all')
    categories = data.target_names
    X = data.data
    y = data.target

    n_samples = y.shape[0]
    n_features = X.shape[1]
    print "n_samples: %d, n_features: %d" % (n_samples, n_features)

    clf = MultinomialNB(alpha=.01)

    train_fraction_range = np.linspace(0.1, 0.9, 5)
    test_fraction = 0.1
    train_score, test_score, training_time = score_curves(
        clf, X, y, train_fraction_range=train_fraction_range,
        test_fraction=test_fraction)
    print clf

    mean_test_score = test_score.mean(axis=1)
    mean_train_score = train_score.mean(axis=1)
    gap = np.abs(mean_test_score - mean_train_score)
    print "Best test score: %0.2f" % mean_test_score.max()
    print "Gap at train_fraction=%0.2f: %0.2f" % (
        train_fraction_range[-1], gap[-1])

    import pylab as pl
    plots = []
    plots.append(pl.errorbar(train_fraction_range,
                             mean_train_score,
                             train_score.std(axis=1)))
    plots.append(pl.errorbar(train_fraction_range,
                             mean_test_score,
                             test_score.std(axis=1)))
    pl.legend(plots, ('train', 'test'), loc='lower right')
    pl.title("Learning curves for %r\n"
             "Best test score: %0.2f - Gap: %0.2f" %
             (clf, mean_test_score.max(), gap[-1]))
    pl.ylim(0.0, 1.0)
    pl.ylabel('Classification score')
    pl.xlabel('Fraction of the dataset used for training\n'
              'The test fraction is fixed to %0.2f' % test_fraction)
    pl.show()
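Note that the script above targets an old scikit-learn API: sklearn.cross_validation and the ShuffleSplit parameters n_iterations / test_fraction / train_fraction were later removed. A minimal sketch of the same experiment against the current sklearn.model_selection API might look as follows (this assumes a recent scikit-learn and matplotlib; the variable names are illustrative and not part of the original gist):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.model_selection import ShuffleSplit, learning_curve
from sklearn.naive_bayes import MultinomialNB

data = fetch_20newsgroups_vectorized(subset='all')
clf = MultinomialNB(alpha=0.01)

# 5 random train / test splits with 10% of the data held out for testing.
cv = ShuffleSplit(n_splits=5, test_size=0.1, random_state=0)

# train_sizes fractions are relative to the largest available training set
# (90% of the data here, since 10% is reserved for testing).
train_sizes, train_scores, test_scores = learning_curve(
    clf, data.data, data.target, cv=cv,
    train_sizes=np.linspace(0.1, 1.0, 5))

plt.errorbar(train_sizes, train_scores.mean(axis=1),
             train_scores.std(axis=1), label='train')
plt.errorbar(train_sizes, test_scores.mean(axis=1),
             test_scores.std(axis=1), label='test')
plt.legend(loc='lower right')
plt.xlabel('Number of training samples')
plt.ylabel('Classification score')
plt.ylim(0.0, 1.0)
plt.show()

learning_curve handles the per-size resampling and scoring loop that score_curves implements by hand, so the interpretation of the resulting train / test curves is the same as described in the module docstring.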