Benchmarks for learning rate updating schemes in MLP
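The scripts below benchmark scikit-learn's MLPClassifier and MLPRegressor under different weight-update schemes: plain SGD with a constant learning rate (with and without classical or Nesterov momentum), inverse-scaling and adaptive learning-rate schedules, Adam, and batch l-bfgs, each with and without early stopping. For orientation, here is a minimal sketch of the configurations being compared. It mirrors the code in this gist, which targets a scikit-learn development branch where the optimizer is chosen via an "algorithm" parameter (released versions, 0.18 onward, call the same parameter "solver"); the names "common" and "schemes" are illustrative only.

# Minimal sketch of the update schemes compared in this gist (not part of
# the benchmark scripts themselves). On released scikit-learn, replace
# algorithm= with solver=.
from sklearn.neural_network import MLPClassifier

common = dict(hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
              tol=1e-4, random_state=1)
schemes = {
    # constant step size with Nesterov momentum
    'sgd_constant': MLPClassifier(algorithm='sgd', learning_rate_init=0.01,
                                  momentum=0.9, nesterovs_momentum=True,
                                  **common),
    # step size decays as learning_rate_init / t**power_t (power_t=0.5)
    'sgd_invscaling': MLPClassifier(algorithm='sgd', learning_rate_init=0.01,
                                    momentum=0.9, learning_rate='invscaling',
                                    **common),
    # step size divided by 5 whenever the training loss stops improving
    'sgd_adaptive': MLPClassifier(algorithm='sgd', learning_rate_init=0.01,
                                  momentum=0.9, learning_rate='adaptive',
                                  **common),
    # per-parameter step sizes from running estimates of the gradient's
    # first and second moments
    'adam': MLPClassifier(algorithm='adam', learning_rate_init=0.001,
                          **common),
}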
import numpy as np
from time import time
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier

h = .02  # step size in the mesh

ESTIMATORS = {
    'Adam': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='adam', learning_rate_init=0.001, verbose=1,
        tol=1e-4, random_state=1),
    'Adam_early': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='adam', learning_rate_init=0.001, verbose=1,
        tol=1e-4, random_state=1, early_stopping=True),
    'l-bfgs': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='l-bfgs', learning_rate_init=0.01, verbose=1,
        tol=1e-4, random_state=1, early_stopping=False),
}

names = ESTIMATORS.keys()
classifiers = ESTIMATORS.values()


def make_datasets(n_samples=100):
    X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                               random_state=1, n_clusters_per_class=1,
                               n_samples=n_samples)
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)
    datasets = [make_moons(noise=0.3, random_state=0, n_samples=n_samples),
                make_circles(noise=0.2, factor=0.5, random_state=1,
                             n_samples=n_samples),
                linearly_separable]
    return datasets


figure = plt.figure(figsize=(27, 9))
i = 0
# iterate over datasets of increasing size
sample_sizes = range(100, 1000, 400)
datasets = []
for n_samples in sample_sizes:
    datasets += make_datasets(n_samples)
for j, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(classifiers) + 1, len(datasets),
                     i % (len(classifiers) + 1) * len(datasets) + j + 1)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
               alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(str(len(y)), fontsize=10)  # dataset size
    i += 1

    # iterate over classifiers
    cnt = 0
    for name, clf in zip(names, classifiers):
        cnt += 1
        ax = plt.subplot(len(classifiers) + 1, len(datasets),
                         i % (len(classifiers) + 1) * len(datasets) + j + 1)
        time_start = time()
        clf.fit(X_train, y_train)
        train_time = time() - time_start
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to
        # each point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name, fontsize=10)
        # annotate with test accuracy (lower right) and training time in
        # seconds (lower left)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        ax.text(xx.min() + .3, yy.min() + .3,
                ('%.3f' % train_time).lstrip('0'),
                size=15, horizontalalignment='left')
        i += 1

figure.subplots_adjust(left=.02, right=.98)
plt.show()
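A note on the plotting idiom used above: each decision surface is rasterized by evaluating the classifier on a dense grid of points and reshaping the predictions back onto that grid. A self-contained sketch of just that round trip, with a simple threshold function standing in for a fitted classifier:

import numpy as np

h = .02
xx, yy = np.meshgrid(np.arange(-3, 3, h), np.arange(-3, 3, h))
grid = np.c_[xx.ravel(), yy.ravel()]  # (n_points, 2) query points
# stand-in for clf.predict_proba(grid)[:, 1] from a fitted classifier
z = (grid[:, 0] + grid[:, 1] > 0).astype(float)
z = z.reshape(xx.shape)  # back onto the grid, ready for contourf(xx, yy, z)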
""" | |
Benchmarking adam and lbfgs on Boston dataset | |
Regression performance: | |
=========================== | |
Regressor train-time test-time test-score | |
---------------------------------------------------------------------------- | |
adam 0.3896s 0.0003s 0.8606 | |
l-bfgs 0.5861s 0.0003s 0.8689 | |
adam-early 0.6177s 0.0004s 0.8750 | |
""" | |
from __future__ import print_function | |
import numpy as np | |
from time import time | |
import argparse | |
from sklearn import datasets | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.cross_validation import train_test_split | |
from sklearn.neural_network import MLPRegressor | |
# import some data to play with | |
def load_data(): | |
dataset = datasets.load_boston() | |
X = dataset.data # we only take the first two features. | |
X = StandardScaler().fit_transform(X) | |
y = dataset.target | |
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, | |
random_state=1) | |
scaler = StandardScaler() | |
train_X = scaler.fit_transform(train_X) | |
test_X = scaler.transform(test_X) | |
return train_X, test_X, train_y, test_y | |
ESTIMATORS = {'adam': MLPRegressor(random_state=1, | |
hidden_layer_sizes=(100, 100)), | |
'adam-early': MLPRegressor(random_state=1, early_stopping=True, | |
hidden_layer_sizes=(100, 100)), | |
'l-bfgs': MLPRegressor(algorithm='l-bfgs', random_state=1, | |
hidden_layer_sizes=(100, 100))} | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--estimators', nargs="+", | |
choices=ESTIMATORS.keys() + ['all'], type=str, | |
default=['adam', 'adam-early', 'l-bfgs'], | |
help="list of classifiers to benchmark.") | |
parser.add_argument('--n-jobs', nargs="?", default=1, type=int, | |
help="Number of concurrently running workers for " | |
"models that support parallelism.") | |
parser.add_argument('--random-seed', nargs="?", default=0, type=int, | |
help="Common seed used by random number generator.") | |
args = vars(parser.parse_args()) | |
print(__doc__) | |
X_train, X_test, y_train, y_test = load_data() | |
print("") | |
print("Dataset statistics:") | |
print("===================") | |
print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) | |
print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) | |
print("%s %s" % ("data type:".ljust(25), X_train.dtype)) | |
print("%s %d (size=%dMB)" % ("number of train samples:".ljust(25), | |
X_train.shape[0], int(X_train.nbytes / 1e6))) | |
print("%s %d (size=%dMB)" % ("number of test samples:".ljust(25), | |
X_test.shape[0], int(X_test.nbytes / 1e6))) | |
print() | |
print("Training Estimators") | |
print("====================") | |
error, train_time, test_time = {}, {}, {} | |
if 'all' in args['estimators']: | |
args['estimators'] = ESTIMATORS.keys() | |
for name in sorted(args["estimators"]): | |
print("Training %s ... " % name, end="") | |
estimator = ESTIMATORS[name] | |
estimator_params = estimator.get_params() | |
estimator.set_params(**{p: args["random_seed"] | |
for p in estimator_params | |
if p.endswith("random_state")}) | |
if "n_jobs" in estimator_params: | |
estimator.set_params(n_jobs=args["n_jobs"]) | |
time_start = time() | |
estimator.fit(X_train, y_train) | |
train_time[name] = time() - time_start | |
time_start = time() | |
y_pred = estimator.predict(X_test) | |
test_time[name] = time() - time_start | |
error[name] = estimator.score(X_test, y_test) | |
print("done") | |
print() | |
print("Regression performance:") | |
print("===========================") | |
print("{0: <23} {1: >10} {2: >11} {3: >12}" | |
"".format("Regressor ", "train-time", "test-time", | |
"test-score")) | |
print("-" * 76) | |
for name in sorted(args["estimators"], key=error.get): | |
print("{0: <24} {1: >10.4f}s {2: >10.4f}s {3: >12.4f}" | |
"".format(name, train_time[name], test_time[name], error[name])) | |
print() |
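The test-score column reported by this script is MLPRegressor.score, i.e. the coefficient of determination R^2. A small self-check of that identity, with made-up numbers:

import numpy as np
from sklearn.metrics import r2_score

y_true = np.array([3.0, -0.5, 2.0, 7.0])
y_pred = np.array([2.5, 0.0, 2.0, 8.0])
# R^2 = 1 - sum((y - yhat)**2) / sum((y - mean(y))**2)
ss_res = ((y_true - y_pred) ** 2).sum()
ss_tot = ((y_true - y_true.mean()) ** 2).sum()
assert np.isclose(1 - ss_res / ss_tot, r2_score(y_true, y_pred))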
""" | |
Benchmarking adam and lbfgs on Diabetes dataset | |
Regression performance: | |
=========================== | |
Regressor train-time test-time test-score | |
---------------------------------------------------------------------------- | |
adam-early 0.3612s 0.0002s 0.2961 | |
adam 0.4856s 0.0003s 0.3538 | |
l-bfgs 0.4855s 0.0003s 0.4170 | |
""" | |
from __future__ import print_function | |
import numpy as np | |
from time import time | |
import argparse | |
from sklearn import datasets | |
from sklearn.cross_validation import train_test_split | |
from sklearn.neural_network import MLPRegressor | |
# import some data to play with | |
def load_data(): | |
iris = datasets.load_diabetes() | |
X = iris.data # we only take the first two features. | |
y = iris.target | |
return train_test_split(X, y, test_size=0.2, random_state=1) | |
ESTIMATORS = {'adam': MLPRegressor(random_state=1, | |
hidden_layer_sizes=(100, 100)), | |
'adam-early': MLPRegressor(random_state=1, early_stopping=True, | |
hidden_layer_sizes=(100, 100)), | |
'l-bfgs': MLPRegressor(algorithm='l-bfgs', random_state=1, | |
hidden_layer_sizes=(100, 100))} | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--estimators', nargs="+", | |
choices=ESTIMATORS.keys() + ['all'], type=str, | |
default=['adam', 'adam-early', 'l-bfgs'], | |
help="list of classifiers to benchmark.") | |
parser.add_argument('--n-jobs', nargs="?", default=1, type=int, | |
help="Number of concurrently running workers for " | |
"models that support parallelism.") | |
parser.add_argument('--random-seed', nargs="?", default=0, type=int, | |
help="Common seed used by random number generator.") | |
args = vars(parser.parse_args()) | |
print(__doc__) | |
X_train, X_test, y_train, y_test = load_data() | |
print("") | |
print("Dataset statistics:") | |
print("===================") | |
print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) | |
print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) | |
print("%s %s" % ("data type:".ljust(25), X_train.dtype)) | |
print("%s %d (size=%dMB)" % ("number of train samples:".ljust(25), | |
X_train.shape[0], int(X_train.nbytes / 1e6))) | |
print("%s %d (size=%dMB)" % ("number of test samples:".ljust(25), | |
X_test.shape[0], int(X_test.nbytes / 1e6))) | |
print() | |
print("Training Estimators") | |
print("====================") | |
error, train_time, test_time = {}, {}, {} | |
if 'all' in args['estimators']: | |
args['estimators'] = ESTIMATORS.keys() | |
for name in sorted(args["estimators"]): | |
print("Training %s ... " % name, end="") | |
estimator = ESTIMATORS[name] | |
estimator_params = estimator.get_params() | |
estimator.set_params(**{p: args["random_seed"] | |
for p in estimator_params | |
if p.endswith("random_state")}) | |
if "n_jobs" in estimator_params: | |
estimator.set_params(n_jobs=args["n_jobs"]) | |
time_start = time() | |
estimator.fit(X_train, y_train) | |
train_time[name] = time() - time_start | |
time_start = time() | |
y_pred = estimator.predict(X_test) | |
test_time[name] = time() - time_start | |
error[name] = estimator.score(X_test, y_test) | |
print("done") | |
print() | |
print("Regression performance:") | |
print("===========================") | |
print("{0: <23} {1: >10} {2: >11} {3: >12}" | |
"".format("Regressor ", "train-time", "test-time", | |
"test-score")) | |
print("-" * 76) | |
for name in sorted(args["estimators"], key=error.get): | |
print("{0: <24} {1: >10.4f}s {2: >10.4f}s {3: >12.4f}" | |
"".format(name, train_time[name], test_time[name], error[name])) | |
print() |
""" | |
Classification performance: | |
=========================== | |
Classifier train-time test-time error-rate | |
---------------------------------------------------------------------------- | |
adam 1.1049s 0.0010s 0.0167 | |
l-bfgs 0.0910s 0.0008s 0.0306 | |
adam-early 0.1354s 0.0009s 0.0528 | |
""" | |
from __future__ import print_function | |
import numpy as np | |
from time import time | |
import argparse | |
from sklearn import datasets | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.metrics import zero_one_loss | |
from sklearn.cross_validation import train_test_split | |
from sklearn.neural_network import MLPClassifier | |
def load_data(): | |
dataset = datasets.load_digits() | |
X = dataset.data # we only take the first two features. | |
X = StandardScaler().fit_transform(X) | |
y = dataset.target | |
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, | |
random_state=1) | |
scaler = StandardScaler() | |
train_X = scaler.fit_transform(train_X) | |
test_X = scaler.transform(test_X) | |
return train_X, test_X, train_y, test_y | |
ESTIMATORS = {'adam': MLPClassifier(random_state=1), | |
'adam-early': MLPClassifier(random_state=1, early_stopping=True), | |
'l-bfgs': MLPClassifier(algorithm='l-bfgs', random_state=1)} | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--classifiers', nargs="+", | |
choices=ESTIMATORS.keys() + ['all'], type=str, | |
default=['adam', 'adam-early', 'l-bfgs'], | |
help="list of classifiers to benchmark.") | |
parser.add_argument('--n-jobs', nargs="?", default=1, type=int, | |
help="Number of concurrently running workers for " | |
"models that support parallelism.") | |
parser.add_argument('--random-seed', nargs="?", default=0, type=int, | |
help="Common seed used by random number generator.") | |
args = vars(parser.parse_args()) | |
print(__doc__) | |
X_train, X_test, y_train, y_test = load_data() | |
print("") | |
print("Dataset statistics:") | |
print("===================") | |
print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) | |
print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) | |
print("%s %s" % ("data type:".ljust(25), X_train.dtype)) | |
print("%s %d (size=%dMB)" % ("number of train samples:".ljust(25), | |
X_train.shape[0], int(X_train.nbytes / 1e6))) | |
print("%s %d (size=%dMB)" % ("number of test samples:".ljust(25), | |
X_test.shape[0], int(X_test.nbytes / 1e6))) | |
print() | |
print("Training Classifiers") | |
print("====================") | |
error, train_time, test_time, loss_curve, val_curve = {}, {}, {}, {}, {} | |
if 'all' in args['classifiers']: | |
args['classifiers'] = ESTIMATORS.keys() | |
for name in sorted(args["classifiers"]): | |
print("Training %s ... " % name, end="") | |
estimator = ESTIMATORS[name] | |
estimator_params = estimator.get_params() | |
estimator.set_params(**{p: args["random_seed"] | |
for p in estimator_params | |
if p.endswith("random_state")}) | |
if "n_jobs" in estimator_params: | |
estimator.set_params(n_jobs=args["n_jobs"]) | |
time_start = time() | |
estimator.fit(X_train, y_train) | |
train_time[name] = time() - time_start | |
time_start = time() | |
y_pred = estimator.predict(X_test) | |
test_time[name] = time() - time_start | |
error[name] = zero_one_loss(y_test, y_pred) | |
print("done") | |
print() | |
print("Classification performance:") | |
print("===========================") | |
print("{0: <23} {1: >10} {2: >11} {3: >12}" | |
"".format("Classifier ", "train-time", "test-time", "error-rate")) | |
print("-" * 76) | |
for name in sorted(args["classifiers"], key=error.get): | |
print("{0: <24} {1: >10.4f}s {2: >10.4f}s {3: >12.4f}" | |
"".format(name, train_time[name], test_time[name], error[name])) | |
print() |
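The error-rate column here is zero_one_loss with its default normalize=True, which is simply one minus accuracy. A toy check with made-up labels:

import numpy as np
from sklearn.metrics import zero_one_loss, accuracy_score

y_true = [0, 1, 2, 2, 1]
y_pred = [0, 2, 2, 2, 1]
assert np.isclose(zero_one_loss(y_true, y_pred),
                  1 - accuracy_score(y_true, y_pred))  # both 0.2 here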
""" | |
Classification performance: | |
=========================== | |
Classifier train-time test-time error-rate | |
---------------------------------------------------------------------------- | |
adam 0.1003s 0.0002s 0.0333 | |
l-bfgs 0.0344s 0.0001s 0.0333 | |
adam-early 0.0083s 0.0001s 0.5333 | |
""" | |
from __future__ import print_function | |
import numpy as np | |
from time import time | |
import argparse | |
from sklearn import datasets | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.metrics import zero_one_loss | |
from sklearn.cross_validation import train_test_split | |
from sklearn.neural_network import MLPClassifier | |
def load_data(): | |
dataset = datasets.load_iris() | |
X = dataset.data # we only take the first two features. | |
X = StandardScaler().fit_transform(X) | |
y = dataset.target | |
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, | |
random_state=1) | |
scaler = StandardScaler() | |
train_X = scaler.fit_transform(train_X) | |
test_X = scaler.transform(test_X) | |
return train_X, test_X, train_y, test_y | |
ESTIMATORS = {'adam': MLPClassifier(random_state=1), | |
'adam-early': MLPClassifier(random_state=1, early_stopping=True), | |
'l-bfgs': MLPClassifier(algorithm='l-bfgs', random_state=1)} | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--classifiers', nargs="+", | |
choices=ESTIMATORS.keys() + ['all'], type=str, | |
default=['adam', 'adam-early', 'l-bfgs'], | |
help="list of classifiers to benchmark.") | |
parser.add_argument('--n-jobs', nargs="?", default=1, type=int, | |
help="Number of concurrently running workers for " | |
"models that support parallelism.") | |
parser.add_argument('--random-seed', nargs="?", default=0, type=int, | |
help="Common seed used by random number generator.") | |
args = vars(parser.parse_args()) | |
print(__doc__) | |
X_train, X_test, y_train, y_test = load_data() | |
print("") | |
print("Dataset statistics:") | |
print("===================") | |
print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) | |
print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) | |
print("%s %s" % ("data type:".ljust(25), X_train.dtype)) | |
print("%s %d (size=%dMB)" % ("number of train samples:".ljust(25), | |
X_train.shape[0], int(X_train.nbytes / 1e6))) | |
print("%s %d (size=%dMB)" % ("number of test samples:".ljust(25), | |
X_test.shape[0], int(X_test.nbytes / 1e6))) | |
print() | |
print("Training Classifiers") | |
print("====================") | |
error, train_time, test_time, loss_curve, val_curve = {}, {}, {}, {}, {} | |
if 'all' in args['classifiers']: | |
args['classifiers'] = ESTIMATORS.keys() | |
for name in sorted(args["classifiers"]): | |
print("Training %s ... " % name, end="") | |
estimator = ESTIMATORS[name] | |
estimator_params = estimator.get_params() | |
estimator.set_params(**{p: args["random_seed"] | |
for p in estimator_params | |
if p.endswith("random_state")}) | |
if "n_jobs" in estimator_params: | |
estimator.set_params(n_jobs=args["n_jobs"]) | |
time_start = time() | |
estimator.fit(X_train, y_train) | |
train_time[name] = time() - time_start | |
time_start = time() | |
y_pred = estimator.predict(X_test) | |
test_time[name] = time() - time_start | |
error[name] = zero_one_loss(y_test, y_pred) | |
print("done") | |
print() | |
print("Classification performance:") | |
print("===========================") | |
print("{0: <23} {1: >10} {2: >11} {3: >12}" | |
"".format("Classifier ", "train-time", "test-time", "error-rate")) | |
print("-" * 76) | |
for name in sorted(args["classifiers"], key=error.get): | |
print("{0: <24} {1: >10.4f}s {2: >10.4f}s {3: >12.4f}" | |
"".format(name, train_time[name], test_time[name], error[name])) | |
print() |
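The outsized adam-early error above (0.5333 versus 0.0333 without early stopping) is plausibly an artifact of dataset size rather than of Adam itself: early_stopping=True holds out a fraction of the training data (validation_fraction, 0.1 by default in released scikit-learn) and stops when the held-out score stalls, and with only 120 iris training rows that validation set has roughly 12 samples, so its score is very noisy. A hypothetical mitigation, enlarging the holdout:

from sklearn.neural_network import MLPClassifier

# larger, less noisy validation holdout for early stopping
clf = MLPClassifier(random_state=1, early_stopping=True,
                    validation_fraction=0.2)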
""" | |
Benchmarking MLP Performances on 20NewGroup dataset | |
Classification performance: | |
=========================== | |
Classifier train-time test-time Accuracy | |
------------------------------------------------------------------- | |
MLP_SGD_constant_no_momentum_early 61.9779s 0.1686s 0.0333 | |
MLP_SGD_invscaling_nesterov 143.6211s 0.1713s 0.0506 | |
MLP_SGD_invscaling_nesterov_early 212.4194s 0.1691s 0.0769 | |
MLP_SGD_constant_no_momentum 6882.5231s 0.1800s 0.5842 | |
MLP_SGD_constant_nesterov_early 1871.2481s 0.1793s 0.7302 | |
MLP_SGD_adaptive_nesterov_early 2395.0412s 0.1822s 0.7382 | |
MLP_SGD_constant_nesterov 5725.5833s 0.1799s 0.7649 | |
MLP_SGD_adaptive_nesterov 6263.0604s 0.1733s 0.7678 | |
MLP_SGD_constant_momentum 4174.8977s 0.1666s 0.7678 | |
MLP_Adam 1395.2267s 0.1822s 0.8314 | |
MLP_Adam_early 528.2558s 0.1718s 0.8330 | |
with learning_rate_init=0.1 for sgd: | |
Classification performance: | |
=========================== | |
Classifier train-time test-time Accuracy | |
------------------------------------------------------------------- | |
MLP_SGD_invscaling_nesterov_early 96.1587s 0.1757s 0.1032 | |
MLP_SGD_invscaling_nesterov 11964.9086s 0.2026s 0.1374 | |
MLP_SGD_constant_no_momentum_early 326.6792s 0.2224s 0.1620 | |
MLP_SGD_constant_nesterov_early 407.4799s 0.2217s 0.6190 | |
MLP_SGD_adaptive_nesterov_early 2152.5739s 0.2308s 0.7338 | |
MLP_SGD_constant_momentum 1169.3004s 0.2609s 0.7617 | |
MLP_SGD_constant_nesterov 1634.3541s 0.2659s 0.7681 | |
MLP_SGD_constant_no_momentum 5292.0535s 0.2432s 0.7747 | |
MLP_SGD_adaptive_nesterov 2166.2128s 0.2393s 0.7781 | |
MLP_Adam 1740.3218s 0.2305s 0.8314 | |
MLP_Adam_early 636.6075s 0.1822s 0.8330 | |
""" | |
from __future__ import print_function, division | |
from time import time | |
import cPickle as pickle | |
import argparse | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from sklearn.datasets import fetch_20newsgroups_vectorized | |
from sklearn.metrics import accuracy_score | |
from sklearn.utils.validation import check_array | |
from sklearn.neural_network import MLPClassifier | |
def make_plots(loss, val_loss): | |
non_early = [name for name in loss if not name.endswith('_early')] | |
early = [name for name in loss if name.endswith('_early')] | |
fig, axes = plt.subplots(2, 3, figsize=(15, 10)) | |
make_sub_plot({name: loss[name] for name in non_early}, axes.ravel()[0]) | |
for name, ax in zip(early, axes.ravel()[1:]): | |
make_sub_plot({name[:-6]: loss[name[:-6]], name: loss[name], | |
name + '_val': val_loss[name]}, ax) | |
plt.subplots_adjust(hspace=0.45) | |
plt.subplots_adjust(top=0.8) | |
plt.show() | |
def make_sub_plot(loss, ax): | |
plot_args = [{'c': 'red', 'linestyle': '-'}, | |
{'c': 'green', 'linestyle': '-'}, | |
{'c': 'blue', 'linestyle': '-'}, | |
{'c': 'red', 'linestyle': '--'}, | |
{'c': 'green', 'linestyle': '--'}, | |
{'c': 'blue', 'linestyle': '--'}] | |
for label, loss_curve, args in zip(loss.keys(), loss.values(), plot_args): | |
ax.plot(loss_curve, label=label, **args) | |
if len(loss) > 3: | |
ax.legend(ax.get_lines(), labels=loss.keys(), loc='center right', | |
bbox_to_anchor=(0.95, 1.30), fontsize=11) | |
else: | |
ax.legend(ax.get_lines(), labels=loss.keys(), loc='center right', | |
bbox_to_anchor=(1.05, 1.20), fontsize=11) | |
ESTIMATORS = { | |
'MLP_SGD_constant_no_momentum': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False), | |
'MLP_SGD_constant_no_momentum_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False, | |
early_stopping=True), | |
'MLP_SGD_constant_momentum': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False), | |
'MLP_SGD_constant_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1), | |
'MLP_SGD_constant_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
early_stopping=True), | |
'MLP_SGD_invscaling_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='invscaling'), | |
'MLP_SGD_invscaling_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='invscaling', early_stopping=True), | |
'MLP_SGD_adaptive_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='adaptive'), | |
'MLP_SGD_adaptive_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='adaptive', early_stopping=True), | |
'MLP_Adam': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='adam', learning_rate_init=0.001, verbose=1, | |
tol=1e-4, random_state=1), | |
'MLP_Adam_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='adam', learning_rate_init=0.001, verbose=1, | |
tol=1e-4, random_state=1, early_stopping=True), | |
} | |
############################################################################### | |
# Data | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('-e', '--estimators', nargs="+", required=True, | |
choices=ESTIMATORS.keys() + ['all']) | |
args = vars(parser.parse_args()) | |
data_train = fetch_20newsgroups_vectorized(subset="train") | |
data_test = fetch_20newsgroups_vectorized(subset="test") | |
X_train = check_array(data_train.data, dtype=np.float32, | |
accept_sparse="csc") | |
X_test = check_array(data_test.data, dtype=np.float32, accept_sparse="csr") | |
y_train = data_train.target | |
y_test = data_test.target | |
print("20 newsgroups") | |
print("=============") | |
print("X_train.shape = {0}".format(X_train.shape)) | |
print("X_train.format = {0}".format(X_train.format)) | |
print("X_train.dtype = {0}".format(X_train.dtype)) | |
print("X_train density = {0}" | |
"".format(X_train.nnz / np.product(X_train.shape))) | |
print("y_train {0}".format(y_train.shape)) | |
print("X_test {0}".format(X_test.shape)) | |
print("X_test.format = {0}".format(X_test.format)) | |
print("X_test.dtype = {0}".format(X_test.dtype)) | |
print("y_test {0}".format(y_test.shape)) | |
print() | |
print("Classifier Training") | |
print("===================") | |
accuracy, train_time, test_time, loss_curve, val_curve = {}, {}, {}, {}, {} | |
if 'all' in args['estimators']: | |
args['estimators'] = ESTIMATORS.keys() | |
for name in sorted(args["estimators"]): | |
clf = ESTIMATORS[name] | |
try: | |
clf.set_params(random_state=0) | |
except (TypeError, ValueError): | |
pass | |
print("Training %s ... " % name, end="") | |
t0 = time() | |
clf.fit(X_train, y_train) | |
train_time[name] = time() - t0 | |
t0 = time() | |
y_pred = clf.predict(X_test) | |
test_time[name] = time() - t0 | |
accuracy[name] = accuracy_score(y_test, y_pred) | |
loss_curve[name] = clf.loss_curve_ | |
val_curve[name] = getattr(clf, 'validation_scores_', []) | |
print("done") | |
print() | |
print("Classification performance:") | |
print("===========================") | |
print() | |
print("%s %s %s %s" % ("Classifier ", "train-time", "test-time", | |
"Accuracy")) | |
print("-" * 67) | |
for name in sorted(accuracy, key=accuracy.get): | |
print("%s %s %s %s" % (name.ljust(36), | |
("%.4fs" % train_time[name]).center(10), | |
("%.4fs" % test_time[name]).center(10), | |
("%.4f" % accuracy[name]).center(10))) | |
print() | |
with open('loss_history_20news.pkl', 'wb') as f: | |
pickle.dump(loss_curve, f) | |
with open('val_loss_history_20news.pkl', 'wb') as f: | |
pickle.dump(val_curve, f) | |
make_plots(loss_curve, val_curve) |
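Because the 20 Newsgroups runs take hours, the script pickles the per-epoch loss_curve_ and validation_scores_ so convergence can be re-plotted later without retraining. A minimal sketch of reloading one of the dumps written above:

import pickle
import matplotlib.pyplot as plt

with open('loss_history_20news.pkl', 'rb') as f:
    loss_curve = pickle.load(f)
for name, curve in sorted(loss_curve.items()):
    plt.plot(curve, label=name)
plt.xlabel('epoch')
plt.ylabel('training loss')
plt.legend(fontsize=8)
plt.show()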
""" | |
Benchmarking MLP performance on MNIST dataset | |
Classification performance: | |
=========================== | |
Classifier train-time test-time error-rate | |
---------------------------------------------------------------------------- | |
MLP_SGD_constant_momentum 105.07s 0.10s 0.0205 | |
MLP_SGD_adaptive_nesterov 166.20s 0.10s 0.0213 | |
MLP_SGD_constant_nesterov 123.11s 0.11s 0.0219 | |
MLP_Adam 49.43s 0.26s 0.0224 | |
MLP_SGD_constant_no_momentum 532.17s 0.11s 0.0231 | |
MLP_Adam_early 19.61s 0.12s 0.0241 | |
MLP_SGD_adaptive_nesterov_early 57.51s 0.11s 0.0251 | |
MLP_SGD_constant_nesterov_early 29.66s 0.11s 0.0283 | |
MLP_SGD_constant_no_momentum_early 95.10s 0.11s 0.0388 | |
MLP_SGD_invscaling_nesterov 46.28s 0.14s 0.0785 | |
MLP_SGD_invscaling_nesterov_early 17.27s 0.12s 0.0817 | |
with learning_rate_init=0.1 for sgd: | |
Classification performance: | |
=========================== | |
Classifier train-time test-time error-rate | |
---------------------------------------------------------------------------- | |
MLP_SGD_constant_momentum 37.69s 0.13s 0.0170 | |
MLP_SGD_constant_nesterov 48.36s 0.13s 0.0171 | |
MLP_SGD_adaptive_nesterov 91.48s 0.13s 0.0171 | |
MLP_SGD_adaptive_nesterov_early 57.50s 0.12s 0.0197 | |
MLP_SGD_constant_no_momentum 112.04s 0.13s 0.0204 | |
MLP_SGD_constant_nesterov_early 19.50s 0.14s 0.0213 | |
MLP_Adam 55.58s 0.14s 0.0224 | |
MLP_SGD_constant_no_momentum_early 39.07s 0.13s 0.0229 | |
MLP_Adam_early 22.73s 0.13s 0.0241 | |
MLP_SGD_invscaling_nesterov 107.90s 0.15s 0.0304 | |
MLP_SGD_invscaling_nesterov_early 44.23s 0.16s 0.0345 | |
""" | |
from __future__ import print_function | |
import os | |
import cPickle as pickle | |
from time import time | |
import argparse | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from sklearn.datasets import fetch_mldata | |
from sklearn.datasets import get_data_home | |
from sklearn.externals.joblib import Memory | |
from sklearn.metrics import zero_one_loss | |
from sklearn.utils import check_array | |
from sklearn.neural_network import MLPClassifier | |
# Memoize the data extraction and memory map the resulting | |
# train / test splits in readonly mode | |
memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'), | |
mmap_mode='r') | |
@memory.cache | |
def load_data(dtype=np.float32, order='F'): | |
"""Load the data, then cache and memmap the train/test split""" | |
###################################################################### | |
# Load dataset | |
print("Loading dataset...") | |
data = fetch_mldata('MNIST original') | |
X = check_array(data['data'], dtype=dtype, order=order) | |
y = data["target"] | |
# Normalize features | |
X = X / 255 | |
# Create train-test split (as [Joachims, 2006]) | |
print("Creating train-test split...") | |
n_train = 60000 | |
X_train = X[:n_train] | |
y_train = y[:n_train] | |
X_test = X[n_train:] | |
y_test = y[n_train:] | |
return X_train, X_test, y_train, y_test | |
def make_plots(loss, val_loss): | |
non_early = [name for name in loss if not name.endswith('_early')] | |
early = [name for name in loss if name.endswith('_early')] | |
fig, axes = plt.subplots(2, 3, figsize=(15, 10)) | |
# not including MLP_SGD_constant_no_momentum because the number of | |
# iterations is too large | |
make_sub_plot({name: loss[name] for name in non_early | |
if name != 'MLP_SGD_constant_no_momentum'}, axes.ravel()[0]) | |
for name, ax in zip(early, axes.ravel()[1:]): | |
make_sub_plot({name[:-6]: loss[name[:-6]], name: loss[name], | |
name + '_val': val_loss[name]}, ax) | |
plt.subplots_adjust(hspace=0.45) | |
plt.subplots_adjust(top=0.8) | |
plt.show() | |
def make_sub_plot(loss, ax): | |
plot_args = [{'c': 'red', 'linestyle': '-'}, | |
{'c': 'green', 'linestyle': '-'}, | |
{'c': 'blue', 'linestyle': '-'}, | |
{'c': 'red', 'linestyle': '--'}, | |
{'c': 'green', 'linestyle': '--'}, | |
{'c': 'blue', 'linestyle': '--'}] | |
for label, loss_curve, args in zip(loss.keys(), loss.values(), plot_args): | |
ax.plot(loss_curve, label=label, **args) | |
if len(loss) > 3: | |
ax.legend(ax.get_lines(), labels=loss.keys(), loc='center right', | |
bbox_to_anchor=(0.95, 1.30), fontsize=11) | |
else: | |
ax.legend(ax.get_lines(), labels=loss.keys(), loc='center right', | |
bbox_to_anchor=(1.05, 1.20), fontsize=11) | |
ESTIMATORS = { | |
'MLP_SGD_constant_no_momentum': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False), | |
'MLP_SGD_constant_no_momentum_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False, | |
early_stopping=True), | |
'MLP_SGD_constant_momentum': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False), | |
'MLP_SGD_constant_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1), | |
'MLP_SGD_constant_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
early_stopping=True), | |
'MLP_SGD_invscaling_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='invscaling'), | |
'MLP_SGD_invscaling_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='invscaling', early_stopping=True), | |
'MLP_SGD_adaptive_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='adaptive'), | |
'MLP_SGD_adaptive_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='adaptive', early_stopping=True), | |
'MLP_Adam': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='adam', learning_rate_init=0.001, verbose=1, | |
tol=1e-4, random_state=1), | |
'MLP_Adam_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='adam', learning_rate_init=0.001, verbose=1, | |
tol=1e-4, random_state=1, early_stopping=True), | |
} | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--classifiers', nargs="+", | |
choices=ESTIMATORS.keys() + ['all'], type=str, | |
default=['MLP_SGD_constant_no_momentum', | |
'MLP_SGD_constant_momentum', | |
'MLP_SGD_constant_nesterov', | |
'MLP_Adam', 'MLP_Adam_early', | |
'MLP_SGD_constant_nesterov_early'], | |
help="list of classifiers to benchmark.") | |
parser.add_argument('--n-jobs', nargs="?", default=1, type=int, | |
help="Number of concurrently running workers for " | |
"models that support parallelism.") | |
parser.add_argument('--order', nargs="?", default="C", type=str, | |
choices=["F", "C"], | |
help="Allow to choose between fortran and C ordered " | |
"data") | |
parser.add_argument('--random-seed', nargs="?", default=0, type=int, | |
help="Common seed used by random number generator.") | |
args = vars(parser.parse_args()) | |
print(__doc__) | |
X_train, X_test, y_train, y_test = load_data(order=args["order"]) | |
print("") | |
print("Dataset statistics:") | |
print("===================") | |
print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) | |
print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) | |
print("%s %s" % ("data type:".ljust(25), X_train.dtype)) | |
print("%s %d (size=%dMB)" % ("number of train samples:".ljust(25), | |
X_train.shape[0], int(X_train.nbytes / 1e6))) | |
print("%s %d (size=%dMB)" % ("number of test samples:".ljust(25), | |
X_test.shape[0], int(X_test.nbytes / 1e6))) | |
print() | |
print("Training Classifiers") | |
print("====================") | |
error, train_time, test_time, loss_curve, val_curve = {}, {}, {}, {}, {} | |
if 'all' in args['classifiers']: | |
args['classifiers'] = ESTIMATORS.keys() | |
for name in sorted(args["classifiers"]): | |
print("Training %s ... " % name, end="") | |
estimator = ESTIMATORS[name] | |
estimator_params = estimator.get_params() | |
estimator.set_params(**{p: args["random_seed"] | |
for p in estimator_params | |
if p.endswith("random_state")}) | |
if "n_jobs" in estimator_params: | |
estimator.set_params(n_jobs=args["n_jobs"]) | |
time_start = time() | |
estimator.fit(X_train, y_train) | |
train_time[name] = time() - time_start | |
time_start = time() | |
y_pred = estimator.predict(X_test) | |
test_time[name] = time() - time_start | |
error[name] = zero_one_loss(y_test, y_pred) | |
loss_curve[name] = estimator.loss_curve_ | |
val_curve[name] = getattr(estimator, 'validation_scores_', []) | |
print("done") | |
print() | |
print("Classification performance:") | |
print("===========================") | |
print("{0: <39} {1: >10} {2: >11} {3: >12}" | |
"".format("Classifier ", "train-time", "test-time", "error-rate")) | |
print("-" * 76) | |
for name in sorted(args["classifiers"], key=error.get): | |
print("{0: <40} {1: >10.2f}s {2: >10.2f}s {3: >12.4f}" | |
"".format(name, train_time[name], test_time[name], error[name])) | |
print() | |
with open('loss_history_mnist.pkl', 'wb') as f: | |
pickle.dump(loss_curve, f) | |
with open('val_loss_history_mnist.pkl', 'wb') as f: | |
pickle.dump(val_curve, f) | |
make_plots(loss_curve, val_curve) |
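The joblib Memory decorator in the MNIST script caches the fetched, normalized arrays on disk and memory-maps them read-only on later runs, so repeated benchmark invocations skip the download and preprocessing. The pattern in isolation (the cache directory and function here are illustrative, not from the script):

import numpy as np
from sklearn.externals.joblib import Memory

memory = Memory('/tmp/toy_cache', mmap_mode='r')

@memory.cache
def expensive_load(n):
    return np.arange(n, dtype=np.float64) ** 2

a = expensive_load(1000)  # computed and written to the cache
b = expensive_load(1000)  # reloaded as a read-only memmap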
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier

h = .02  # step size in the mesh

ESTIMATORS = {
    'SGD_constant_no_momentum': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1,
        tol=1e-4, random_state=1, nesterovs_momentum=False),
    'SGD_constant_no_momentum_early': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1,
        tol=1e-4, random_state=1, nesterovs_momentum=False,
        early_stopping=True),
    'SGD_constant_momentum': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0.9, verbose=1,
        tol=1e-4, random_state=1, nesterovs_momentum=False),
    'SGD_constant_nesterov': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0.9,
        nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1),
    'SGD_constant_nesterov_early': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0.9,
        nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1,
        early_stopping=True),
    'SGD_invscaling_nesterov': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0.9,
        nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1,
        learning_rate='invscaling'),
    'SGD_invscaling_nesterov_early': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0.9,
        nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1,
        learning_rate='invscaling', early_stopping=True),
    'SGD_adaptive_nesterov': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0.9,
        nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1,
        learning_rate='adaptive'),
    'SGD_adaptive_nesterov_early': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0.9,
        nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1,
        learning_rate='adaptive', early_stopping=True),
    'Adam': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='adam', learning_rate_init=0.001, verbose=1,
        tol=1e-4, random_state=1),
    'Adam_early': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='adam', learning_rate_init=0.001, verbose=1,
        tol=1e-4, random_state=1, early_stopping=True),
}

names = ESTIMATORS.keys()
classifiers = ESTIMATORS.values()

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds in datasets:
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
               alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    cnt = 0
    for name, clf in zip(names, classifiers):
        cnt += 1
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to
        # each point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        # stagger the titles so neighbouring subplot titles do not overlap
        if cnt % 2 == 0:
            ax.set_title(name, fontsize=10, y=1.08)
        else:
            ax.set_title(name, fontsize=10)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

figure.subplots_adjust(left=.02, right=.98)
plt.show()
""" | |
Classification performance: | |
=========================== | |
Classifier train-time test-time Hamming Loss | |
------------------------------------------------------------------- | |
MLP_SGD_constant_no_momentum 5535.1194s 13.0709s 0.0136 | |
MLP_SGD_adaptive_nesterov_early 1295.6843s 11.7958s 0.0139 | |
MLP_SGD_constant_nesterov_early 860.3760s 11.6086s 0.0139 | |
MLP_Adam_early 570.8964s 11.9987s 0.0141 | |
MLP_SGD_adaptive_nesterov 5878.9259s 12.5683s 0.0144 | |
MLP_Adam 2589.0170s 14.1955s 0.0145 | |
MLP_SGD_constant_momentum 2408.1225s 11.9756s 0.0145 | |
MLP_SGD_constant_nesterov 4265.0237s 11.8534s 0.0145 | |
MLP_SGD_invscaling_nesterov 751.3171s 11.2348s 0.0315 | |
MLP_SGD_constant_no_momentum_early 50.1751s 11.1671s 0.0320 | |
MLP_SGD_invscaling_nesterov_early 70.1446s 11.3409s 0.0320 | |
""" | |
from __future__ import print_function, division | |
from time import time | |
import cPickle as pickle | |
import argparse | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from sklearn.datasets import fetch_rcv1 | |
from sklearn.metrics import hamming_loss | |
from sklearn.utils.validation import check_array | |
from sklearn.neural_network import MLPClassifier | |
def make_plots(loss, val_loss): | |
non_early = [name for name in loss if not name.endswith('_early')] | |
early = [name for name in loss if name.endswith('_early')] | |
fig, axes = plt.subplots(2, 3, figsize=(15, 10)) | |
make_sub_plot({name: loss[name] for name in non_early}, axes.ravel()[0]) | |
for name, ax in zip(early, axes.ravel()[1:]): | |
make_sub_plot({name[:-6]: loss[name[:-6]], name: loss[name], | |
name + '_val': val_loss[name]}, ax) | |
plt.subplots_adjust(hspace=0.45) | |
plt.subplots_adjust(top=0.8) | |
plt.show() | |
def make_sub_plot(loss, ax): | |
plot_args = [{'c': 'red', 'linestyle': '-'}, | |
{'c': 'green', 'linestyle': '-'}, | |
{'c': 'blue', 'linestyle': '-'}, | |
{'c': 'red', 'linestyle': '--'}, | |
{'c': 'green', 'linestyle': '--'}, | |
{'c': 'blue', 'linestyle': '--'}] | |
for label, loss_curve, args in zip(loss.keys(), loss.values(), plot_args): | |
ax.plot(loss_curve, label=label, **args) | |
if len(loss) > 3: | |
ax.legend(ax.get_lines(), labels=loss.keys(), loc='center right', | |
bbox_to_anchor=(0.95, 1.30), fontsize=11) | |
else: | |
ax.legend(ax.get_lines(), labels=loss.keys(), loc='center right', | |
bbox_to_anchor=(1.05, 1.20), fontsize=11) | |
ESTIMATORS = { | |
'MLP_SGD_constant_no_momentum': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False), | |
'MLP_SGD_constant_no_momentum_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False, | |
early_stopping=True), | |
'MLP_SGD_constant_momentum': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False), | |
'MLP_SGD_constant_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1), | |
'MLP_SGD_constant_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
early_stopping=True), | |
'MLP_SGD_invscaling_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='invscaling'), | |
'MLP_SGD_invscaling_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='invscaling', early_stopping=True), | |
'MLP_SGD_adaptive_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='adaptive'), | |
'MLP_SGD_adaptive_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='adaptive', early_stopping=True), | |
'MLP_Adam': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='adam', learning_rate_init=0.001, verbose=1, | |
tol=1e-4, random_state=1), | |
'MLP_Adam_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='adam', learning_rate_init=0.001, verbose=1, | |
tol=1e-4, random_state=1, early_stopping=True), | |
} | |
############################################################################### | |
# Data | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('-e', '--estimators', nargs="+", required=True, | |
choices=ESTIMATORS.keys() + ['all']) | |
args = vars(parser.parse_args()) | |
data_train = fetch_rcv1(subset="train", shuffle=True, random_state=1) | |
data_test = fetch_rcv1(subset="test", shuffle=True, random_state=1) | |
X_train = check_array(data_train.data, dtype=np.float32, | |
accept_sparse="csr") | |
X_test = check_array(data_test.data, dtype=np.float32, accept_sparse="csr") | |
y_train = data_train.target | |
y_test = data_test.target | |
print("rcv1") | |
print("=============") | |
print("X_train.shape = {0}".format(X_train.shape)) | |
print("X_train.format = {0}".format(X_train.format)) | |
print("X_train.dtype = {0}".format(X_train.dtype)) | |
print("X_train density = {0}" | |
"".format(X_train.nnz / np.product(X_train.shape))) | |
print("y_train {0}".format(y_train.shape)) | |
print("X_test {0}".format(X_test.shape)) | |
print("X_test.format = {0}".format(X_test.format)) | |
print("X_test.dtype = {0}".format(X_test.dtype)) | |
print("y_test {0}".format(y_test.shape)) | |
print() | |
print("Classifier Training") | |
print("===================") | |
if 'all' in args['estimators']: | |
args['estimators'] = ESTIMATORS.keys() | |
hmg_loss, train_time, test_time, loss_curve, val_curve = {}, {}, {}, {}, {} | |
for name in sorted(args["estimators"]): | |
clf = ESTIMATORS[name] | |
try: | |
clf.set_params(random_state=0) | |
except (TypeError, ValueError): | |
pass | |
print("Training %s ... " % name, end="") | |
t0 = time() | |
clf.fit(X_train, y_train) | |
train_time[name] = time() - t0 | |
t0 = time() | |
y_pred = clf.predict(X_test) | |
test_time[name] = time() - t0 | |
hmg_loss[name] = hamming_loss(y_test, y_pred) | |
loss_curve[name] = clf.loss_curve_ | |
val_curve[name] = getattr(clf, 'validation_scores_', []) | |
print("done") | |
print() | |
print("Classification performance:") | |
print("===========================") | |
print() | |
print("%s %s %s %s" % ("Classifier ", "train-time", "test-time", | |
"Hamming Loss")) | |
print("-" * 67) | |
for name in sorted(hmg_loss, key=hmg_loss.get): | |
print("%s %s %s %s" % (name.ljust(36), | |
("%.4fs" % train_time[name]).center(10), | |
("%.4fs" % test_time[name]).center(10), | |
("%.4f" % hmg_loss[name]).center(10))) | |
print() | |
with open('loss_history_rcv1.pkl', 'wb') as f: | |
pickle.dump(loss_curve, f) | |
with open('val_loss_history_rcv1.pkl', 'wb') as f: | |
pickle.dump(val_curve, f) | |
make_plots(loss_curve, val_curve) |
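Unlike the other classification benchmarks, RCV1 is multilabel (each document can carry several of the 103 topic codes), so this script scores with Hamming loss: the fraction of individual label bits predicted wrongly. A toy check on 2x3 indicator matrices:

import numpy as np
from sklearn.metrics import hamming_loss

y_true = np.array([[1, 0, 1],
                   [0, 1, 0]])
y_pred = np.array([[1, 0, 0],
                   [0, 1, 1]])
assert np.isclose(hamming_loss(y_true, y_pred), 2 / 6.0)  # 2 wrong bits of 6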
""" | |
mean std | |
Sparse: 0.0586632259687 0.00355379776739 | |
np.dot: 0.0561765789986 0.00206648457631 | |
dummy: 0.0553110162417 0.00247213620297 | |
""" | |
from __future__ import print_function | |
import sys | |
from scipy.sparse import issparse | |
from sklearn.utils.extmath import safe_sparse_dot | |
import numpy as np | |
import time | |
def dummy_dot(a, b): | |
if issparse(a) or issparse(b): | |
raise ValueError | |
else: | |
return np.dot(a, b) | |
def compare(): | |
tests = [(np.random.rand(1000,10000), np.random.rand(10000)) for i in range(10)] | |
start = time.time() | |
for a, b in tests: | |
safe_sparse_dot(a, b) | |
elapsed_sparse = time.time() - start | |
start = time.time() | |
for a, b in tests: | |
np.dot(a, b) | |
elapsed_npdot = time.time() - start | |
start = time.time() | |
for a, b in tests: | |
dummy_dot(a, b) | |
elapsed_dummy = time.time() - start | |
return elapsed_sparse, elapsed_npdot, elapsed_dummy | |
def main(): | |
times = [] | |
n = 300 | |
for i in range(n): | |
times.append(compare()) | |
sys.stdout.write('\rFinished {} out of {}'.format(i+1, n)) | |
sys.stdout.flush() | |
times_sparse, times_npdot, times_dummy = map(np.array, zip(*times)) | |
avg_sparse, std_sparse = times_sparse.mean(), times_sparse.std() | |
avg_npdot, std_npdot = times_npdot.mean(), times_npdot.std() | |
avg_dummy, std_dummy = times_dummy.mean(), times_dummy.std() | |
print() | |
print(" mean std") | |
print("Sparse:", avg_sparse, std_sparse) | |
print("np.dot:", avg_npdot, std_npdot) | |
print("dummy: ", avg_dummy, std_dummy) | |
if __name__ == '__main__': | |
main() |
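The numbers in the docstring suggest safe_sparse_dot's dense path costs only a few percent over np.dot (0.0587s versus 0.0562s per batch of ten products), i.e. the issparse dispatch is cheap. What the wrapper buys is transparent handling of sparse operands; a small illustration:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.utils.extmath import safe_sparse_dot

a = csr_matrix(np.eye(3))
b = np.arange(3.0)
print(safe_sparse_dot(a, b))  # [ 0.  1.  2.] without densifying a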