Benchmarks for learning rate updating schemes in MLP
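The scripts below benchmark scikit-learn's MLPClassifier and MLPRegressor under different weight-update schemes: plain SGD with a constant learning rate (with and without classical or Nesterov momentum), inverse-scaling and adaptive learning-rate schedules, Adam, and batch l-bfgs, each with and without early stopping. For orientation, here is a minimal sketch of the configurations being compared. It mirrors the code in this gist, which targets a scikit-learn development branch where the optimizer is chosen via an "algorithm" parameter (released versions, 0.18 onward, call the same parameter "solver"); the names "common" and "schemes" are illustrative only.

# Minimal sketch of the update schemes compared in this gist (not part of
# the benchmark scripts themselves). On released scikit-learn, replace
# algorithm= with solver=.
from sklearn.neural_network import MLPClassifier

common = dict(hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
              tol=1e-4, random_state=1)
schemes = {
    # constant step size with Nesterov momentum
    'sgd_constant': MLPClassifier(algorithm='sgd', learning_rate_init=0.01,
                                  momentum=0.9, nesterovs_momentum=True,
                                  **common),
    # step size decays as learning_rate_init / t**power_t (power_t=0.5)
    'sgd_invscaling': MLPClassifier(algorithm='sgd', learning_rate_init=0.01,
                                    momentum=0.9, learning_rate='invscaling',
                                    **common),
    # step size divided by 5 whenever the training loss stops improving
    'sgd_adaptive': MLPClassifier(algorithm='sgd', learning_rate_init=0.01,
                                  momentum=0.9, learning_rate='adaptive',
                                  **common),
    # per-parameter step sizes from running estimates of the gradient's
    # first and second moments
    'adam': MLPClassifier(algorithm='adam', learning_rate_init=0.001,
                          **common),
}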
import numpy as np
from time import time
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier

h = .02  # step size in the mesh

ESTIMATORS = {
    'Adam': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='adam', learning_rate_init=0.001, verbose=1,
        tol=1e-4, random_state=1),
    'Adam_early': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='adam', learning_rate_init=0.001, verbose=1,
        tol=1e-4, random_state=1, early_stopping=True),
    'l-bfgs': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='l-bfgs', learning_rate_init=0.01, verbose=1,
        tol=1e-4, random_state=1, early_stopping=False),
}

names = ESTIMATORS.keys()
classifiers = ESTIMATORS.values()


def make_datasets(n_samples=100):
    X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                               random_state=1, n_clusters_per_class=1,
                               n_samples=n_samples)
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)
    datasets = [make_moons(noise=0.3, random_state=0, n_samples=n_samples),
                make_circles(noise=0.2, factor=0.5, random_state=1,
                             n_samples=n_samples),
                linearly_separable]
    return datasets


figure = plt.figure(figsize=(27, 9))
i = 0
# iterate over datasets of increasing size
sample_sizes = range(100, 1000, 400)
datasets = []
for n_samples in sample_sizes:
    datasets += make_datasets(n_samples)
for j, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(classifiers) + 1, len(datasets),
                     i % (len(classifiers) + 1) * len(datasets) + j + 1)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
               alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(str(len(y)), fontsize=10)  # dataset size
    i += 1

    # iterate over classifiers
    cnt = 0
    for name, clf in zip(names, classifiers):
        cnt += 1
        ax = plt.subplot(len(classifiers) + 1, len(datasets),
                         i % (len(classifiers) + 1) * len(datasets) + j + 1)
        time_start = time()
        clf.fit(X_train, y_train)
        train_time = time() - time_start
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to
        # each point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name, fontsize=10)
        # annotate with test accuracy (lower right) and training time in
        # seconds (lower left)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        ax.text(xx.min() + .3, yy.min() + .3,
                ('%.3f' % train_time).lstrip('0'),
                size=15, horizontalalignment='left')
        i += 1

figure.subplots_adjust(left=.02, right=.98)
plt.show()
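A note on the plotting idiom used above: each decision surface is rasterized by evaluating the classifier on a dense grid of points and reshaping the predictions back onto that grid. A self-contained sketch of just that round trip, with a simple threshold function standing in for a fitted classifier:

import numpy as np

h = .02
xx, yy = np.meshgrid(np.arange(-3, 3, h), np.arange(-3, 3, h))
grid = np.c_[xx.ravel(), yy.ravel()]  # (n_points, 2) query points
# stand-in for clf.predict_proba(grid)[:, 1] from a fitted classifier
z = (grid[:, 0] + grid[:, 1] > 0).astype(float)
z = z.reshape(xx.shape)  # back onto the grid, ready for contourf(xx, yy, z)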
""" | |
Benchmarking adam and lbfgs on Boston dataset | |
Regression performance: | |
=========================== | |
Regressor train-time test-time test-score | |
---------------------------------------------------------------------------- | |
adam 0.3896s 0.0003s 0.8606 | |
l-bfgs 0.5861s 0.0003s 0.8689 | |
adam-early 0.6177s 0.0004s 0.8750 | |
""" | |
from __future__ import print_function | |
import numpy as np | |
from time import time | |
import argparse | |
from sklearn import datasets | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.cross_validation import train_test_split | |
from sklearn.neural_network import MLPRegressor | |
# import some data to play with | |
def load_data(): | |
dataset = datasets.load_boston() | |
X = dataset.data # we only take the first two features. | |
X = StandardScaler().fit_transform(X) | |
y = dataset.target | |
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, | |
random_state=1) | |
scaler = StandardScaler() | |
train_X = scaler.fit_transform(train_X) | |
test_X = scaler.transform(test_X) | |
return train_X, test_X, train_y, test_y | |
ESTIMATORS = {'adam': MLPRegressor(random_state=1, | |
hidden_layer_sizes=(100, 100)), | |
'adam-early': MLPRegressor(random_state=1, early_stopping=True, | |
hidden_layer_sizes=(100, 100)), | |
'l-bfgs': MLPRegressor(algorithm='l-bfgs', random_state=1, | |
hidden_layer_sizes=(100, 100))} | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--estimators', nargs="+", | |
choices=ESTIMATORS.keys() + ['all'], type=str, | |
default=['adam', 'adam-early', 'l-bfgs'], | |
help="list of classifiers to benchmark.") | |
parser.add_argument('--n-jobs', nargs="?", default=1, type=int, | |
help="Number of concurrently running workers for " | |
"models that support parallelism.") | |
parser.add_argument('--random-seed', nargs="?", default=0, type=int, | |
help="Common seed used by random number generator.") | |
args = vars(parser.parse_args()) | |
print(__doc__) | |
X_train, X_test, y_train, y_test = load_data() | |
print("") | |
print("Dataset statistics:") | |
print("===================") | |
print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) | |
print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) | |
print("%s %s" % ("data type:".ljust(25), X_train.dtype)) | |
print("%s %d (size=%dMB)" % ("number of train samples:".ljust(25), | |
X_train.shape[0], int(X_train.nbytes / 1e6))) | |
print("%s %d (size=%dMB)" % ("number of test samples:".ljust(25), | |
X_test.shape[0], int(X_test.nbytes / 1e6))) | |
print() | |
print("Training Estimators") | |
print("====================") | |
error, train_time, test_time = {}, {}, {} | |
if 'all' in args['estimators']: | |
args['estimators'] = ESTIMATORS.keys() | |
for name in sorted(args["estimators"]): | |
print("Training %s ... " % name, end="") | |
estimator = ESTIMATORS[name] | |
estimator_params = estimator.get_params() | |
estimator.set_params(**{p: args["random_seed"] | |
for p in estimator_params | |
if p.endswith("random_state")}) | |
if "n_jobs" in estimator_params: | |
estimator.set_params(n_jobs=args["n_jobs"]) | |
time_start = time() | |
estimator.fit(X_train, y_train) | |
train_time[name] = time() - time_start | |
time_start = time() | |
y_pred = estimator.predict(X_test) | |
test_time[name] = time() - time_start | |
error[name] = estimator.score(X_test, y_test) | |
print("done") | |
print() | |
print("Regression performance:") | |
print("===========================") | |
print("{0: <23} {1: >10} {2: >11} {3: >12}" | |
"".format("Regressor ", "train-time", "test-time", | |
"test-score")) | |
print("-" * 76) | |
for name in sorted(args["estimators"], key=error.get): | |
print("{0: <24} {1: >10.4f}s {2: >10.4f}s {3: >12.4f}" | |
"".format(name, train_time[name], test_time[name], error[name])) | |
print() |
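The test-score column reported by this script is MLPRegressor.score, i.e. the coefficient of determination R^2. A small self-check of that identity, with made-up numbers:

import numpy as np
from sklearn.metrics import r2_score

y_true = np.array([3.0, -0.5, 2.0, 7.0])
y_pred = np.array([2.5, 0.0, 2.0, 8.0])
# R^2 = 1 - sum((y - yhat)**2) / sum((y - mean(y))**2)
ss_res = ((y_true - y_pred) ** 2).sum()
ss_tot = ((y_true - y_true.mean()) ** 2).sum()
assert np.isclose(1 - ss_res / ss_tot, r2_score(y_true, y_pred))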
""" | |
Benchmarking adam and lbfgs on Diabetes dataset | |
Regression performance: | |
=========================== | |
Regressor train-time test-time test-score | |
---------------------------------------------------------------------------- | |
adam-early 0.3612s 0.0002s 0.2961 | |
adam 0.4856s 0.0003s 0.3538 | |
l-bfgs 0.4855s 0.0003s 0.4170 | |
""" | |
from __future__ import print_function | |
import numpy as np | |
from time import time | |
import argparse | |
from sklearn import datasets | |
from sklearn.cross_validation import train_test_split | |
from sklearn.neural_network import MLPRegressor | |
# import some data to play with | |
def load_data(): | |
iris = datasets.load_diabetes() | |
X = iris.data # we only take the first two features. | |
y = iris.target | |
return train_test_split(X, y, test_size=0.2, random_state=1) | |
ESTIMATORS = {'adam': MLPRegressor(random_state=1, | |
hidden_layer_sizes=(100, 100)), | |
'adam-early': MLPRegressor(random_state=1, early_stopping=True, | |
hidden_layer_sizes=(100, 100)), | |
'l-bfgs': MLPRegressor(algorithm='l-bfgs', random_state=1, | |
hidden_layer_sizes=(100, 100))} | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--estimators', nargs="+", | |
choices=ESTIMATORS.keys() + ['all'], type=str, | |
default=['adam', 'adam-early', 'l-bfgs'], | |
help="list of classifiers to benchmark.") | |
parser.add_argument('--n-jobs', nargs="?", default=1, type=int, | |
help="Number of concurrently running workers for " | |
"models that support parallelism.") | |
parser.add_argument('--random-seed', nargs="?", default=0, type=int, | |
help="Common seed used by random number generator.") | |
args = vars(parser.parse_args()) | |
print(__doc__) | |
X_train, X_test, y_train, y_test = load_data() | |
print("") | |
print("Dataset statistics:") | |
print("===================") | |
print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) | |
print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) | |
print("%s %s" % ("data type:".ljust(25), X_train.dtype)) | |
print("%s %d (size=%dMB)" % ("number of train samples:".ljust(25), | |
X_train.shape[0], int(X_train.nbytes / 1e6))) | |
print("%s %d (size=%dMB)" % ("number of test samples:".ljust(25), | |
X_test.shape[0], int(X_test.nbytes / 1e6))) | |
print() | |
print("Training Estimators") | |
print("====================") | |
error, train_time, test_time = {}, {}, {} | |
if 'all' in args['estimators']: | |
args['estimators'] = ESTIMATORS.keys() | |
for name in sorted(args["estimators"]): | |
print("Training %s ... " % name, end="") | |
estimator = ESTIMATORS[name] | |
estimator_params = estimator.get_params() | |
estimator.set_params(**{p: args["random_seed"] | |
for p in estimator_params | |
if p.endswith("random_state")}) | |
if "n_jobs" in estimator_params: | |
estimator.set_params(n_jobs=args["n_jobs"]) | |
time_start = time() | |
estimator.fit(X_train, y_train) | |
train_time[name] = time() - time_start | |
time_start = time() | |
y_pred = estimator.predict(X_test) | |
test_time[name] = time() - time_start | |
error[name] = estimator.score(X_test, y_test) | |
print("done") | |
print() | |
print("Regression performance:") | |
print("===========================") | |
print("{0: <23} {1: >10} {2: >11} {3: >12}" | |
"".format("Regressor ", "train-time", "test-time", | |
"test-score")) | |
print("-" * 76) | |
for name in sorted(args["estimators"], key=error.get): | |
print("{0: <24} {1: >10.4f}s {2: >10.4f}s {3: >12.4f}" | |
"".format(name, train_time[name], test_time[name], error[name])) | |
print() |
""" | |
Classification performance: | |
=========================== | |
Classifier train-time test-time error-rate | |
---------------------------------------------------------------------------- | |
adam 1.1049s 0.0010s 0.0167 | |
l-bfgs 0.0910s 0.0008s 0.0306 | |
adam-early 0.1354s 0.0009s 0.0528 | |
""" | |
from __future__ import print_function | |
import numpy as np | |
from time import time | |
import argparse | |
from sklearn import datasets | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.metrics import zero_one_loss | |
from sklearn.cross_validation import train_test_split | |
from sklearn.neural_network import MLPClassifier | |
def load_data(): | |
dataset = datasets.load_digits() | |
X = dataset.data # we only take the first two features. | |
X = StandardScaler().fit_transform(X) | |
y = dataset.target | |
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, | |
random_state=1) | |
scaler = StandardScaler() | |
train_X = scaler.fit_transform(train_X) | |
test_X = scaler.transform(test_X) | |
return train_X, test_X, train_y, test_y | |
ESTIMATORS = {'adam': MLPClassifier(random_state=1), | |
'adam-early': MLPClassifier(random_state=1, early_stopping=True), | |
'l-bfgs': MLPClassifier(algorithm='l-bfgs', random_state=1)} | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--classifiers', nargs="+", | |
choices=ESTIMATORS.keys() + ['all'], type=str, | |
default=['adam', 'adam-early', 'l-bfgs'], | |
help="list of classifiers to benchmark.") | |
parser.add_argument('--n-jobs', nargs="?", default=1, type=int, | |
help="Number of concurrently running workers for " | |
"models that support parallelism.") | |
parser.add_argument('--random-seed', nargs="?", default=0, type=int, | |
help="Common seed used by random number generator.") | |
args = vars(parser.parse_args()) | |
print(__doc__) | |
X_train, X_test, y_train, y_test = load_data() | |
print("") | |
print("Dataset statistics:") | |
print("===================") | |
print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) | |
print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) | |
print("%s %s" % ("data type:".ljust(25), X_train.dtype)) | |
print("%s %d (size=%dMB)" % ("number of train samples:".ljust(25), | |
X_train.shape[0], int(X_train.nbytes / 1e6))) | |
print("%s %d (size=%dMB)" % ("number of test samples:".ljust(25), | |
X_test.shape[0], int(X_test.nbytes / 1e6))) | |
print() | |
print("Training Classifiers") | |
print("====================") | |
error, train_time, test_time, loss_curve, val_curve = {}, {}, {}, {}, {} | |
if 'all' in args['classifiers']: | |
args['classifiers'] = ESTIMATORS.keys() | |
for name in sorted(args["classifiers"]): | |
print("Training %s ... " % name, end="") | |
estimator = ESTIMATORS[name] | |
estimator_params = estimator.get_params() | |
estimator.set_params(**{p: args["random_seed"] | |
for p in estimator_params | |
if p.endswith("random_state")}) | |
if "n_jobs" in estimator_params: | |
estimator.set_params(n_jobs=args["n_jobs"]) | |
time_start = time() | |
estimator.fit(X_train, y_train) | |
train_time[name] = time() - time_start | |
time_start = time() | |
y_pred = estimator.predict(X_test) | |
test_time[name] = time() - time_start | |
error[name] = zero_one_loss(y_test, y_pred) | |
print("done") | |
print() | |
print("Classification performance:") | |
print("===========================") | |
print("{0: <23} {1: >10} {2: >11} {3: >12}" | |
"".format("Classifier ", "train-time", "test-time", "error-rate")) | |
print("-" * 76) | |
for name in sorted(args["classifiers"], key=error.get): | |
print("{0: <24} {1: >10.4f}s {2: >10.4f}s {3: >12.4f}" | |
"".format(name, train_time[name], test_time[name], error[name])) | |
print() |
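The error-rate column here is zero_one_loss with its default normalize=True, which is simply one minus accuracy. A toy check with made-up labels:

import numpy as np
from sklearn.metrics import zero_one_loss, accuracy_score

y_true = [0, 1, 2, 2, 1]
y_pred = [0, 2, 2, 2, 1]
assert np.isclose(zero_one_loss(y_true, y_pred),
                  1 - accuracy_score(y_true, y_pred))  # both 0.2 here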
""" | |
Classification performance: | |
=========================== | |
Classifier train-time test-time error-rate | |
---------------------------------------------------------------------------- | |
adam 0.1003s 0.0002s 0.0333 | |
l-bfgs 0.0344s 0.0001s 0.0333 | |
adam-early 0.0083s 0.0001s 0.5333 | |
""" | |
from __future__ import print_function | |
import numpy as np | |
from time import time | |
import argparse | |
from sklearn import datasets | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.metrics import zero_one_loss | |
from sklearn.cross_validation import train_test_split | |
from sklearn.neural_network import MLPClassifier | |
def load_data(): | |
dataset = datasets.load_iris() | |
X = dataset.data # we only take the first two features. | |
X = StandardScaler().fit_transform(X) | |
y = dataset.target | |
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, | |
random_state=1) | |
scaler = StandardScaler() | |
train_X = scaler.fit_transform(train_X) | |
test_X = scaler.transform(test_X) | |
return train_X, test_X, train_y, test_y | |
ESTIMATORS = {'adam': MLPClassifier(random_state=1), | |
'adam-early': MLPClassifier(random_state=1, early_stopping=True), | |
'l-bfgs': MLPClassifier(algorithm='l-bfgs', random_state=1)} | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--classifiers', nargs="+", | |
choices=ESTIMATORS.keys() + ['all'], type=str, | |
default=['adam', 'adam-early', 'l-bfgs'], | |
help="list of classifiers to benchmark.") | |
parser.add_argument('--n-jobs', nargs="?", default=1, type=int, | |
help="Number of concurrently running workers for " | |
"models that support parallelism.") | |
parser.add_argument('--random-seed', nargs="?", default=0, type=int, | |
help="Common seed used by random number generator.") | |
args = vars(parser.parse_args()) | |
print(__doc__) | |
X_train, X_test, y_train, y_test = load_data() | |
print("") | |
print("Dataset statistics:") | |
print("===================") | |
print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) | |
print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) | |
print("%s %s" % ("data type:".ljust(25), X_train.dtype)) | |
print("%s %d (size=%dMB)" % ("number of train samples:".ljust(25), | |
X_train.shape[0], int(X_train.nbytes / 1e6))) | |
print("%s %d (size=%dMB)" % ("number of test samples:".ljust(25), | |
X_test.shape[0], int(X_test.nbytes / 1e6))) | |
print() | |
print("Training Classifiers") | |
print("====================") | |
error, train_time, test_time, loss_curve, val_curve = {}, {}, {}, {}, {} | |
if 'all' in args['classifiers']: | |
args['classifiers'] = ESTIMATORS.keys() | |
for name in sorted(args["classifiers"]): | |
print("Training %s ... " % name, end="") | |
estimator = ESTIMATORS[name] | |
estimator_params = estimator.get_params() | |
estimator.set_params(**{p: args["random_seed"] | |
for p in estimator_params | |
if p.endswith("random_state")}) | |
if "n_jobs" in estimator_params: | |
estimator.set_params(n_jobs=args["n_jobs"]) | |
time_start = time() | |
estimator.fit(X_train, y_train) | |
train_time[name] = time() - time_start | |
time_start = time() | |
y_pred = estimator.predict(X_test) | |
test_time[name] = time() - time_start | |
error[name] = zero_one_loss(y_test, y_pred) | |
print("done") | |
print() | |
print("Classification performance:") | |
print("===========================") | |
print("{0: <23} {1: >10} {2: >11} {3: >12}" | |
"".format("Classifier ", "train-time", "test-time", "error-rate")) | |
print("-" * 76) | |
for name in sorted(args["classifiers"], key=error.get): | |
print("{0: <24} {1: >10.4f}s {2: >10.4f}s {3: >12.4f}" | |
"".format(name, train_time[name], test_time[name], error[name])) | |
print() |
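The outsized adam-early error above (0.5333 versus 0.0333 without early stopping) is plausibly an artifact of dataset size rather than of Adam itself: early_stopping=True holds out a fraction of the training data (validation_fraction, 0.1 by default in released scikit-learn) and stops when the held-out score stalls, and with only 120 iris training rows that validation set has roughly 12 samples, so its score is very noisy. A hypothetical mitigation, enlarging the holdout:

from sklearn.neural_network import MLPClassifier

# larger, less noisy validation holdout for early stopping
clf = MLPClassifier(random_state=1, early_stopping=True,
                    validation_fraction=0.2)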
""" | |
Benchmarking MLP Performances on 20NewGroup dataset | |
Classification performance: | |
=========================== | |
Classifier train-time test-time Accuracy | |
------------------------------------------------------------------- | |
MLP_SGD_constant_no_momentum_early 61.9779s 0.1686s 0.0333 | |
MLP_SGD_invscaling_nesterov 143.6211s 0.1713s 0.0506 | |
MLP_SGD_invscaling_nesterov_early 212.4194s 0.1691s 0.0769 | |
MLP_SGD_constant_no_momentum 6882.5231s 0.1800s 0.5842 | |
MLP_SGD_constant_nesterov_early 1871.2481s 0.1793s 0.7302 | |
MLP_SGD_adaptive_nesterov_early 2395.0412s 0.1822s 0.7382 | |
MLP_SGD_constant_nesterov 5725.5833s 0.1799s 0.7649 | |
MLP_SGD_adaptive_nesterov 6263.0604s 0.1733s 0.7678 | |
MLP_SGD_constant_momentum 4174.8977s 0.1666s 0.7678 | |
MLP_Adam 1395.2267s 0.1822s 0.8314 | |
MLP_Adam_early 528.2558s 0.1718s 0.8330 | |
with learning_rate_init=0.1 for sgd: | |
Classification performance: | |
=========================== | |
Classifier train-time test-time Accuracy | |
------------------------------------------------------------------- | |
MLP_SGD_invscaling_nesterov_early 96.1587s 0.1757s 0.1032 | |
MLP_SGD_invscaling_nesterov 11964.9086s 0.2026s 0.1374 | |
MLP_SGD_constant_no_momentum_early 326.6792s 0.2224s 0.1620 | |
MLP_SGD_constant_nesterov_early 407.4799s 0.2217s 0.6190 | |
MLP_SGD_adaptive_nesterov_early 2152.5739s 0.2308s 0.7338 | |
MLP_SGD_constant_momentum 1169.3004s 0.2609s 0.7617 | |
MLP_SGD_constant_nesterov 1634.3541s 0.2659s 0.7681 | |
MLP_SGD_constant_no_momentum 5292.0535s 0.2432s 0.7747 | |
MLP_SGD_adaptive_nesterov 2166.2128s 0.2393s 0.7781 | |
MLP_Adam 1740.3218s 0.2305s 0.8314 | |
MLP_Adam_early 636.6075s 0.1822s 0.8330 | |
""" | |
from __future__ import print_function, division | |
from time import time | |
import cPickle as pickle | |
import argparse | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from sklearn.datasets import fetch_20newsgroups_vectorized | |
from sklearn.metrics import accuracy_score | |
from sklearn.utils.validation import check_array | |
from sklearn.neural_network import MLPClassifier | |
def make_plots(loss, val_loss): | |
non_early = [name for name in loss if not name.endswith('_early')] | |
early = [name for name in loss if name.endswith('_early')] | |
fig, axes = plt.subplots(2, 3, figsize=(15, 10)) | |
make_sub_plot({name: loss[name] for name in non_early}, axes.ravel()[0]) | |
for name, ax in zip(early, axes.ravel()[1:]): | |
make_sub_plot({name[:-6]: loss[name[:-6]], name: loss[name], | |
name + '_val': val_loss[name]}, ax) | |
plt.subplots_adjust(hspace=0.45) | |
plt.subplots_adjust(top=0.8) | |
plt.show() | |
def make_sub_plot(loss, ax): | |
plot_args = [{'c': 'red', 'linestyle': '-'}, | |
{'c': 'green', 'linestyle': '-'}, | |
{'c': 'blue', 'linestyle': '-'}, | |
{'c': 'red', 'linestyle': '--'}, | |
{'c': 'green', 'linestyle': '--'}, | |
{'c': 'blue', 'linestyle': '--'}] | |
for label, loss_curve, args in zip(loss.keys(), loss.values(), plot_args): | |
ax.plot(loss_curve, label=label, **args) | |
if len(loss) > 3: | |
ax.legend(ax.get_lines(), labels=loss.keys(), loc='center right', | |
bbox_to_anchor=(0.95, 1.30), fontsize=11) | |
else: | |
ax.legend(ax.get_lines(), labels=loss.keys(), loc='center right', | |
bbox_to_anchor=(1.05, 1.20), fontsize=11) | |
ESTIMATORS = { | |
'MLP_SGD_constant_no_momentum': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False), | |
'MLP_SGD_constant_no_momentum_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False, | |
early_stopping=True), | |
'MLP_SGD_constant_momentum': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False), | |
'MLP_SGD_constant_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1), | |
'MLP_SGD_constant_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
early_stopping=True), | |
'MLP_SGD_invscaling_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='invscaling'), | |
'MLP_SGD_invscaling_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='invscaling', early_stopping=True), | |
'MLP_SGD_adaptive_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='adaptive'), | |
'MLP_SGD_adaptive_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='adaptive', early_stopping=True), | |
'MLP_Adam': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='adam', learning_rate_init=0.001, verbose=1, | |
tol=1e-4, random_state=1), | |
'MLP_Adam_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='adam', learning_rate_init=0.001, verbose=1, | |
tol=1e-4, random_state=1, early_stopping=True), | |
} | |
############################################################################### | |
# Data | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('-e', '--estimators', nargs="+", required=True, | |
choices=ESTIMATORS.keys() + ['all']) | |
args = vars(parser.parse_args()) | |
data_train = fetch_20newsgroups_vectorized(subset="train") | |
data_test = fetch_20newsgroups_vectorized(subset="test") | |
X_train = check_array(data_train.data, dtype=np.float32, | |
accept_sparse="csc") | |
X_test = check_array(data_test.data, dtype=np.float32, accept_sparse="csr") | |
y_train = data_train.target | |
y_test = data_test.target | |
print("20 newsgroups") | |
print("=============") | |
print("X_train.shape = {0}".format(X_train.shape)) | |
print("X_train.format = {0}".format(X_train.format)) | |
print("X_train.dtype = {0}".format(X_train.dtype)) | |
print("X_train density = {0}" | |
"".format(X_train.nnz / np.product(X_train.shape))) | |
print("y_train {0}".format(y_train.shape)) | |
print("X_test {0}".format(X_test.shape)) | |
print("X_test.format = {0}".format(X_test.format)) | |
print("X_test.dtype = {0}".format(X_test.dtype)) | |
print("y_test {0}".format(y_test.shape)) | |
print() | |
print("Classifier Training") | |
print("===================") | |
accuracy, train_time, test_time, loss_curve, val_curve = {}, {}, {}, {}, {} | |
if 'all' in args['estimators']: | |
args['estimators'] = ESTIMATORS.keys() | |
for name in sorted(args["estimators"]): | |
clf = ESTIMATORS[name] | |
try: | |
clf.set_params(random_state=0) | |
except (TypeError, ValueError): | |
pass | |
print("Training %s ... " % name, end="") | |
t0 = time() | |
clf.fit(X_train, y_train) | |
train_time[name] = time() - t0 | |
t0 = time() | |
y_pred = clf.predict(X_test) | |
test_time[name] = time() - t0 | |
accuracy[name] = accuracy_score(y_test, y_pred) | |
loss_curve[name] = clf.loss_curve_ | |
val_curve[name] = getattr(clf, 'validation_scores_', []) | |
print("done") | |
print() | |
print("Classification performance:") | |
print("===========================") | |
print() | |
print("%s %s %s %s" % ("Classifier ", "train-time", "test-time", | |
"Accuracy")) | |
print("-" * 67) | |
for name in sorted(accuracy, key=accuracy.get): | |
print("%s %s %s %s" % (name.ljust(36), | |
("%.4fs" % train_time[name]).center(10), | |
("%.4fs" % test_time[name]).center(10), | |
("%.4f" % accuracy[name]).center(10))) | |
print() | |
with open('loss_history_20news.pkl', 'wb') as f: | |
pickle.dump(loss_curve, f) | |
with open('val_loss_history_20news.pkl', 'wb') as f: | |
pickle.dump(val_curve, f) | |
make_plots(loss_curve, val_curve) |
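Because the 20 Newsgroups runs take hours, the script pickles the per-epoch loss_curve_ and validation_scores_ so convergence can be re-plotted later without retraining. A minimal sketch of reloading one of the dumps written above:

import pickle
import matplotlib.pyplot as plt

with open('loss_history_20news.pkl', 'rb') as f:
    loss_curve = pickle.load(f)
for name, curve in sorted(loss_curve.items()):
    plt.plot(curve, label=name)
plt.xlabel('epoch')
plt.ylabel('training loss')
plt.legend(fontsize=8)
plt.show()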
""" | |
Benchmarking MLP performance on MNIST dataset | |
Classification performance: | |
=========================== | |
Classifier train-time test-time error-rate | |
---------------------------------------------------------------------------- | |
MLP_SGD_constant_momentum 105.07s 0.10s 0.0205 | |
MLP_SGD_adaptive_nesterov 166.20s 0.10s 0.0213 | |
MLP_SGD_constant_nesterov 123.11s 0.11s 0.0219 | |
MLP_Adam 49.43s 0.26s 0.0224 | |
MLP_SGD_constant_no_momentum 532.17s 0.11s 0.0231 | |
MLP_Adam_early 19.61s 0.12s 0.0241 | |
MLP_SGD_adaptive_nesterov_early 57.51s 0.11s 0.0251 | |
MLP_SGD_constant_nesterov_early 29.66s 0.11s 0.0283 | |
MLP_SGD_constant_no_momentum_early 95.10s 0.11s 0.0388 | |
MLP_SGD_invscaling_nesterov 46.28s 0.14s 0.0785 | |
MLP_SGD_invscaling_nesterov_early 17.27s 0.12s 0.0817 | |
with learning_rate_init=0.1 for sgd: | |
Classification performance: | |
=========================== | |
Classifier train-time test-time error-rate | |
---------------------------------------------------------------------------- | |
MLP_SGD_constant_momentum 37.69s 0.13s 0.0170 | |
MLP_SGD_constant_nesterov 48.36s 0.13s 0.0171 | |
MLP_SGD_adaptive_nesterov 91.48s 0.13s 0.0171 | |
MLP_SGD_adaptive_nesterov_early 57.50s 0.12s 0.0197 | |
MLP_SGD_constant_no_momentum 112.04s 0.13s 0.0204 | |
MLP_SGD_constant_nesterov_early 19.50s 0.14s 0.0213 | |
MLP_Adam 55.58s 0.14s 0.0224 | |
MLP_SGD_constant_no_momentum_early 39.07s 0.13s 0.0229 | |
MLP_Adam_early 22.73s 0.13s 0.0241 | |
MLP_SGD_invscaling_nesterov 107.90s 0.15s 0.0304 | |
MLP_SGD_invscaling_nesterov_early 44.23s 0.16s 0.0345 | |
""" | |
from __future__ import print_function | |
import os | |
import cPickle as pickle | |
from time import time | |
import argparse | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from sklearn.datasets import fetch_mldata | |
from sklearn.datasets import get_data_home | |
from sklearn.externals.joblib import Memory | |
from sklearn.metrics import zero_one_loss | |
from sklearn.utils import check_array | |
from sklearn.neural_network import MLPClassifier | |
# Memoize the data extraction and memory map the resulting | |
# train / test splits in readonly mode | |
memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'), | |
mmap_mode='r') | |
@memory.cache | |
def load_data(dtype=np.float32, order='F'): | |
"""Load the data, then cache and memmap the train/test split""" | |
###################################################################### | |
# Load dataset | |
print("Loading dataset...") | |
data = fetch_mldata('MNIST original') | |
X = check_array(data['data'], dtype=dtype, order=order) | |
y = data["target"] | |
# Normalize features | |
X = X / 255 | |
# Create train-test split (as [Joachims, 2006]) | |
print("Creating train-test split...") | |
n_train = 60000 | |
X_train = X[:n_train] | |
y_train = y[:n_train] | |
X_test = X[n_train:] | |
y_test = y[n_train:] | |
return X_train, X_test, y_train, y_test | |
def make_plots(loss, val_loss): | |
non_early = [name for name in loss if not name.endswith('_early')] | |
early = [name for name in loss if name.endswith('_early')] | |
fig, axes = plt.subplots(2, 3, figsize=(15, 10)) | |
# not including MLP_SGD_constant_no_momentum because the number of | |
# iterations is too large | |
make_sub_plot({name: loss[name] for name in non_early | |
if name != 'MLP_SGD_constant_no_momentum'}, axes.ravel()[0]) | |
for name, ax in zip(early, axes.ravel()[1:]): | |
make_sub_plot({name[:-6]: loss[name[:-6]], name: loss[name], | |
name + '_val': val_loss[name]}, ax) | |
plt.subplots_adjust(hspace=0.45) | |
plt.subplots_adjust(top=0.8) | |
plt.show() | |
def make_sub_plot(loss, ax): | |
plot_args = [{'c': 'red', 'linestyle': '-'}, | |
{'c': 'green', 'linestyle': '-'}, | |
{'c': 'blue', 'linestyle': '-'}, | |
{'c': 'red', 'linestyle': '--'}, | |
{'c': 'green', 'linestyle': '--'}, | |
{'c': 'blue', 'linestyle': '--'}] | |
for label, loss_curve, args in zip(loss.keys(), loss.values(), plot_args): | |
ax.plot(loss_curve, label=label, **args) | |
if len(loss) > 3: | |
ax.legend(ax.get_lines(), labels=loss.keys(), loc='center right', | |
bbox_to_anchor=(0.95, 1.30), fontsize=11) | |
else: | |
ax.legend(ax.get_lines(), labels=loss.keys(), loc='center right', | |
bbox_to_anchor=(1.05, 1.20), fontsize=11) | |
ESTIMATORS = { | |
'MLP_SGD_constant_no_momentum': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False), | |
'MLP_SGD_constant_no_momentum_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False, | |
early_stopping=True), | |
'MLP_SGD_constant_momentum': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False), | |
'MLP_SGD_constant_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1), | |
'MLP_SGD_constant_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
early_stopping=True), | |
'MLP_SGD_invscaling_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='invscaling'), | |
'MLP_SGD_invscaling_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='invscaling', early_stopping=True), | |
'MLP_SGD_adaptive_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='adaptive'), | |
'MLP_SGD_adaptive_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='adaptive', early_stopping=True), | |
'MLP_Adam': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='adam', learning_rate_init=0.001, verbose=1, | |
tol=1e-4, random_state=1), | |
'MLP_Adam_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='adam', learning_rate_init=0.001, verbose=1, | |
tol=1e-4, random_state=1, early_stopping=True), | |
} | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--classifiers', nargs="+", | |
choices=ESTIMATORS.keys() + ['all'], type=str, | |
default=['MLP_SGD_constant_no_momentum', | |
'MLP_SGD_constant_momentum', | |
'MLP_SGD_constant_nesterov', | |
'MLP_Adam', 'MLP_Adam_early', | |
'MLP_SGD_constant_nesterov_early'], | |
help="list of classifiers to benchmark.") | |
parser.add_argument('--n-jobs', nargs="?", default=1, type=int, | |
help="Number of concurrently running workers for " | |
"models that support parallelism.") | |
parser.add_argument('--order', nargs="?", default="C", type=str, | |
choices=["F", "C"], | |
help="Allow to choose between fortran and C ordered " | |
"data") | |
parser.add_argument('--random-seed', nargs="?", default=0, type=int, | |
help="Common seed used by random number generator.") | |
args = vars(parser.parse_args()) | |
print(__doc__) | |
X_train, X_test, y_train, y_test = load_data(order=args["order"]) | |
print("") | |
print("Dataset statistics:") | |
print("===================") | |
print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) | |
print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) | |
print("%s %s" % ("data type:".ljust(25), X_train.dtype)) | |
print("%s %d (size=%dMB)" % ("number of train samples:".ljust(25), | |
X_train.shape[0], int(X_train.nbytes / 1e6))) | |
print("%s %d (size=%dMB)" % ("number of test samples:".ljust(25), | |
X_test.shape[0], int(X_test.nbytes / 1e6))) | |
print() | |
print("Training Classifiers") | |
print("====================") | |
error, train_time, test_time, loss_curve, val_curve = {}, {}, {}, {}, {} | |
if 'all' in args['classifiers']: | |
args['classifiers'] = ESTIMATORS.keys() | |
for name in sorted(args["classifiers"]): | |
print("Training %s ... " % name, end="") | |
estimator = ESTIMATORS[name] | |
estimator_params = estimator.get_params() | |
estimator.set_params(**{p: args["random_seed"] | |
for p in estimator_params | |
if p.endswith("random_state")}) | |
if "n_jobs" in estimator_params: | |
estimator.set_params(n_jobs=args["n_jobs"]) | |
time_start = time() | |
estimator.fit(X_train, y_train) | |
train_time[name] = time() - time_start | |
time_start = time() | |
y_pred = estimator.predict(X_test) | |
test_time[name] = time() - time_start | |
error[name] = zero_one_loss(y_test, y_pred) | |
loss_curve[name] = estimator.loss_curve_ | |
val_curve[name] = getattr(estimator, 'validation_scores_', []) | |
print("done") | |
print() | |
print("Classification performance:") | |
print("===========================") | |
print("{0: <39} {1: >10} {2: >11} {3: >12}" | |
"".format("Classifier ", "train-time", "test-time", "error-rate")) | |
print("-" * 76) | |
for name in sorted(args["classifiers"], key=error.get): | |
print("{0: <40} {1: >10.2f}s {2: >10.2f}s {3: >12.4f}" | |
"".format(name, train_time[name], test_time[name], error[name])) | |
print() | |
with open('loss_history_mnist.pkl', 'wb') as f: | |
pickle.dump(loss_curve, f) | |
with open('val_loss_history_mnist.pkl', 'wb') as f: | |
pickle.dump(val_curve, f) | |
make_plots(loss_curve, val_curve) |
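The joblib Memory decorator in the MNIST script caches the fetched, normalized arrays on disk and memory-maps them read-only on later runs, so repeated benchmark invocations skip the download and preprocessing. The pattern in isolation (the cache directory and function here are illustrative, not from the script):

import numpy as np
from sklearn.externals.joblib import Memory

memory = Memory('/tmp/toy_cache', mmap_mode='r')

@memory.cache
def expensive_load(n):
    return np.arange(n, dtype=np.float64) ** 2

a = expensive_load(1000)  # computed and written to the cache
b = expensive_load(1000)  # reloaded as a read-only memmap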
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier

h = .02  # step size in the mesh

ESTIMATORS = {
    'SGD_constant_no_momentum': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1,
        tol=1e-4, random_state=1, nesterovs_momentum=False),
    'SGD_constant_no_momentum_early': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1,
        tol=1e-4, random_state=1, nesterovs_momentum=False,
        early_stopping=True),
    'SGD_constant_momentum': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0.9, verbose=1,
        tol=1e-4, random_state=1, nesterovs_momentum=False),
    'SGD_constant_nesterov': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0.9,
        nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1),
    'SGD_constant_nesterov_early': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0.9,
        nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1,
        early_stopping=True),
    'SGD_invscaling_nesterov': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0.9,
        nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1,
        learning_rate='invscaling'),
    'SGD_invscaling_nesterov_early': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0.9,
        nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1,
        learning_rate='invscaling', early_stopping=True),
    'SGD_adaptive_nesterov': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0.9,
        nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1,
        learning_rate='adaptive'),
    'SGD_adaptive_nesterov_early': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='sgd', learning_rate_init=0.01, momentum=0.9,
        nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1,
        learning_rate='adaptive', early_stopping=True),
    'Adam': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='adam', learning_rate_init=0.001, verbose=1,
        tol=1e-4, random_state=1),
    'Adam_early': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        algorithm='adam', learning_rate_init=0.001, verbose=1,
        tol=1e-4, random_state=1, early_stopping=True),
}

names = ESTIMATORS.keys()
classifiers = ESTIMATORS.values()

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds in datasets:
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
               alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    cnt = 0
    for name, clf in zip(names, classifiers):
        cnt += 1
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to
        # each point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        # stagger the titles so neighbouring subplot titles do not overlap
        if cnt % 2 == 0:
            ax.set_title(name, fontsize=10, y=1.08)
        else:
            ax.set_title(name, fontsize=10)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

figure.subplots_adjust(left=.02, right=.98)
plt.show()
""" | |
Classification performance: | |
=========================== | |
Classifier train-time test-time Hamming Loss | |
------------------------------------------------------------------- | |
MLP_SGD_constant_no_momentum 5535.1194s 13.0709s 0.0136 | |
MLP_SGD_adaptive_nesterov_early 1295.6843s 11.7958s 0.0139 | |
MLP_SGD_constant_nesterov_early 860.3760s 11.6086s 0.0139 | |
MLP_Adam_early 570.8964s 11.9987s 0.0141 | |
MLP_SGD_adaptive_nesterov 5878.9259s 12.5683s 0.0144 | |
MLP_Adam 2589.0170s 14.1955s 0.0145 | |
MLP_SGD_constant_momentum 2408.1225s 11.9756s 0.0145 | |
MLP_SGD_constant_nesterov 4265.0237s 11.8534s 0.0145 | |
MLP_SGD_invscaling_nesterov 751.3171s 11.2348s 0.0315 | |
MLP_SGD_constant_no_momentum_early 50.1751s 11.1671s 0.0320 | |
MLP_SGD_invscaling_nesterov_early 70.1446s 11.3409s 0.0320 | |
""" | |
from __future__ import print_function, division | |
from time import time | |
import cPickle as pickle | |
import argparse | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from sklearn.datasets import fetch_rcv1 | |
from sklearn.metrics import hamming_loss | |
from sklearn.utils.validation import check_array | |
from sklearn.neural_network import MLPClassifier | |
def make_plots(loss, val_loss): | |
non_early = [name for name in loss if not name.endswith('_early')] | |
early = [name for name in loss if name.endswith('_early')] | |
fig, axes = plt.subplots(2, 3, figsize=(15, 10)) | |
make_sub_plot({name: loss[name] for name in non_early}, axes.ravel()[0]) | |
for name, ax in zip(early, axes.ravel()[1:]): | |
make_sub_plot({name[:-6]: loss[name[:-6]], name: loss[name], | |
name + '_val': val_loss[name]}, ax) | |
plt.subplots_adjust(hspace=0.45) | |
plt.subplots_adjust(top=0.8) | |
plt.show() | |
def make_sub_plot(loss, ax): | |
plot_args = [{'c': 'red', 'linestyle': '-'}, | |
{'c': 'green', 'linestyle': '-'}, | |
{'c': 'blue', 'linestyle': '-'}, | |
{'c': 'red', 'linestyle': '--'}, | |
{'c': 'green', 'linestyle': '--'}, | |
{'c': 'blue', 'linestyle': '--'}] | |
for label, loss_curve, args in zip(loss.keys(), loss.values(), plot_args): | |
ax.plot(loss_curve, label=label, **args) | |
if len(loss) > 3: | |
ax.legend(ax.get_lines(), labels=loss.keys(), loc='center right', | |
bbox_to_anchor=(0.95, 1.30), fontsize=11) | |
else: | |
ax.legend(ax.get_lines(), labels=loss.keys(), loc='center right', | |
bbox_to_anchor=(1.05, 1.20), fontsize=11) | |
ESTIMATORS = { | |
'MLP_SGD_constant_no_momentum': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False), | |
'MLP_SGD_constant_no_momentum_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False, | |
early_stopping=True), | |
'MLP_SGD_constant_momentum': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, verbose=1, | |
tol=1e-4, random_state=1, nesterovs_momentum=False), | |
'MLP_SGD_constant_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1), | |
'MLP_SGD_constant_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
early_stopping=True), | |
'MLP_SGD_invscaling_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='invscaling'), | |
'MLP_SGD_invscaling_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='invscaling', early_stopping=True), | |
'MLP_SGD_adaptive_nesterov': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='adaptive'), | |
'MLP_SGD_adaptive_nesterov_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='sgd', learning_rate_init=0.01, momentum=0.9, | |
nesterovs_momentum=True, verbose=1, tol=1e-4, random_state=1, | |
learning_rate='adaptive', early_stopping=True), | |
'MLP_Adam': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='adam', learning_rate_init=0.001, verbose=1, | |
tol=1e-4, random_state=1), | |
'MLP_Adam_early': MLPClassifier( | |
hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, | |
algorithm='adam', learning_rate_init=0.001, verbose=1, | |
tol=1e-4, random_state=1, early_stopping=True), | |
} | |
############################################################################### | |
# Data | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument('-e', '--estimators', nargs="+", required=True, | |
choices=ESTIMATORS.keys() + ['all']) | |
args = vars(parser.parse_args()) | |
data_train = fetch_rcv1(subset="train", shuffle=True, random_state=1) | |
data_test = fetch_rcv1(subset="test", shuffle=True, random_state=1) | |
X_train = check_array(data_train.data, dtype=np.float32, | |
accept_sparse="csr") | |
X_test = check_array(data_test.data, dtype=np.float32, accept_sparse="csr") | |
y_train = data_train.target | |
y_test = data_test.target | |
print("rcv1") | |
print("=============") | |
print("X_train.shape = {0}".format(X_train.shape)) | |
print("X_train.format = {0}".format(X_train.format)) | |
print("X_train.dtype = {0}".format(X_train.dtype)) | |
print("X_train density = {0}" | |
"".format(X_train.nnz / np.product(X_train.shape))) | |
print("y_train {0}".format(y_train.shape)) | |
print("X_test {0}".format(X_test.shape)) | |
print("X_test.format = {0}".format(X_test.format)) | |
print("X_test.dtype = {0}".format(X_test.dtype)) | |
print("y_test {0}".format(y_test.shape)) | |
print() | |
print("Classifier Training") | |
print("===================") | |
if 'all' in args['estimators']: | |
args['estimators'] = ESTIMATORS.keys() | |
hmg_loss, train_time, test_time, loss_curve, val_curve = {}, {}, {}, {}, {} | |
for name in sorted(args["estimators"]): | |
clf = ESTIMATORS[name] | |
try: | |
clf.set_params(random_state=0) | |
except (TypeError, ValueError): | |
pass | |
print("Training %s ... " % name, end="") | |
t0 = time() | |
clf.fit(X_train, y_train) | |
train_time[name] = time() - t0 | |
t0 = time() | |
y_pred = clf.predict(X_test) | |
test_time[name] = time() - t0 | |
hmg_loss[name] = hamming_loss(y_test, y_pred) | |
loss_curve[name] = clf.loss_curve_ | |
val_curve[name] = getattr(clf, 'validation_scores_', []) | |
print("done") | |
print() | |
print("Classification performance:") | |
print("===========================") | |
print() | |
print("%s %s %s %s" % ("Classifier ", "train-time", "test-time", | |
"Hamming Loss")) | |
print("-" * 67) | |
for name in sorted(hmg_loss, key=hmg_loss.get): | |
print("%s %s %s %s" % (name.ljust(36), | |
("%.4fs" % train_time[name]).center(10), | |
("%.4fs" % test_time[name]).center(10), | |
("%.4f" % hmg_loss[name]).center(10))) | |
print() | |
with open('loss_history_rcv1.pkl', 'wb') as f: | |
pickle.dump(loss_curve, f) | |
with open('val_loss_history_rcv1.pkl', 'wb') as f: | |
pickle.dump(val_curve, f) | |
make_plots(loss_curve, val_curve) |
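Unlike the other classification benchmarks, RCV1 is multilabel (each document can carry several of the 103 topic codes), so this script scores with Hamming loss: the fraction of individual label bits predicted wrongly. A toy check on 2x3 indicator matrices:

import numpy as np
from sklearn.metrics import hamming_loss

y_true = np.array([[1, 0, 1],
                   [0, 1, 0]])
y_pred = np.array([[1, 0, 0],
                   [0, 1, 1]])
assert np.isclose(hamming_loss(y_true, y_pred), 2 / 6.0)  # 2 wrong bits of 6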
""" | |
mean std | |
Sparse: 0.0586632259687 0.00355379776739 | |
np.dot: 0.0561765789986 0.00206648457631 | |
dummy: 0.0553110162417 0.00247213620297 | |
""" | |
from __future__ import print_function | |
import sys | |
from scipy.sparse import issparse | |
from sklearn.utils.extmath import safe_sparse_dot | |
import numpy as np | |
import time | |
def dummy_dot(a, b): | |
if issparse(a) or issparse(b): | |
raise ValueError | |
else: | |
return np.dot(a, b) | |
def compare(): | |
tests = [(np.random.rand(1000,10000), np.random.rand(10000)) for i in range(10)] | |
start = time.time() | |
for a, b in tests: | |
safe_sparse_dot(a, b) | |
elapsed_sparse = time.time() - start | |
start = time.time() | |
for a, b in tests: | |
np.dot(a, b) | |
elapsed_npdot = time.time() - start | |
start = time.time() | |
for a, b in tests: | |
dummy_dot(a, b) | |
elapsed_dummy = time.time() - start | |
return elapsed_sparse, elapsed_npdot, elapsed_dummy | |
def main(): | |
times = [] | |
n = 300 | |
for i in range(n): | |
times.append(compare()) | |
sys.stdout.write('\rFinished {} out of {}'.format(i+1, n)) | |
sys.stdout.flush() | |
times_sparse, times_npdot, times_dummy = map(np.array, zip(*times)) | |
avg_sparse, std_sparse = times_sparse.mean(), times_sparse.std() | |
avg_npdot, std_npdot = times_npdot.mean(), times_npdot.std() | |
avg_dummy, std_dummy = times_dummy.mean(), times_dummy.std() | |
print() | |
print(" mean std") | |
print("Sparse:", avg_sparse, std_sparse) | |
print("np.dot:", avg_npdot, std_npdot) | |
print("dummy: ", avg_dummy, std_dummy) | |
if __name__ == '__main__': | |
main() |
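The numbers in the docstring suggest safe_sparse_dot's dense path costs only a few percent over np.dot (0.0587s versus 0.0562s per batch of ten products), i.e. the issparse dispatch is cheap. What the wrapper buys is transparent handling of sparse operands; a small illustration:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.utils.extmath import safe_sparse_dot

a = csr_matrix(np.eye(3))
b = np.arange(3.0)
print(safe_sparse_dot(a, b))  # [ 0.  1.  2.] without densifying a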