Created
June 7, 2018 01:09
-
-
Save rayheberer/16cbbfead8aef036c6d0b9e3b980d405 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compare GridSearchCV and RandomizedSearchCV over the SVC hyperparameters
# C and gamma on the iris data. Every evaluated candidate is drawn as one
# point in the (C, gamma) plane, coloured by its mean cross-validated score.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

X, y = load_iris(return_X_y=True)

# Seed NumPy's global generator before any search runs.
# NOTE(review): presumably this is what makes the randomized search
# repeatable, since no random_state is passed below -- confirm.
np.random.seed(41)

svc = SVC()

# Grid search: 10 values of C (log-spaced, 1 .. 1000) times 10 values of
# gamma (linearly spaced, 0.0001 .. 0.001).
params1 = {'C': np.logspace(0, 3, 10), 'gamma': np.linspace(0.0001, 0.001, 10)}
clf1 = GridSearchCV(svc, params1)

# Randomized search: same ranges, but no explicit count is given, so NumPy's
# default number of samples per axis applies; 100 candidates are drawn.
params2 = {'C': np.logspace(0, 3), 'gamma': np.linspace(0.0001, 0.001)}
clf2 = RandomizedSearchCV(svc, params2, n_iter=100)

fig, axs = plt.subplots(1, 2, figsize=(10, 6))
fig.subplots_adjust(wspace=0.3)
fig.suptitle('Exploring SVM Hyperparameter Values: Grid vs Randomized Search')

# One shared axis label per direction, placed on the figure rather than on
# each subplot.
fig.text(0.5, 0.04, 'C', ha='center')
fig.text(0.04, 0.5, 'gamma', va='center', rotation='vertical')

for clf, ax in zip([clf1, clf2], axs.ravel()):
    clf.fit(X, y)

    scores = clf.cv_results_['mean_test_score']
    C = clf.cv_results_['param_C']
    gamma = clf.cv_results_['param_gamma']

    plot = ax.scatter(C, gamma, c=scores, cmap='coolwarm')
    ax.set(ylim=(0, 0.0011))
    ax.set_xscale('log')  # C is sampled on a log scale

    # Each panel gets its own colorbar, scaled to that search's scores.
    cbar = fig.colorbar(plot, ax=ax)
    cbar.set_label('Validation Accuracy',
                   rotation=270,
                   rotation_mode='default',
                   verticalalignment='center')

# Without this the figure is never displayed when run as a plain script.
plt.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Plot the decision regions of random forests with 1, 2, 5 and 10 trees on a
# noisy two-moons dataset, one subplot per forest size.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global generator before the data is generated.
np.random.seed(41)
X, y = make_moons(noise=0.1)

# Evaluation grid covering the data with a 0.5 margin on every side.
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.linspace(x_min, x_max),
                     np.linspace(y_min, y_max))

fig, axs = plt.subplots(2, 2, figsize=(10, 6))
for n, ax in zip([1, 2, 5, 10], axs.ravel()):
    clf = RandomForestClassifier(n_estimators=n)
    clf.fit(X, y)

    # Predict a class for every grid node, then fold the flat predictions
    # back into the grid's 2-D shape for contouring.
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    ax.scatter(X[:, 0], X[:, 1], c=y, cmap='RdBu')
    # The translucent regions are drawn after (on top of) the points.
    ax.contourf(xx, yy, Z, cmap='RdBu', alpha=0.7, corner_mask=False)
    ax.set(title='n_estimators={}'.format(n))

# Without this the figure is never displayed when run as a plain script.
plt.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Setup for the validation-set-size experiment: imports, RNG seed, and a
# synthetic 4000-sample classification dataset used by everything below.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Seed first, then generate: the dataset is created after the global
# generator has been seeded.
np.random.seed(13)
X, y = make_classification(
    n_samples=4000,
)
def repeated_train_test_scores(test_size, trials=30):
    """Score a freshly trained logistic regression on repeated random splits.

    Every trial re-splits the module-level ``X`` / ``y`` with
    ``train_test_split``, fits a new ``LogisticRegression`` on the training
    part and records ``score`` on the held-out part.

    Args:
        test_size: Passed through unchanged as ``train_test_split``'s
            ``test_size`` argument.
        trials: How many independent split/fit/score rounds to run.

    Returns:
        A list with one held-out score per trial, in the order computed.
    """
    def score_one_split():
        # One round: new random split, new model, one held-out score.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size)
        model = LogisticRegression()
        model.fit(X_train, y_train)
        return model.score(X_test, y_test)

    return [score_one_split() for _ in range(trials)]
# Figure 1: how the spread (standard deviation) of the validation score over
# repeated random splits changes with the number of validation samples.
sizes = np.arange(20, 1001, 20)
spreads = [np.std(repeated_train_test_scores(test_size)) for test_size in sizes]

plt.plot(sizes, spreads, 'o')
plt.title('Standard Deviations of Validation Scores of Trained Models')
plt.xlabel('Validation Set Samples')
plt.ylabel('Standard Deviation of Validation Accuracy')
plt.show()

# Figure 2: score distributions for four validation-set sizes, 50 trials each.
fig, axs = plt.subplots(2, 2)
fig.subplots_adjust(hspace=0.2)
sizes = [100, 500, 1000, 2000]
for ax, size in zip(axs.ravel(), sizes):
    scores = repeated_train_test_scores(size, trials=50)
    # The same histogram range is used on every panel so the four
    # distributions are binned identically.
    # NOTE(review): recent seaborn releases deprecate distplot in favour of
    # histplot/displot; kept as-is so the script does not require a newer
    # seaborn -- confirm the target version before migrating.
    sns.distplot(scores, bins=20, ax=ax, hist_kws={'range': (0.83, 0.97)})
    ax.set(title='Validation Set: {} samples'.format(size))

# The first figure is shown explicitly above; without this call the second
# figure is never displayed when run as a plain script.
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment