Last active
March 24, 2021 21:33
-
-
Save DMTSource/2b38b473270a50e71025dd6cb1c03521 to your computer and use it in GitHub Desktop.
Modified version of the readme_long_example from baikal, extended with a GridSearchCV step. Attempting to make the grid search work with multiple inputs: https://github.com/alegonz/baikal/issues/50
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Multiple-input, nonlinear baikal pipeline combined with scikit-learn's
# GridSearchCV.
#
# The script builds a model with two feature inputs (x1, x2) and one target
# input (y_t), then tries to tune it with GridSearchCV.
#
# KNOWN ISSUE: the grid-search fit in section 3 is reported to fail with
#   ValueError: Found input variables with inconsistent numbers of samples: [2, 426]
# so nothing after that call is reached.

import sklearn.decomposition
import sklearn.ensemble
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# Added on top of the original example, for the grid search:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from baikal import Input, Model, make_step
from baikal.plot import plot_model
from baikal.steps import Stack

# 1. Define the steps
# make_step wraps each scikit-learn estimator class so that instances can be
# called on baikal inputs (see section 2).
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)
PCA = make_step(sklearn.decomposition.PCA)
SVC = make_step(sklearn.svm.SVC)
PowerTransformer = make_step(sklearn.preprocessing.PowerTransformer)

# 2. Build the model
x1 = Input(name="x1")
x2 = Input(name="x2")
y_t = Input(name="y_t")

# Three first-level classifiers: one on x1, one on x2, and one on a
# power-transformed, PCA-reduced view of x2.
y1 = ExtraTreesClassifier()(x1, y_t)
y2 = RandomForestClassifier()(x2, y_t)
z = PowerTransformer()(x2)
z = PCA()(z)
y3 = LogisticRegression()(z, y_t)

# Their outputs are stacked and fed to the final SVC.
stacked_features = Stack()([y1, y2, y3])
y_p = SVC()(stacked_features, y_t)

model = Model([x1, x2], y_p, y_t)
plot_model(model, filename="multiple_input_nonlinear_pipeline_example_plot.png")

# 3. Train the model
dataset = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, random_state=0
)

# Let's suppose the dataset is originally split in two:
# the first 15 feature columns go to x1, the remaining ones to x2.
X1_train, X2_train = X_train[:, :15], X_train[:, 15:]
X1_test, X2_test = X_test[:, :15], X_test[:, 15:]

# Direct training call, disabled in favor of the grid search below:
#   model.fit([X1_train, X2_train], y_train)

# Keys follow the "<step name>" / "<step name>__<parameter>" convention and
# refer to the default step names of the model built above.
param_grid = [
    {
        "LogisticRegression_0": [
            LogisticRegression(
                random_state=0, solver="lbfgs", multi_class="multinomial"
            )
        ],
        "LogisticRegression_0__C": [0.01, 0.1, 1],
        "PCA_0__n_components": [1, 2, 3, 4],
    },
    {
        "RandomForestClassifier_0": [RandomForestClassifier(random_state=0)],
        "RandomForestClassifier_0__n_estimators": [10, 50, 100],
    },
]

# random_state is left unset: shuffling is off, so the folds involve no
# randomness.
cv = StratifiedKFold(n_splits=3)

gscv_baikal = GridSearchCV(
    model,
    param_grid,
    cv=cv,
    scoring="accuracy",
    return_train_score=True,
    verbose=1,
)

# THIS WILL FAIL, it does NOT like the inputs:
#   ValueError: Found input variables with inconsistent numbers of samples: [2, 426]
# NOTE(review): the "[2, 426]" suggests the two-element input list is being
# measured as 2 samples against the 426 training targets -- confirm.
gscv_baikal.fit([X1_train, X2_train], y_train)

print("Best score:", gscv_baikal.best_score_)
print("Best parameters", gscv_baikal.best_params_)

# NOTE(review): this assumes best_estimator_ exposes a `.model` attribute;
# confirm, since `model` itself (not a wrapper) was passed to GridSearchCV.
model = gscv_baikal.best_estimator_.model

# 4. Use the model
y_test_pred = model.predict([X1_test, X2_test])

# This also works:
# y_test_pred = model.predict({x1: X1_test, x2: X2_test})

# We can also query any intermediate outputs:
outs = model.predict(
    [X1_test, X2_test], output_names=["ExtraTreesClassifier_0:0/0", "PCA_0:0/0"]
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment