Last active
March 19, 2019 23:43
-
-
Save bkj/0f2c4154d1eb91477e1d11b67bb6b3df to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
openml_rerf_test.py

Compare RerF and scikit-learn random forests on an OpenML task.
"""
import sys | |
import openml | |
import argparse | |
import numpy as np | |
import pandas as pd | |
import sklearn | |
from sklearn import compose, impute, feature_selection | |
from sklearn.model_selection import train_test_split | |
from sklearn.ensemble import RandomForestClassifier | |
from RerF import fastRerF, fastPredict | |
# -- | |
# CLI | |
def parse_args(argv=None):
    """Parse the command-line options of the benchmark script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so a
            plain ``parse_args()`` call behaves as before.

    Returns:
        argparse.Namespace with integer attributes ``task_id``,
        ``num_cores``, ``num_trees`` and ``seed``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('--task-id', type=int, default=3,
                        help='id of the OpenML task to load')
    parser.add_argument('--num-cores', type=int, default=16,
                        help='number of cores handed to each forest')
    parser.add_argument('--num-trees', type=int, default=500,
                        help='number of trees per forest')
    parser.add_argument('--seed', type=int, default=123,
                        help='seed for the numpy random generator')
    return parser.parse_args(argv)
# Read the CLI options and seed numpy's global generator.
args = parse_args()
np.random.seed(args.seed)

# --
# Load dataset

task = openml.tasks.get_task(args.task_id)
X, y = task.get_X_and_y()

# Use first split (for now)
train_idx, test_idx = task.get_train_test_split_indices()

# Slice features and labels with the same index arrays, one split at a time.
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]
# -- | |
# Preprocess data | |
# Column indices of the nominal and numeric features, with the target column excluded.
dataset = task.get_dataset()
nominal_indices = dataset.get_features_by_type(data_type='nominal', exclude=[task.target_name])
numeric_indices = dataset.get_features_by_type(data_type='numeric', exclude=[task.target_name])

# NOTE(review): sklearn.pipeline and sklearn.preprocessing are reached as
# attributes of the top-level package without an explicit import; presumably
# they get loaded as a side effect of importing sklearn.compose -- confirm.
prep = sklearn.pipeline.make_pipeline(
    sklearn.compose.ColumnTransformer(
        transformers=[
            # Numeric columns: impute, then standardize.
            ('numeric', sklearn.pipeline.make_pipeline(
                # SimpleImputer replaces sklearn.preprocessing.Imputer, which
                # was deprecated in scikit-learn 0.20 and later removed; both
                # default to mean imputation. This also matches the imputer
                # class used for the nominal columns below.
                sklearn.impute.SimpleImputer(),
                sklearn.preprocessing.StandardScaler(),
            ), numeric_indices),
            # Nominal columns: fill missing entries with -1, then one-hot
            # encode; categories unseen during fit are ignored at transform.
            ('nominal', sklearn.pipeline.make_pipeline(
                sklearn.impute.SimpleImputer(strategy='constant', fill_value=-1),
                sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore'),
            ), nominal_indices)
        ],
        # Columns in neither index list are passed through unchanged.
        remainder='passthrough',
    ),
    sklearn.feature_selection.VarianceThreshold(),
)

# Fit the preprocessing on the training split only, then apply it to both splits.
Xf_train = prep.fit_transform(X_train)
Xf_test = prep.transform(X_test)
# -- | |
# Train models | |
def fit_rerf(Xf_train, Xf_test, y_train, y_test, num_trees, num_cores):
    """Train a RerF forest on the training data and predict the test data.

    Builds a ``"binnedBaseRerF"`` forest with ``fastRerF`` using ``num_trees``
    trees and ``num_cores`` cores, then returns whatever ``fastPredict``
    yields for ``Xf_test``.

    ``y_test`` is not used here; it is accepted so that the same keyword
    bundle can be passed to ``fit_rerf`` and ``fit_sklearn``.
    """
    forest_options = dict(
        forestType="binnedBaseRerF",
        trees=num_trees,
        numCores=num_cores,
    )
    trained_forest = fastRerF(X=Xf_train, Y=y_train, **forest_options)
    predictions = fastPredict(X=Xf_test, forest=trained_forest)
    return predictions
def fit_sklearn(Xf_train, Xf_test, y_train, y_test, num_trees, num_cores):
    """Train a scikit-learn random forest and predict the test data.

    Fits a ``RandomForestClassifier`` with ``num_trees`` estimators and
    ``num_cores`` jobs on ``(Xf_train, y_train)`` and returns its
    predictions for ``Xf_test``.

    ``y_test`` is not used here; it is accepted so that the same keyword
    bundle can be passed to ``fit_rerf`` and ``fit_sklearn``.
    """
    classifier = RandomForestClassifier(n_jobs=num_cores, n_estimators=num_trees)
    classifier.fit(Xf_train, y_train)
    predictions = classifier.predict(Xf_test)
    return predictions
# Both fit functions share one signature, so a single keyword bundle drives them.
kwargs = dict(
    Xf_train=Xf_train,
    Xf_test=Xf_test,
    y_train=y_train,
    y_test=y_test,
    num_trees=args.num_trees,
    num_cores=args.num_cores,
)

# Progress markers go to stderr; only the final scores are written to stdout.
print('-' * 50, file=sys.stderr)
print('fit rerf', file=sys.stderr)
# Ten independent fits; accuracy is the fraction of test labels matched.
rerf_pred = [fit_rerf(**kwargs) for _run in range(10)]
rerf_accs = [(y_test == predicted).mean() for predicted in rerf_pred]

print('-' * 50, file=sys.stderr)
print('fit sklearn', file=sys.stderr)
sk_pred = [fit_sklearn(**kwargs) for _run in range(10)]
sk_accs = [(y_test == predicted).mean() for predicted in sk_pred]

# Mean accuracy over the ten runs of each model.
print('np.mean(rerf_accs)', np.mean(rerf_accs))
print('np.mean(sk_accs)', np.mean(sk_accs))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment