"""
openml_rerf_test.py
"""
import sys
import openml
import argparse
import numpy as np
import pandas as pd
import sklearn
from sklearn import pipeline, preprocessing, compose, impute, feature_selection
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from RerF import fastRerF, fastPredict
# --
# CLI

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task-id',   type=int, default=3)
    parser.add_argument('--num-cores', type=int, default=16)
    parser.add_argument('--num-trees', type=int, default=500)
    parser.add_argument('--seed',      type=int, default=123)
    return parser.parse_args()


args = parse_args()
np.random.seed(args.seed)

# --
# Load dataset

task = openml.tasks.get_task(args.task_id)
X, y = task.get_X_and_y()

# Use first split (for now)
train_idx, test_idx = task.get_train_test_split_indices()

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# --
# Preprocess data

dataset = task.get_dataset()

nominal_indices = dataset.get_features_by_type(data_type='nominal', exclude=[task.target_name])
numeric_indices = dataset.get_features_by_type(data_type='numeric', exclude=[task.target_name])

prep = sklearn.pipeline.make_pipeline(
    sklearn.compose.ColumnTransformer(
        transformers=[
            # numeric columns: mean imputation, then standardization
            ('numeric', sklearn.pipeline.make_pipeline(
                sklearn.impute.SimpleImputer(strategy='mean'),
                sklearn.preprocessing.StandardScaler(),
            ), numeric_indices),
            # nominal columns: impute a sentinel value, then one-hot encode
            ('nominal', sklearn.pipeline.make_pipeline(
                sklearn.impute.SimpleImputer(strategy='constant', fill_value=-1),
                sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore'),
            ), nominal_indices),
        ],
        remainder='passthrough',
    ),
    sklearn.feature_selection.VarianceThreshold(),  # drop zero-variance (constant) columns
)

Xf_train = prep.fit_transform(X_train)
Xf_test = prep.transform(X_test)
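
# Optional shape check (not in the original gist): both matrices should end up
# with the same number of columns after passing through the shared pipeline.
print('Xf_train shape:', Xf_train.shape, '| Xf_test shape:', Xf_test.shape, file=sys.stderr)
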
# --
# Train models

def fit_rerf(Xf_train, Xf_test, y_train, y_test, num_trees, num_cores):
    rerf_forest = fastRerF(
        X=Xf_train,
        Y=y_train,
        forestType="binnedBaseRerF",
        trees=num_trees,
        numCores=num_cores,
    )
    return fastPredict(X=Xf_test, forest=rerf_forest)


def fit_sklearn(Xf_train, Xf_test, y_train, y_test, num_trees, num_cores):
    sk_forest = RandomForestClassifier(n_estimators=num_trees, n_jobs=num_cores)
    sk_forest = sk_forest.fit(Xf_train, y_train)
    return sk_forest.predict(Xf_test)


kwargs = {
    "Xf_train"  : Xf_train,
    "Xf_test"   : Xf_test,
    "y_train"   : y_train,
    "y_test"    : y_test,
    "num_trees" : args.num_trees,
    "num_cores" : args.num_cores,
}

# Fit each model 10 times and compare mean test accuracy

print('-' * 50, file=sys.stderr)
print('fit rerf', file=sys.stderr)
rerf_pred = [fit_rerf(**kwargs) for _ in range(10)]
rerf_accs = [(y_test == p).mean() for p in rerf_pred]

print('-' * 50, file=sys.stderr)
print('fit sklearn', file=sys.stderr)
sk_pred = [fit_sklearn(**kwargs) for _ in range(10)]
sk_accs = [(y_test == p).mean() for p in sk_pred]

print('np.mean(rerf_accs)', np.mean(rerf_accs))
print('np.mean(sk_accs)', np.mean(sk_accs))