""" | |
Plots importances evaluated by fANOVA (AutoML.org) and Fanova (sklearn based implementation). | |
1. Loads data from csv containing parameter configuration-objective value pairs. | |
2. Loads search space definitions from csv. | |
3. Evaluates all single and pairwise importances between parameters. | |
4. Repeats evaluation N times (with different random seeds). | |
5. Plots the average over N evaluations. | |
""" | |

import argparse
from collections import defaultdict
from collections import OrderedDict
import itertools
import time
from typing import Dict
from typing import Optional
from typing import Tuple

from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter
from ConfigSpace.hyperparameters import UniformFloatHyperparameter
from fanova import fANOVA
from fanova_sklearn.fanova import Fanova
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
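
# Note: "fanova" is the AutoML.org package (https://github.com/automl/fanova),
# while "fanova_sklearn" is assumed to be a local scikit-learn based
# reimplementation distributed alongside this script, exposing the
# fit/quantify_importance interface used below.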


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-Xy", type=str, required=True, help="File path to parameter-value csv.")
    parser.add_argument("-s", type=str, required=True, help="File path to search space csv.")
    parser.add_argument("-n", type=int, default=1, help="Number of repetitions to average over.")
    parser.add_argument("--out", type=str, required=True, help="File path to output image.")
    args = parser.parse_args()

    # Read parameter configurations (X) and objective values (y) from csv.
    df = pd.read_csv(args.Xy)
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    feature_columns = X.columns
    X = np.asarray(X)
    y = np.asarray(y)
    print("Features", feature_columns)

    # Read search space definitions from csv: the first two rows hold the
    # lower/upper bounds and the third row flags categorical parameters.
    df = pd.read_csv(args.s)
    search_spaces = df.iloc[:2, :]
    search_spaces_is_categorical = df.iloc[2, :]
    search_spaces = np.asarray(search_spaces).T
    search_spaces_is_categorical = np.asarray(
        search_spaces_is_categorical.astype(bool).tolist()
    )

    # Build the fANOVA (AutoML.org) search space. Categorical parameters are
    # assumed to be encoded as integers 0..n_choices-1, with the number of
    # choices stored in the second search space row.
    config_space = ConfigurationSpace()
    for feature_name, ss, ss_is_categorical in zip(
        feature_columns, search_spaces, search_spaces_is_categorical
    ):
        if ss_is_categorical:
            hp = CategoricalHyperparameter(feature_name, choices=list(range(int(ss[1]))))
        else:
            hp = UniformFloatHyperparameter(feature_name, lower=ss[0], upper=ss[1])
        config_space.add_hyperparameter(hp)
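
    # With the illustrative csvs above, this space would be equivalent to:
    #   lr ~ UniformFloat(lower=0.001, upper=1.0)
    #   n_layers ~ Categorical(choices=[0, 1, 2, 3])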

    # Average over args.n differently seeded random forests, since a single
    # fit may vary.
    seeds = list(range(args.n))
    evaluator_importances = [defaultdict(list) for _ in range(2)]
    evaluator_times = [[] for _ in range(2)]

    for seed in seeds:
        evaluators = []
        evaluators.append(
            Fanova(
                n_estimators=32,
                max_depth=64,
                min_samples_split=2,
                min_samples_leaf=1,
                random_state=seed,
            )
        )
        evaluators[-1].fit(
            X=X,
            y=y,
            search_spaces=search_spaces,
            search_spaces_is_categorical=search_spaces_is_categorical,
        )
        evaluators.append(
            fANOVA(
                X=X,
                Y=y,
                n_trees=32,
                max_depth=64,
                max_features=len(feature_columns),
                min_samples_split=2,
                min_samples_leaf=1,
                seed=seed,
                config_space=config_space,
            )
        )

        for i, evaluator in enumerate(evaluators):
            # Evaluate all single and pairwise importances. Using
            # itertools.combinations avoids the self-pairs (f, f) and the
            # mirrored duplicates (g, f) that itertools.product would produce.
            features = list(range(len(feature_columns)))
            feature_tuples = []
            for f in features:
                feature_tuples.append((f,))
            for f in itertools.combinations(features, 2):
                feature_tuples.append(f)
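
            # fANOVA decomposes the variance of the observed objective values
            # into contributions from subsets of parameters (Hutter et al.,
            # ICML 2014); the "individual importance" read below is the
            # variance fraction attributed to exactly this parameter subset.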
            for feature_tuple in feature_tuples:
                start = time.time()
                imp = evaluator.quantify_importance(feature_tuple)
                duration = time.time() - start
                print(
                    "Finished {} with evaluator {} in {:.3f}s.".format(
                        feature_tuple, evaluator.__class__.__name__, duration
                    )
                )
                imp = imp[feature_tuple]["individual importance"]
                evaluator_importances[i][feature_tuple].append(imp)
                evaluator_times[i].append(duration)

    # Report the mean evaluation time per importance query for each evaluator.
    for i in range(2):
        print(
            "{}: {:.3f}s.".format(
                evaluators[i].__class__.__name__, np.array(evaluator_times[i]).mean()
            )
        )

    # Average each feature tuple's importances over the args.n repetitions
    # and key them by parameter names instead of column indices.
    data = {}
    for evaluator, evaluator_importance in zip(evaluators, evaluator_importances):
        mean_importances = {}
        for feature_tuple, importances in evaluator_importance.items():
            mean_importances[tuple(feature_columns[i] for i in feature_tuple)] = np.array(
                importances
            ).mean()
        data[evaluator.__class__.__name__] = mean_importances

    save_multiple_importances(data, args.out, "Fanova Comparison")


def save_multiple_importances(
    evaluator_importances: Dict[str, Dict[Tuple[str, ...], float]],
    filename: str,
    title: Optional[str] = None,
):
    # Sort by evaluator names.
    evaluator_importances = OrderedDict(sorted(evaluator_importances.items(), key=lambda x: x[0]))

    # Collect the union of parameter tuples across evaluators, preserving order.
    index = []
    for evaluator_name, importances in evaluator_importances.items():
        for param_name in importances.keys():
            if param_name not in index:
                index.append(param_name)

    # Normalize each evaluator's importances so that they sum to one; tuples
    # missing from an evaluator default to zero importance.
    data = defaultdict(list)
    for evaluator_name, importances in evaluator_importances.items():
        importance_values = np.asarray(list(importances.values()))
        tot_importance = importance_values.sum()
        for param_name in index:
            data[evaluator_name].append(importances.get(param_name, 0.0) / tot_importance)

    df = pd.DataFrame(data, index=index)
    # The figure height is tuned per study, e.g. 140 for ffmpeg and 40 for
    # pytorch/lightgbm.
    # ax = df.plot.barh(figsize=(10, 140))  # ffmpeg
    ax = df.plot.barh(figsize=(10, 40))  # pytorch, lightgbm
    ax.set_title(title)
    ax.set_xlabel("Importance")
    ax.set_ylabel("Parameter")
    plt.savefig(filename, bbox_inches="tight", dpi=100)
    print("Saved {}.".format(filename))
if __name__ == "__main__": | |
main() |
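
# Example invocation (script and file names are hypothetical):
#   python plot_fanova_importances.py -Xy trials.csv -s search_space.csv -n 10 --out importances.png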