Skip to content

Instantly share code, notes, and snippets.

@hvy
Created April 7, 2020 12:40
Show Gist options
  • Save hvy/59a7bd3507d4d05f95b46a610a1740c1 to your computer and use it in GitHub Desktop.
Plots importances evaluated by fANOVA (AutoML.org) and Fanova (sklearn based implementation).
"""
Plots importances evaluated by fANOVA (AutoML.org) and Fanova (sklearn based implementation).
1. Loads data from csv containing parameter configuration-objective value pairs.
2. Loads search space definitions from csv.
3. Evaluates all single and pairwise importances between parameters.
4. Repeats evaluation N times (with different random seeds).
5. Plots the average over N evaluations.
"""
import argparse
import itertools
import time
from collections import defaultdict
from collections import OrderedDict
from typing import Dict
from typing import Optional

from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter
from ConfigSpace.hyperparameters import UniformFloatHyperparameter
from fanova import fANOVA
from fanova_sklearn.fanova import Fanova
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
def main():
    """Compare importances from fANOVA (AutoML.org) and Fanova (sklearn-based).

    Reads parameter/objective pairs and search-space definitions from csv,
    evaluates single and pairwise importances with both implementations over
    ``-n`` random seeds, and saves a bar plot of the seed-averaged importances
    to ``--out``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-Xy", type=str, help="File path to parameter-value csv.")
    parser.add_argument("-s", type=str, help="File path to search space csv.")
    parser.add_argument("-n", type=int, default=1, help="Number of repetitions to average over.")
    parser.add_argument("--out", type=str, help="File path to output image.")
    args = parser.parse_args()

    # Read data from csv: every column but the last is a parameter, the last
    # column is the objective value.
    df = pd.read_csv(args.Xy)
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    feature_columns = X.columns
    X = np.asarray(X)
    y = np.asarray(y)
    print("Features", feature_columns)

    # Read search space definitions from csv. Rows 0-1 hold the bounds (or the
    # category count in row 1 for categoricals); row 2 flags categoricals.
    df = pd.read_csv(args.s)
    search_spaces = df.iloc[:2, :]
    search_spaces_is_categorical = df.iloc[2, :]
    search_spaces = np.asarray(search_spaces).T
    # Builtin `bool` instead of `np.bool`: the alias was deprecated in
    # NumPy 1.20 and removed in 1.24.
    search_spaces_is_categorical = np.asarray(
        search_spaces_is_categorical.astype(bool).tolist()
    )

    # Build the fANOVA (AutoML.org) search space.
    config_space = ConfigurationSpace()
    for feature_name, ss, ss_is_categorical in zip(
        feature_columns, search_spaces, search_spaces_is_categorical
    ):
        if ss_is_categorical:
            hp = CategoricalHyperparameter(feature_name, choices=list(range(int(ss[1]))))
        else:
            hp = UniformFloatHyperparameter(feature_name, lower=ss[0], upper=ss[1])
        config_space.add_hyperparameter(hp)

    # Take the mean over `args.n` differently-seeded random forests since
    # fits may vary.
    seeds = list(range(args.n))
    evaluator_importances = [defaultdict(list) for _ in range(2)]
    evaluator_times = [[] for _ in range(2)]
    for seed in seeds:
        evaluators = []
        evaluators.append(
            Fanova(
                n_estimators=32,
                max_depth=64,
                min_samples_split=2,
                min_samples_leaf=1,
                random_state=seed,
            )
        )
        evaluators[-1].fit(
            X=X,
            y=y,
            search_spaces=search_spaces,
            search_spaces_is_categorical=search_spaces_is_categorical,
        )
        evaluators.append(
            fANOVA(
                X=X,
                Y=y,
                n_trees=32,
                max_depth=64,
                max_features=len(feature_columns),
                min_samples_split=2,
                min_samples_leaf=1,
                seed=seed,
                config_space=config_space,
            )
        )
        for i, evaluator in enumerate(evaluators):
            # Single and pairwise importances. NOTE(review): `product` also
            # yields (f, f) and both orderings of each pair — presumably
            # intentional for the comparison; `combinations` would halve the
            # work if only unordered pairs are wanted.
            features = list(range(len(feature_columns)))
            feature_tuples = []
            for f in features:
                feature_tuples.append((f,))
            for f in itertools.product(features, features):
                feature_tuples.append(f)
            for feature_tuple in feature_tuples:
                start = time.time()
                imp = evaluator.quantify_importance(feature_tuple)
                end = time.time()
                duration = end - start
                print(
                    "Finished {} with evaluator {} in {}s.".format(
                        feature_tuple, evaluator.__class__.__name__, duration
                    )
                )
                imp = imp[feature_tuple]["individual importance"]
                evaluator_importances[i][feature_tuple].append(imp)
                evaluator_times[i].append(duration)

    # Report per-evaluator mean wall-clock time per importance query.
    for i in range(2):
        print(
            "{}: {}s.".format(
                evaluators[i].__class__.__name__, np.array(evaluator_times[i]).mean()
            )
        )

    # Average importances over seeds, keyed by feature-name tuples.
    data = {}
    for evaluator, evaluator_importance in zip(evaluators, evaluator_importances):
        mean_importances = {}
        for feature_tuple, importances in evaluator_importance.items():
            mean_importances[tuple(feature_columns[i] for i in feature_tuple)] = np.array(
                importances
            ).mean()
        data[evaluator.__class__.__name__] = mean_importances
    save_multiple_importances(data, args.out, "Fanova Comparison")
def save_multiple_importances(
    evaluator_importances: Dict[str, Dict[tuple, float]],
    filename: str,
    title: Optional[str] = None,
) -> None:
    """Save a horizontal bar chart comparing evaluators' normalized importances.

    Args:
        evaluator_importances: Maps evaluator name to a dict of
            parameter-name tuple -> raw importance. Each evaluator's values
            are normalized by their sum before plotting.
        filename: File path of the output image.
        title: Optional plot title.
    """
    # Sort by evaluator names so bar ordering is deterministic.
    evaluator_importances = OrderedDict(sorted(evaluator_importances.items(), key=lambda x: x[0]))

    # Union of all parameter tuples across evaluators, first-seen order.
    index = []
    for evaluator_name, importances in evaluator_importances.items():
        for param_name in importances.keys():
            if param_name not in index:
                index.append(param_name)

    data = defaultdict(list)
    for evaluator_name, importances in evaluator_importances.items():
        importance_values = np.asarray(list(importances.values()))
        tot_importance = importance_values.sum()
        for param_name in index:
            # `.get` with 0.0 guards against a parameter missing from one
            # evaluator (plain indexing would raise KeyError since `index`
            # is the union over all evaluators).
            data[evaluator_name].append(importances.get(param_name, 0.0) / tot_importance)

    df = pd.DataFrame(data, index=index)
    # ax = df.plot.barh(figsize=(10, 140)) # ffmpeg
    ax = df.plot.barh(figsize=(10, 40))  # pytorch, lightgbm
    ax.set_title(title)
    ax.set_xlabel("Importance")
    ax.set_ylabel("Parameter")
    plt.savefig(filename, bbox_inches="tight", dpi=100)
    print("Saved {}.".format(filename))
# Entry point: run only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment