@dataders
Last active February 11, 2021 16:29
Fetching the best model from a HyperDriveStep: the first snippet is get_metrics.py (the script named in the PythonScriptStep below); the second shows how the two pipeline steps are wired together.
import argparse
import os
from pprint import pprint

import pandas as pd
from azureml.core import Run


def df2csv(df, dir, filename, **kwargs):
    """Save a DataFrame as CSV under `dir` and return it unchanged."""
    path = os.path.join(dir, filename)
    print("saving {} to {}".format(filename, dir))
    df.to_csv(path, index=False, **kwargs)
    return df


def main(input_json):
    """Flatten the HyperDrive metrics frame and pick out the best run."""
    df = (input_json
          # each metric arrives as a single-item list; unwrap to a float
          .transform(lambda x: x.apply(lambda y: y[0]))
          .reset_index()
          .rename(columns={
              "index": "run_id",
              "geometric mean": "geometric_mean"})
          )
    print(df.head())
    best_run = df.query('geometric_mean == geometric_mean.max()')
    best_run_id = best_run['run_id'].values[0]
    # average geometric_mean across the three best runs
    top3_runs = df.nlargest(3, 'geometric_mean')['geometric_mean'].mean()
    return df, best_run, best_run_id, top3_runs
if __name__ == "__main__":
    # parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', dest="input_file",
                        default="outputs/hyperdrive_json")
    parser.add_argument('--output_dir', dest="output_dir",
                        default="outputs")
    parser.add_argument('--exp_name', dest="exp_name",
                        default="ret-reproduceerrormsgs")
    args = parser.parse_args()
    print("all args: ")
    pprint(vars(args))

    # INPUTS
    cwd = os.getcwd()
    print("cwd:", cwd)
    print("dir of cwd:", os.listdir(cwd))
    parent = os.path.dirname(args.input_file)
    print("input_dir_parent:", parent)
    print("dir of input_dir_parent:", os.listdir(parent))
    print("input file:", args.input_file)
    input_json = pd.read_json(args.input_file, orient='index')

    # MAIN
    df, best_run, best_run_id, top3_runs = main(input_json)

    # OUTPUTS
    os.makedirs(args.output_dir, exist_ok=True)
    # write once to the PipelineData dir (for downstream steps) and once to
    # ./outputs, which Azure ML uploads to the run history automatically
    df2csv(df, args.output_dir, "hyperdrive_metrics.csv")
    df2csv(df, "./outputs", "hyperdrive_metrics.csv")

    # log the best run's metrics on this step's run
    run = Run.get_context()
    run.log("run_id", best_run_id)
    run.log("geometric_mean", best_run['geometric_mean'].values[0])
    run.log("f1", best_run['f1'].values[0])
    run.log("geo_mean (top 3 avg)", top3_runs)

    # download the best child run's trained model into both output locations
    exp = run.experiment
    run_best = Run(experiment=exp, run_id=best_run_id)
    run_best.download_file('outputs/Attrition.pkl', output_file_path="./outputs")
    run_best.download_file('outputs/Attrition.pkl', output_file_path=args.output_dir)
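
For reference, the metrics file that pd.read_json(args.input_file, orient='index') parses maps each child run id to its logged metrics, each wrapped in a single-item list (hence the unwrapping in main()). A minimal sketch, with made-up run ids and values:

import json
import pandas as pd

# hypothetical hyperdrive_json contents, for illustration only
sample = {
    "HD_1234_0": {"geometric mean": [0.81], "f1": [0.77]},
    "HD_1234_1": {"geometric mean": [0.86], "f1": [0.80]},
}
df = pd.read_json(json.dumps(sample), orient='index')
# each cell is a single-item list; main() unwraps them to floats
df = df.transform(lambda x: x.apply(lambda y: y[0]))
print(df)   # one row per run, scalar metric columns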
The pipeline wiring that produces the script's input: the HyperDriveStep writes its metrics JSON to the hyperdrive_json PipelineData, which get_metrics.py consumes. (ds_pipeline, hyperdrive_run_config, pipeline_reuse, compute_target, run_config, and project_name are defined elsewhere in the pipeline script.)

import os

from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import HyperDriveStep, PythonScriptStep

split_data = PipelineData('split_data', datastore=ds_pipeline)
hyperdrive_json = PipelineData('hyperdrive_json', is_directory=False, datastore=ds_pipeline)
best_run_data = PipelineData('best_run_data', is_directory=True, datastore=ds_pipeline)

hyperdrive_step = HyperDriveStep(
    name='kickoff hyperdrive jobs',
    hyperdrive_config=hyperdrive_run_config,
    estimator_entry_script_arguments=["--input_dir", split_data],
    inputs=[split_data],
    metrics_output=hyperdrive_json,
    allow_reuse=pipeline_reuse
)

best_run_step = PythonScriptStep(
    name='get best run',
    script_name='get_metrics.py',
    arguments=['--input_file', hyperdrive_json,
               '--output_dir', best_run_data,
               '--exp_name', project_name],
    compute_target=compute_target,
    inputs=[hyperdrive_json],
    outputs=[best_run_data],
    runconfig=run_config,
    source_directory=os.path.join(os.getcwd(), 'compute', 'metrics'),
    allow_reuse=pipeline_reuse
)
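
To run these steps, one would presumably assemble them into a Pipeline and submit it to an experiment. A minimal sketch (the workspace handle ws and the experiment name are placeholders, not from the gist):

from azureml.core import Experiment, Workspace
from azureml.pipeline.core import Pipeline

ws = Workspace.from_config()
pipeline = Pipeline(workspace=ws, steps=[hyperdrive_step, best_run_step])
pipeline_run = Experiment(ws, 'hyperdrive-best-model').submit(pipeline)
pipeline_run.wait_for_completion()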