hyperdrive metric output debug
http:///pipelines/subscriptions/ff2e23ae-7d7c-4cbd-99b8-116bb94dca6e/resourceGroups/RG-ITSMLTeam-Dev/providers/Microsoft.MachineLearningServices/workspaces/avadevitsmlsvc/experiment/undefined/run/115f70d3-91f7-4052-bbb1-1aef0da3c2e3
[2019-04-01 23:06:54Z] Metrics for HyperDrive run:
[2019-04-01 23:06:54Z] {}
[2019-04-01 23:06:57Z] azureml-logs/hyperdrive.txt
[2019-04-01 23:06:58Z] "<START>[2019-04-01T22:59:45.973424][API][INFO]Experiment created<END>\n"
[2019-04-01 23:06:58Z] "<START>[2019-04-01T22:59:46.318831][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n"
[2019-04-01 23:06:58Z] "<START>[2019-04-01T22:59:46.413609][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2019-04-01T22:59:48.1588418Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2019-04-01T23:00:16.318398][GENERATOR][INFO]Max number of jobs '4' reached for experiment.<END>\n""<START>[2019-04-01T23:00:16.952672][GENERATOR][INFO]All jobs generated.<END>\n"<START>[2019-04-01T23:00:18.7742394Z][SCHEDULER][INFO]The execution environment was successfully prepared.<END><START>[2019-04-01T23:00:18.7744093Z][SCHEDULER][INFO]Scheduling job, id='https://westus2.experiments.azureml.net/subscriptions/ff2e23ae-7d7c-4cbd-99b8-116bb94dca6e/resourceGroups/RG-ITSMLTeam-Dev/providers/Microsoft.MachineLearningServices/workspaces/avadevitsmlsvc/experiments/attrition_pipe_anders/runs/attrition_pipe_anders_1554159585383_0'<END><START>[2019-04-01T23:00:18.7811517Z][SCHEDULER][INFO]Scheduling job, id='https://westus2.experiments.azureml.net/subscriptions/ff2e23ae-7d7c-4cbd-99b8-116bb94dca6e/resourceGroups/RG-ITSMLTeam-Dev/providers/Microsoft.MachineLearningServices/workspaces/avadevitsmlsvc/experiments/attrition_pipe_anders/runs/attrition_pipe_anders_1554159585383_2'<END><START>[2019-04-01T23:00:18.7824883Z][SCHEDULER][INFO]Scheduling job, id='https://westus2.experiments.azureml.net/subscriptions/ff2e23ae-7d7c-4cbd-99b8-116bb94dca6e/resourceGroups/RG-ITSMLTeam-Dev/providers/Microsoft.MachineLearningServices/workspaces/avadevitsmlsvc/experiments/attrition_pipe_anders/runs/attrition_pipe_anders_1554159585383_3'<END><START>[2019-04-01T23:00:18.7805792Z][SCHEDULER][INFO]Scheduling job, id='https://westus2.experiments.azureml.net/subscriptions/ff2e23ae-7d7c-4cbd-99b8-116bb94dca6e/resourceGroups/RG-ITSMLTeam-Dev/providers/Microsoft.MachineLearningServices/workspaces/avadevitsmlsvc/experiments/attrition_pipe_anders/runs/attrition_pipe_anders_1554159585383_1'<END><START>[2019-04-01T23:00:26.1672593Z][SCHEDULER][INFO]Successfully scheduled a job. Id='attrition_pipe_anders_1554159585383_1'<END><START>[2019-04-01T23:00:26.5002758Z][SCHEDULER][INFO]Successfully scheduled a job. Id='attrition_pipe_anders_1554159585383_0'<END><START>[2019-04-01T23:00:26.6051498Z][SCHEDULER][INFO]Successfully scheduled a job. Id='attrition_pipe_anders_1554159585383_2'<END><START>[2019-04-01T23:00:28.1865188Z][SCHEDULER][INFO]Successfully scheduled a job. Id='attrition_pipe_anders_1554159585383_3'<END>"<START>[2019-04-01T23:06:47.573411][CONTROLLER][INFO]Experiment was 'ExperimentStatus.RUNNING', is 'ExperimentStatus.FINISHED'.<END>\n"
#%% [markdown]
# # Deploy
#
# Use this notebook to deploy the latest model to a Docker container.
#
# ## Pre-Requisites
#
# This notebook assumes that a model has been generated and is stored in the current directory as <project_name>.pkl.
#%%
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>") )
#%% [markdown]
# ## Set up the development environment
#
# To run this notebook, first set up a Python virtual environment with the necessary packages and install the Azure ML SDK. See the following link for more information:
# - https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-environment#configure-jupyter-notebooks-on-your-own-computer
#
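#%%
# A minimal install sketch (an assumption, not part of the original gist); run this in
# the virtual environment before executing the notebook. Package extras vary by SDK version.
# !pip install azureml-sdk azureml-pipeline azureml-widgets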
#%%
get_ipython().run_line_magic('matplotlib', 'inline')
import os, sys
sys.path.append(os.getcwd())
import azureml
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core.compute import AmlCompute, ComputeTarget, DataFactoryCompute
from azureml.core.datastore import Datastore
from azureml.core.runconfig import CondaDependencies, RunConfiguration, DataReferenceConfiguration, DEFAULT_GPU_IMAGE
from azureml.core import Experiment, Workspace, Run
from azureml.data.data_reference import DataReference
from azureml.exceptions import ComputeTargetException
from azureml.pipeline.core import Pipeline, PublishedPipeline, PipelineData, OutputPortBinding
from azureml.pipeline.steps import PythonScriptStep, DataTransferStep, EstimatorStep, HyperDriveStep
from azureml.train.estimator import Estimator
from azureml.train.hyperdrive import HyperDriveRunConfig, PrimaryMetricGoal, uniform, quniform, choice, RandomParameterSampling, BayesianParameterSampling, MedianStoppingPolicy
from azureml.widgets import RunDetails
print("Azure ML SDK Version: ", azureml.core.VERSION)
#%%
project_name = 'attrition_pipe_anders'
compute_target_name = 'dev-anders'
data_factory_name = 'adf'
#%%
interactive_auth = InteractiveLoginAuthentication(tenant_id="cf36141c-ddd7-45a7-b073-111f66d0b30c")
ws = Workspace.from_config(auth=interactive_auth)
print("Found workspace {} at location {}".format(ws.name, ws.location))
# create/get experiment
exp = Experiment(workspace=ws, name=project_name)
# Default datastore (Azure file storage)
def_file_store = ws.get_default_datastore()
print("Default datastore's name: {}".format(def_file_store.name))
def_blob_store = Datastore(ws, "workspaceblobstore")
print("Blobstore's name: {}".format(def_blob_store.name))
#%%
# Get the compute target if it already exists; otherwise provision a new AmlCompute cluster
try:
    compute_target = ComputeTarget(workspace=ws, name=compute_target_name)
    print('Found existing cluster, using it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, compute_target_name, compute_config)
    compute_target.wait_for_completion(show_output=True)
#%%
def get_or_create_data_factory(workspace, factory_name):
    try:
        return DataFactoryCompute(workspace, factory_name)
    except ComputeTargetException as e:
        if 'ComputeTargetNotFound' in e.message:
            print('Data factory not found, creating...')
            provisioning_config = DataFactoryCompute.provisioning_configuration()
            data_factory = ComputeTarget.create(workspace, factory_name, provisioning_config)
            return data_factory
        else:
            raise e

data_factory_compute = get_or_create_data_factory(ws, data_factory_name)
print("Data Factory compute setup complete")
#%%
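# Load the serialized run configuration for the AML compute (typically ./compute/aml_compute.runconfig
# or ./compute/.azureml/aml_compute.runconfig)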
batchai_run_config = RunConfiguration().load(path='./compute/', name='aml_compute')
#%%
# data references
input_dir = DataReference(data_reference_name='input_data', datastore=def_blob_store, path_on_datastore='attrition_pipe')
output_dir = DataReference(data_reference_name='output_data', datastore=def_blob_store, path_on_datastore='attrition_pipe/output')
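# PipelineData objects declare intermediate outputs that AML passes between steps; the data lands on the given datastore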
munged_hc = PipelineData('munged_hc', datastore=def_blob_store)
munged_leaver = PipelineData('munged_leaver', datastore=def_blob_store)
munged_roster = PipelineData('munged_roster', datastore=def_blob_store)
munged_absence = PipelineData('munged_absence', datastore=def_blob_store)
munged_productivity = PipelineData('munged_productivity', datastore=def_blob_store)
munged_promo = PipelineData('munged_promo', datastore=def_blob_store)
munged_time = PipelineData('munged_time', datastore=def_blob_store)
munged_travel = PipelineData('munged_travel', datastore=def_blob_store)
batch_scoring_data = PipelineData('batch_scoring', datastore=def_blob_store)
hyperdrive_metrics = PipelineData('hyperdrive_metrics', datastore=def_blob_store)
processed_data1 = PipelineData('processed_data1', datastore=def_blob_store)
processed_data2 = PipelineData('processed_data2', datastore=def_blob_store)
split_data = PipelineData('split_data', datastore=def_blob_store)
tests_output = PipelineData('tests_output', datastore=def_blob_store)
#%%
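# Each munge step reads the raw extracts from input_dir and writes a cleaned dataset to its own
# PipelineData output; the later munge steps also consume the munged headcount data.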
munge_hc_step = PythonScriptStep(
    name='munge headcount',
    script_name='munge_headcount.py',
    arguments=['--input_dir', input_dir,
               '--output_dir', munged_hc],
    compute_target=compute_target,
    inputs=[input_dir],
    outputs=[munged_hc],
    runconfig=batchai_run_config,
    source_directory=os.path.join(os.getcwd(), 'compute'),
    allow_reuse=True
)
munge_leaver_step = PythonScriptStep(
    name='munge leaver',
    script_name='munge_leaver.py',
    arguments=['--input_dir', input_dir,
               '--output_dir', munged_leaver],
    compute_target=compute_target,
    inputs=[input_dir],
    outputs=[munged_leaver],
    runconfig=batchai_run_config,
    source_directory=os.path.join(os.getcwd(), 'compute'),
    allow_reuse=True
)
munge_roster_step = PythonScriptStep(
    name='munge roster',
    script_name='munge_roster.py',
    arguments=['--input_dir', input_dir,
               '--output_dir', munged_roster],
    compute_target=compute_target,
    inputs=[input_dir],
    outputs=[munged_roster],
    runconfig=batchai_run_config,
    source_directory=os.path.join(os.getcwd(), 'compute'),
    allow_reuse=True
)
munge_absence_step = PythonScriptStep(
    name='munge absence',
    script_name='munge_absence.py',
    arguments=['--input_dir', input_dir,
               '--munged_hc_dir', munged_hc,
               '--output_dir', munged_absence],
    compute_target=compute_target,
    inputs=[input_dir, munged_hc],
    outputs=[munged_absence],
    runconfig=batchai_run_config,
    source_directory=os.path.join(os.getcwd(), 'compute'),
    allow_reuse=True
)
munge_productivity_step = PythonScriptStep(
    name='munge productivity',
    script_name='munge_productivity.py',
    arguments=['--input_dir', input_dir,
               '--munged_hc_dir', munged_hc,
               '--output_dir', munged_productivity],
    compute_target=compute_target,
    inputs=[input_dir, munged_hc],
    outputs=[munged_productivity],
    runconfig=batchai_run_config,
    source_directory=os.path.join(os.getcwd(), 'compute'),
    allow_reuse=True
)
munge_promo_step = PythonScriptStep(
    name='munge promo',
    script_name='munge_promo.py',
    arguments=['--input_dir', input_dir,
               '--munged_hc_dir', munged_hc,
               '--output_dir', munged_promo],
    compute_target=compute_target,
    inputs=[input_dir, munged_hc],
    outputs=[munged_promo],
    runconfig=batchai_run_config,
    source_directory=os.path.join(os.getcwd(), 'compute'),
    allow_reuse=True
)
munge_time_step = PythonScriptStep(
    name='munge time',
    script_name='munge_time.py',
    arguments=['--input_dir', input_dir,
               '--munged_hc_dir', munged_hc,
               '--output_dir', munged_time],
    compute_target=compute_target,
    inputs=[input_dir, munged_hc],
    outputs=[munged_time],
    runconfig=batchai_run_config,
    source_directory=os.path.join(os.getcwd(), 'compute'),
    allow_reuse=True
)
munge_travel_step = PythonScriptStep(
    name='munge travel',
    script_name='munge_travel.py',
    arguments=['--input_dir', input_dir,
               '--munged_hc_dir', munged_hc,
               '--output_dir', munged_travel],
    compute_target=compute_target,
    inputs=[input_dir, munged_hc],
    outputs=[munged_travel],
    runconfig=batchai_run_config,
    source_directory=os.path.join(os.getcwd(), 'compute'),
    allow_reuse=True
)
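# Join all of the munged outputs into a single processed dataset (processed_data2)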
join_step = PythonScriptStep(
    name='join',
    script_name='join.py',
    arguments=['--munged_hc_dir', munged_hc,
               '--munged_roster_dir', munged_roster,
               '--munged_absence_dir', munged_absence,
               '--munged_leaver_dir', munged_leaver,
               '--munged_productivity_dir', munged_productivity,
               '--munged_promo_dir', munged_promo,
               '--munged_time_dir', munged_time,
               '--munged_travel_dir', munged_travel,
               '--output_dir', processed_data2],
    compute_target=compute_target,
    inputs=[
        munged_hc,
        munged_roster,
        munged_absence,
        munged_leaver,
        munged_productivity,
        munged_promo,
        munged_time,
        munged_travel
    ],
    outputs=[processed_data2],
    runconfig=batchai_run_config,
    source_directory=os.path.join(os.getcwd(), 'compute'),
    allow_reuse=True
)
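# Split the joined dataset into the data consumed by training (the HyperDrive step) and batch scoring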
split_step = PythonScriptStep(
    name='Split data',
    script_name='get_data.py',
    arguments=['--input_dir', processed_data2,
               '--output_dir', split_data],
    compute_target=compute_target,
    inputs=[processed_data2],
    outputs=[split_data],
    runconfig=batchai_run_config,
    source_directory=os.path.join(os.getcwd(), 'compute'),
    allow_reuse=True
)
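# Batch-score the split data against the model named 'attrition3_pipe_test' (assumed to be registered in the workspace)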
batch_scoring_step = PythonScriptStep(
    name='Batch scoring',
    script_name='batch_scoring.py',
    arguments=[
        '--context', 'remote',
        '--model_name', 'attrition3_pipe_test',
        '--dataset_path', split_data,
        '--output_dir', batch_scoring_data],
    compute_target=compute_target,
    inputs=[split_data],
    outputs=[batch_scoring_data],
    runconfig=batchai_run_config,
    source_directory=os.path.join(os.getcwd(), 'compute'),
    allow_reuse=True
)
#%%
# hyperdrive config
est_config_aml = Estimator(
    source_directory="./compute",
    entry_script="train.py",
    compute_target=compute_target,
    environment_definition=batchai_run_config.environment
)
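# RandomParameterSampling draws hyperparameter combinations at random from the distributions below;
# quniform(low, high, q) yields values from the range rounded to multiples of q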
random_sampling = RandomParameterSampling({
    'boosting_type': choice('gbdt', 'dart'),
    'learning_rate': quniform(0.05, 0.1, 0.01),
    'num_leaves': quniform(4, 20, 1),
    'max_bin': quniform(50, 300, 5),
    'min_child_samples': quniform(20, 200, 5)
})
hyperdrive_run_config = HyperDriveRunConfig(
    estimator=est_config_aml,  # AML
    hyperparameter_sampling=random_sampling,
    primary_metric_name="geometric mean",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=4,
    max_concurrent_runs=4)
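# Note: primary_metric_name must exactly match a metric that train.py logs with run.log();
# if the names differ, HyperDrive has nothing to aggregate and the metric output can come back
# empty (like the '{}' shown in the log above)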
hyperdrive_step = HyperDriveStep(
    name='Hyperdrive',
    hyperdrive_run_config=hyperdrive_run_config,
    estimator_entry_script_arguments=['--input_dir', split_data, '--output_dir', hyperdrive_metrics],
    inputs=[split_data],
    metrics_output=hyperdrive_metrics,
    allow_reuse=True
)
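# metrics_output materialises the child-run metrics into the hyperdrive_metrics PipelineData once the
# step completes; transfer_metrics_step below copies that output back to blob storage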
#%%
# save out intermediary files back to blob
transfer_gold_step = DataTransferStep(
    name="transfer_gold",
    source_data_reference=processed_data2,
    destination_data_reference=output_dir,
    source_reference_type='directory',
    destination_reference_type='directory',
    compute_target=data_factory_compute,
    allow_reuse=True
)
transfer_output_step = DataTransferStep(
    name="transfer_output",
    source_data_reference=batch_scoring_data,
    destination_data_reference=output_dir,
    source_reference_type='directory',
    destination_reference_type='directory',
    compute_target=data_factory_compute,
    allow_reuse=True
)
transfer_metrics_step = DataTransferStep(
    name="transfer_metrics",
    source_data_reference=hyperdrive_metrics,
    destination_data_reference=output_dir,
    source_reference_type='directory',
    destination_reference_type='directory',
    compute_target=data_factory_compute,
    allow_reuse=True
)
#%%
pipeline = Pipeline(workspace=ws,
                    steps=[
                        # transfer_gold_step,
                        # transfer_output_step,
                        transfer_metrics_step
                    ])
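# Only transfer_metrics_step is listed explicitly; the upstream hyperdrive, split, join and munge
# steps are pulled into the graph automatically through their PipelineData dependencies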
#%%
pipeline_run = Experiment(ws, project_name).submit(pipeline, pipeline_params={})
#%%
RunDetails(pipeline_run).show()
#%%
# pipeline_run.cancel()
#%%
pipeline_run
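#%%
# A minimal sketch (not part of the original gist) for pulling the HyperDrive metrics once the
# pipeline run has finished; the step name and output name below match the definitions above,
# everything else is an assumption.
pipeline_run.wait_for_completion()                                 # block until the pipeline finishes
hd_step_run = pipeline_run.find_step_run('Hyperdrive')[0]          # StepRun for the HyperDriveStep
metrics_port = hd_step_run.get_output_data('hyperdrive_metrics')   # PortDataReference for metrics_output
metrics_port.download(local_path='.', overwrite=True)              # copy the metrics file(s) locally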