🛠️

Shashank shashankprasanna

🛠️

Talking Engineer. Machine Learner.

shashankprasanna / smdebug_rule_invoke.py

Created April 2, 2020 21:19

	from smdebug.rules import invoke_rule
	from smdebug.trials import create_trial

	# Path handed to create_trial; <JOB_NAME> is a placeholder that has to be
	# replaced with the real job folder name before running.
	trial = create_trial(path='./smd_outputs/<JOB_NAME>')
	# CustomVanishingGradientRule is not defined in this snippet; it has to be
	# defined or imported elsewhere.
	rule_obj = CustomVanishingGradientRule(trial, threshold=0.0001)
	# NOTE(review): end_step=None presumably means "no upper step limit" —
	# confirm against the smdebug documentation.
	invoke_rule(rule_obj, start_step=0, end_step=None)

shashankprasanna / keras-checkpoint.py

Created April 24, 2020 22:00

	from tensorflow.keras.callbacks import ModelCheckpoint
	# Directory used as the prefix of every checkpoint file path.
	checkpoint_path = "/opt/ml/checkpoints"
	# File-name template. The {epoch:03d} placeholder stays literal here;
	# presumably Keras fills it in per epoch — confirm against the
	# ModelCheckpoint documentation.
	checkpoint_names = ''.join(['cifar10-', model_type, '.{epoch:03d}.h5'])
	checkpoint_callback = ModelCheckpoint(
	    filepath=f'{checkpoint_path}/{checkpoint_names}',
	    monitor='val_loss',
	    save_weights_only=False,
	)

	# NOTE(review): this call is cut off in the visible snippet — the literal
	# "..." and the missing closing parenthesis are part of the original text.
	# The visible keywords pass the epoch count and set initial_epoch to
	# epoch_number.
	model.fit(train_dataset, ...
	epochs=epochs,
	initial_epoch=epoch_number,

shashankprasanna / load_checkpoint.py

Created April 24, 2020 22:04

	import os
	import re

	from tensorflow.keras.models import load_model
	def load_checkpoint_model(checkpoint_path):
	    """Find the ``.h5`` file in ``checkpoint_path`` with the largest epoch tag.

	    NOTE(review): only the first part of this function is visible in the
	    snippet. The statements that consume ``max_epoch_number`` and
	    ``max_epoch_filename`` (and any return value) are not shown, so every
	    local name is kept exactly as it was.

	    Args:
	        checkpoint_path: Directory that is listed for files ending in ``.h5``.
	    """
	    files = [f for f in os.listdir(checkpoint_path) if f.endswith('.' + 'h5')]
	    # Raw string so the escaped dots reach the regex engine without relying
	    # on Python passing unknown escapes through. The match is the text after
	    # a dot, ending in a digit, and followed by another dot — for example
	    # "005" in "name.005.h5".
	    epoch_numbers = [re.search(r'(?<=\.)(.*[0-9])(?=\.)', f).group() for f in files]

	    # The epoch tags are compared as strings, so the ordering is only
	    # numeric while they are zero-padded to the same width.
	    max_epoch_number = max(epoch_numbers)
	    max_epoch_index = epoch_numbers.index(max_epoch_number)
	    max_epoch_filename = files[max_epoch_index]

shashankprasanna / sagemaker-spot.py

Created April 24, 2020 22:09

	from sagemaker.tensorflow import TensorFlow
	# Fixed job name; where it is consumed is not visible in this snippet.
	job_name = 'tensorflow-spot-job'
	# Output location: the "jobs" prefix inside the session's default bucket.
	bucket_name = sagemaker_session.default_bucket()
	output_path = f's3://{bucket_name}/jobs'

	# Estimator for the cifar10 training script: one ml.p3.2xlarge instance,
	# framework version 1.15.
	# NOTE(review): the argument list is cut off in the visible snippet; the
	# remaining arguments and the closing parenthesis are not shown.
	tf_estimator = TensorFlow(entry_point = 'cifar10-training-sagemaker.py',
	role = role,
	train_instance_count = 1,
	train_instance_type = 'ml.p3.2xlarge',
	framework_version = '1.15',

shashankprasanna / sagemaker-spot-simulate.py

Created April 24, 2020 22:23

	# NOTE(review): the two "..." lines appear to stand in for arguments left
	# out of this snippet, so the call is illustrative rather than runnable
	# as written.
	# The new estimator reuses the checkpoint_s3_uri of the existing
	# tf_estimator and sets train_use_spot_instances to True.
	tf_estimator = TensorFlow(entry_point = 'cifar10-training-sagemaker.py',
	...
	...
	checkpoint_s3_uri = tf_estimator.checkpoint_s3_uri,
	train_use_spot_instances = True,
	# presumably a wait limit in seconds — confirm against the SageMaker SDK docs
	train_max_wait = 7200)

shashankprasanna / sm-experiments-1.py

Created July 13, 2020 22:57

	from smexperiments.experiment import Experiment
	# SageMaker client that is handed to the experiment API below.
	sm = boto3.Session().client('sagemaker')

	# Create the experiment; the description records the hypothesis being tested.
	training_experiment = Experiment.create(
	    # A plain string literal is enough: the name contains no placeholders.
	    experiment_name="cifar10-training-experiment",
	    description="Hypothesis: If I use my custom image classification model, it will deliver better accuracy compared to a ResNet50 model on the CIFAR10 dataset",
	    sagemaker_boto_client=sm)

shashankprasanna / sm-experiments-2.py

Created July 13, 2020 23:38

	hyperparam_options = {'optimizer': ['adam', 'sgd', 'rmsprop'],
	'model': ['resnet', 'custom'],
	'epochs': [30, 60, 120]}

	hypnames, hypvalues = zip(*hyperparam_options.items())
	trial_hyperparameter_set = [dict(zip(hypnames, h)) for h in itertools.product(*hypvalues)]
	trial_hyperparameter_set

shashankprasanna / sm-experiments-3.py

Created July 13, 2020 23:38

	# Hyperparameter values that stay fixed; the keys use hyphenated names.
	static_hyperparams = {
	    'batch-size': 128,
	    'learning-rate': 0.001,
	    'weight-decay': 1e-6,
	    'momentum': 0.9,
	}

shashankprasanna / sm-experiments-4.py

Created July 13, 2020 23:39

	# Log experiment-level metadata through a tracker used as a context manager.
	with Tracker.create(
	    display_name="experiment-metadata",
	    artifact_bucket=bucket_name,
	    artifact_prefix=training_experiment.experiment_name,
	    sagemaker_boto_client=sm,
	) as exp_tracker:
	    # The logging calls are indented so that they form the with-body.
	    exp_tracker.log_input(name="cifar10-dataset", media_type="s3/uri", value=datasets)
	    # Both the fixed values and the per-trial option lists are logged.
	    exp_tracker.log_parameters(static_hyperparams)
	    exp_tracker.log_parameters(hyperparam_options)
	    exp_tracker.log_artifact(file_path='generate_cifar10_tfrecords.py')

shashankprasanna / sm-experiments-5.py

Created July 13, 2020 23:40

	for trial_hyp in trial_hyperparameter_set:
	    # Combine static hyperparameters and trial specific hyperparameters.
	    # Dict unpacking merges the two mappings into one dict; if a key
	    # appears in both, the trial-specific value wins.
	    hyperparams = {**static_hyperparams, **trial_hyp}

	    # Create unique job name with hyperparameter and time
	    time_append = int(time.time())
	    hyp_append = "-".join([str(elm) for elm in trial_hyp.values()])
	    job_name = f'cifar10-training-{hyp_append}-{time_append}'

	    # Create a Tracker to track Trial specific hyperparameters
	    # NOTE(review): the rest of the loop body is not visible in this snippet.