Shashank Prasanna (shashankprasanna), GitHub gists
from sagemaker.debugger import Rule, rule_configs

debug_rules = [Rule.sagemaker(rule_configs.overtraining()),
               Rule.sagemaker(rule_configs.overfit()),
               Rule.custom(name='MyCustomRule',
                           image_uri='840043622174.dkr.ecr.us-east-2.amazonaws.com/sagemaker-debugger-rule-evaluator:latest',
                           instance_type='ml.t3.medium',
                           source='rules/my_custom_rule.py',
                           rule_to_invoke='CustomGradientRule',
                           volume_size_in_gb=30,
                           rule_parameters={"threshold": "20.0"})]
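The custom rule above points at rules/my_custom_rule.py. A minimal sketch of what that file's CustomGradientRule could look like, following the smdebug custom-rule interface; the threshold logic here is an assumption for illustration, not the author's actual rule:

# rules/my_custom_rule.py (sketch)
from smdebug.rules.rule import Rule

class CustomGradientRule(Rule):
    def __init__(self, base_trial, threshold=20.0):
        super().__init__(base_trial)
        self.threshold = float(threshold)  # rule_parameters arrive as strings

    def invoke_at_step(self, step):
        # Fire when any gradient's mean absolute value exceeds the threshold
        for tname in self.base_trial.tensor_names(collection="gradients"):
            abs_mean = self.base_trial.tensor(tname).reduction_value(step, "mean", abs=True)
            if abs_mean > self.threshold:
                return True
        return False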
for epoch in range(10):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        # Use hook to record tensors
        hook.record_tensor_value(tensor_name="loss", tensor_value=loss)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
import smdebug.pytorch as smd

net = get_network()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

job_name = 'pytorch-debug-job'
hook = smd.Hook(out_dir=f'./smd_outputs/{job_name}',
                save_config=smd.SaveConfig(save_interval=10),
                include_collections=['gradients', 'biases'])
# Register the model so smdebug can capture the gradient and bias collections
hook.register_module(net)
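To inspect what the hook saved, smdebug's trial API can read the output directory back. A minimal sketch, assuming the out_dir configured above:

from smdebug.trials import create_trial

trial = create_trial(f'./smd_outputs/{job_name}')
print(trial.tensor_names())  # everything the hook recorded
for step in trial.steps():
    print(step, trial.tensor('loss').value(step))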
import smdebug.tensorflow as smd

job_name = 'tf-debug-job'
hook = smd.KerasHook(out_dir=f'./smd_outputs/{job_name}',
                     tensorboard_dir=f'./tb_logs/{job_name}',
                     save_config=smd.SaveConfig(save_interval=1),
                     include_collections=['gradients', 'biases'])

# Wrap the optimizer so smdebug can capture gradient tensors
opt = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, name='SGD')
opt = hook.wrap_optimizer(opt)
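The KerasHook is itself a Keras callback, so it also gets passed to fit. A minimal sketch, assuming a model, x_train, and y_train defined elsewhere:

model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=10, callbacks=[hook])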
tf_estimator = TensorFlow(entry_point='tf-training-script.py',
                          ...,
                          debugger_hook_config=debugger_hook_config)
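The rules list defined earlier attaches to the same estimator through the rules parameter. A sketch, with the same arguments elided:

tf_estimator = TensorFlow(entry_point='tf-training-script.py',
                          ...,
                          rules=debug_rules,
                          debugger_hook_config=debugger_hook_config)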
from sagemaker.debugger import Rule, DebuggerHookConfig, CollectionConfig

debugger_hook_config = DebuggerHookConfig(
    hook_parameters={"save_interval": "100"},
    collection_configs=[CollectionConfig("losses"),
                        CollectionConfig("weights"),
                        CollectionConfig("gradients"),
                        CollectionConfig("biases")])
train_path = f's3://{bucket_name}/{dataset_folder}/cifar10-dataset/train'
val_path = f's3://{bucket_name}/{dataset_folder}/cifar10-dataset/validation'
eval_path = f's3://{bucket_name}/{dataset_folder}/cifar10-dataset/eval'

hvd_estimator.fit({'train': train_path, 'validation': val_path, 'eval': eval_path},
                  job_name=job_name, wait=False)
from sagemaker.tensorflow import TensorFlow

hvd_estimator = TensorFlow(entry_point='cifar10-tf-horovod-sagemaker.py',
                           source_dir='code',
                           output_path=output_path + '/',
                           code_location=output_path,
                           role=role,
                           train_instance_count=hvd_instance_count,
                           train_instance_type=hvd_instance_type,
                           train_volume_size=50,
                           framework_version='1.15',
                           # MPI must be enabled for Horovod; processes_per_host
                           # should match the GPUs per instance (assumed variable)
                           distributions={'mpi': {'enabled': True,
                                                  'processes_per_host': hvd_processes_per_host}})
# Change 8: Update the training script to accept hyperparameters as command-line arguments
import argparse

parser = argparse.ArgumentParser()
# Hyperparameters
parser.add_argument('--epochs', type=int, default=15)
parser.add_argument('--learning-rate', type=float, default=0.001)
parser.add_argument('--batch-size', type=int, default=256)
parser.add_argument('--weight-decay', type=float, default=2e-4)
parser.add_argument('--momentum', type=float, default=0.9)
parser.add_argument('--optimizer', type=str, default='adam')
# parse_known_args tolerates any extra arguments SageMaker injects
args, _ = parser.parse_known_args()
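On the notebook side, the estimator's hyperparameters dict is forwarded to the script as exactly these command-line flags. A sketch with assumed values, other arguments elided as above:

hvd_estimator = TensorFlow(entry_point='cifar10-tf-horovod-sagemaker.py',
                           ...,
                           hyperparameters={'epochs': 30,
                                            'learning-rate': 0.001,
                                            'batch-size': 256,
                                            'weight-decay': 2e-4,
                                            'momentum': 0.9,
                                            'optimizer': 'adam'})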
history = model.fit(train_dataset,
                    steps_per_epoch=(NUM_TRAIN_IMAGES // batch_size) // size,
                    validation_data=val_dataset,
                    validation_steps=(NUM_VALID_IMAGES // batch_size) // size,
                    # Only rank 0 prints progress so workers don't interleave logs
                    verbose=1 if hvd.rank() == 0 else 0,
                    epochs=epochs, callbacks=callbacks)
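The size, hvd.rank(), and callbacks used above come from standard Horovod boilerplate. A minimal sketch of that setup, assuming the horovod.tensorflow.keras binding and an SGD optimizer:

import horovod.tensorflow.keras as hvd

hvd.init()
size = hvd.size()  # total number of workers across all instances

# Scale the learning rate by the number of workers, then wrap the optimizer
opt = tf.keras.optimizers.SGD(learning_rate=0.01 * size, momentum=0.9)
opt = hvd.DistributedOptimizer(opt)

callbacks = [
    # Sync initial weights from rank 0 to all workers
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    # Average metrics across workers at the end of each epoch
    hvd.callbacks.MetricAverageCallback(),
]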