Skip to content

Instantly share code, notes, and snippets.

@brusMX
Last active May 17, 2018 02:11
Show Gist options
  • Save brusMX/5bebf3f82fea36e8c167dad7daaa5f1f to your computer and use it in GitHub Desktop.
Save brusMX/5bebf3f82fea36e8c167dad7daaa5f1f to your computer and use it in GitHub Desktop.
.vscode
.env
*.env
*.env.BACKUP
*.env.BACKUP*
job.json
job-prep.sh
*.zip
#!/bin/bash
# Interactively create a Batch AI cluster
# Author: Bruno Medina (@brusmx)
# Requirements:
# - Azure Cli >= 2.0.26
# - cut
# - SSH client
# Example of usage:
# chmod +x provision-batch-AI-cluster.sh
# ./provision-batch-AI-cluster.sh
# Abort the script as soon as an unguarded command returns a non-zero status.
set -e
# Version compared against the installed Azure CLI in check_cli.
readonly MIN_CLI_VERSION=2.0.26
# File where the cluster settings are written (save_cluster_config) and
# sourced back on later runs.
readonly CONFIG_FILE_NAME="cluster-conf.env"
# Location of the SSH key pair used for the cluster nodes.
readonly SSH_PUB_LOCATION=~/.ssh/id_rsa.pub
readonly SSH_PRIV_LOCATION=~/.ssh/id_rsa
# File where the results of the requirement checks and the selected
# subscription are written (write_status) and sourced back on later runs.
readonly CHECKS_FILE="checks.env"
# Output terminal colors (ANSI escape sequences, interpreted by `echo -e`)
readonly RED='\033[0;31m'
readonly BLUE='\033[0;34m'
readonly L_BLUE='\033[1;34m'
readonly YELLOW='\033[0;33m'
readonly GREEN='\033[0;32m'
readonly PURPLE='\033[0;35m'
readonly CYAN='\033[0;36m'
readonly GRAY='\033[1;37m'
# NC resets the terminal attributes ("no color").
readonly NC='\033[0m'
# =========================================================================
# =================== HELPER FUNCTIONS ====================================
# =========================================================================
# Succeeds (status 0) when the first argument is NOT the lowest entry of the
# version-sorted argument list, i.e. `version_gt A B` is true when A > B.
# Equal versions yield status 1.
function version_gt() {
  local lowest
  lowest=$(printf '%s\n' "$@" | sort -V | head -n 1)
  [ "$lowest" != "$1" ]
}
# Persist the requirement-check flags and the selected subscription to
# ${CHECKS_FILE} as `export NAME=value` lines that can be sourced back.
# Globals read: CHECKS_FILE, SSH_OK, CLI_OK, AZURE_SUBSCRIPTION_ID,
#               AZURE_SUBSCRIPTION_NAME, PROVIDER_REGISTERED
# Values are serialized with printf %q so that whitespace, quotes, `$` and
# backticks in a value (e.g. a subscription name) survive the round trip and
# can never be executed when the file is sourced.
function write_status() {
  local name
  for name in SSH_OK CLI_OK AZURE_SUBSCRIPTION_ID AZURE_SUBSCRIPTION_NAME PROVIDER_REGISTERED; do
    printf 'export %s=%q\n' "$name" "${!name}"
  done > "${CHECKS_FILE}"
}
# Blank out (and export as empty) every session flag that write_status
# persists, so that stale values from the calling environment are ignored.
function clean_session() {
  local var_name
  for var_name in SSH_OK CLI_OK AZURE_SUBSCRIPTION_ID AZURE_SUBSCRIPTION_NAME PROVIDER_REGISTERED; do
    export "${var_name}="
  done
}
# Print the ASCII-art banner, wrapped between the L_BLUE color code and the
# NC reset code.
function welcome() {
  # %b expands backslash escapes in the argument the same way `echo -e` does.
  printf '%b\n' "${L_BLUE}"
  # Quoted delimiter: the banner is emitted literally (no expansion of the
  # backticks and backslashes that are part of the drawing).
  cat << 'BANNER'
################################################################
# ____ _ _ _ ___ #
# | __ ) __ _| |_ ___| |__ / \ |_ _| #
# | _ \ / _` | __/ __| '_ \ / _ \ | | #
# | |_) | (_| | || (__| | | | / ___ \ | | #
# |____/ \__,_|\__\___|_| |_|_/_/ \_\___| _ #
# | ____|__ _ ___ _ _ / ___| |_ _ ___| |_ ___ _ __ #
# | _| / _` / __| | | | | | | | | | / __| __/ _ \ '__| #
# | |__| (_| \__ \ |_| | | |___| | |_| \__ \ || __/ | #
# |_____\__,_|___/\__, | \____|_|\__,_|___/\__\___|_| #
# |___/ #
# #
################################################################
Interactive bash script to provision an Azure Batch AI cluster
Author: Bruno Medina (@brusmx)
BANNER
  printf '%b\n' "${NC}"
}
# =========================================================================
# =================== REQUIREMENT CHECKS ==================================
# =========================================================================
# Run the SSH and Azure CLI requirement checks that have not passed yet.
# A check is skipped when its flag (SSH_OK / CLI_OK) is already non-empty;
# after a check returns, its flag is exported as 1 and the state is
# persisted through write_status.
function run_general_checks() {
  local flag checker
  for flag in SSH_OK CLI_OK; do
    if [ -n "${!flag}" ]; then
      continue
    fi
    case "$flag" in
      SSH_OK) checker=check_ssh ;;
      CLI_OK) checker=check_cli ;;
    esac
    "$checker"
    export "${flag}=1"
    write_status
  done
}
# Make sure an Azure subscription is selected and that the resource
# providers are registered for it.
#  - No subscription remembered: reset PROVIDER_REGISTERED and ask for one.
#  - Subscription remembered: show it; [Enter] keeps it, any other key
#    re-runs choose_subscription. When the subscription actually changes,
#    PROVIDER_REGISTERED is reset and an existing config file is moved to
#    '<CLUSTER_NAME>-<CONFIG_FILE_NAME>'.
#  - Finally, run check_providers when PROVIDER_REGISTERED is empty.
# Globals written: AZURE_SUBSCRIPTION_* (via choose_subscription),
#                  PROVIDER_REGISTERED
function run_sub_checks() {
  local prev_sub_id="" prev_sub_name="" backup_name
  if [[ -n "$AZURE_SUBSCRIPTION_ID" ]]; then
    # A subscription was already selected in a previous step/run
    prev_sub_id=$AZURE_SUBSCRIPTION_ID
    prev_sub_name=$AZURE_SUBSCRIPTION_NAME
  else
    export PROVIDER_REGISTERED=''
    write_status
    choose_subscription
  fi
  if [[ -n "$prev_sub_id" ]]; then
    echo -e "${YELLOW}- Current Azure subscription: \"${prev_sub_name}\" - (${prev_sub_id})${NC}"
    echo -e " Press [${GREEN}Enter${NC}] to proceed with this subscription"
    echo -e " ${GRAY} Or press 'n' on your keyboard to change it${NC}"
    read -n 1 -r
    # `read -n 1` leaves REPLY empty when Enter is pressed. Quote the value
    # so keys such as '*' cannot glob-expand inside the test.
    if [[ -n "${REPLY//[[:space:]]/}" ]]; then
      choose_subscription
      if [[ "$prev_sub_id" != "$AZURE_SUBSCRIPTION_ID" ]]; then
        export PROVIDER_REGISTERED=''
        write_status
        # Only archive a config file that really exists: an unconditional mv
        # fails (and kills the script under `set -e`) on a first run.
        if [[ -f "$CONFIG_FILE_NAME" ]]; then
          backup_name="${CLUSTER_NAME}-${CONFIG_FILE_NAME}"
          echo -e "${BLUE} Saving current file as '${backup_name}' and starting over${NC}"
          # `--` protects against a target name that starts with '-'
          mv -- "$CONFIG_FILE_NAME" "$backup_name"
        fi
      fi
    fi
  fi
  if [[ -z "$PROVIDER_REGISTERED" ]]; then
    # Providers have not been verified for the current subscription
    check_providers
    export PROVIDER_REGISTERED=1
    write_status
  fi
}
# Verify that an SSH client is available and that a public key can be read
# from ${SSH_PUB_LOCATION}; when the key is missing, generate an RSA key pair
# at ${SSH_PRIV_LOCATION}.
# Globals read:    SSH_PUB_LOCATION, SSH_PRIV_LOCATION, USER, HOSTNAME
# Globals written: SSH_PUB_KEY (exported, content of the public key)
# Exits with status 1 when ssh is missing or no key could be created.
function check_ssh() {
  echo -e "${YELLOW}- Checking your SSH installation ...${NC}"
  if ! command -v ssh > /dev/null 2>&1; then
    echo -e "${RED}SSH was not found. You need to install SSH and create an SSH Key without passphrase.${NC}"
    exit 1
  fi
  local pub_key=""
  if [[ -r "${SSH_PUB_LOCATION}" ]]; then
    pub_key=$(cat "${SSH_PUB_LOCATION}")
  fi
  if [[ -z "$pub_key" ]]; then
    echo -e "${YELLOW}SSH public key was not found. Creating one ...${NC}"
    # -N "" creates the key WITHOUT a passphrase (the script later uses the
    # key non-interactively); "user@host" is only the key comment (-C).
    # A failure is tolerated here so that the explicit error below is shown.
    ssh-keygen -b 4096 -t rsa -q -N "" -C "${USER}@${HOSTNAME}" -f "${SSH_PRIV_LOCATION}" || true
    if [[ -r "${SSH_PUB_LOCATION}" ]]; then
      pub_key=$(cat "${SSH_PUB_LOCATION}")
    fi
    if [[ -z "$pub_key" ]]; then
      echo -e "${RED}ERROR. SSH key couldn't be created. Please make sure you have an SSH key created on ${SSH_PUB_LOCATION}${NC}"
      exit 1
    fi
  fi
  export SSH_PUB_KEY="$pub_key"
  echo -e " ${GREEN}Cool :) SSH is installed and your public key ${SSH_PUB_LOCATION} is accessible.${NC}"
}
# Verify that the Azure CLI is installed and at least ${MIN_CLI_VERSION}.
# The version is taken from the first line of `az --version`; both the
# "azure-cli (2.0.26)" layout and the "azure-cli    2.50.0" layout (no
# parentheses) are understood.
# Exits with status 1 when the version cannot be read or is too old.
function check_cli() {
  echo -e "${YELLOW}- Making sure you have a new-ish Azure CLI version...${NC}"
  local first_line az_cli_version=""
  first_line=$(az --version | head -n 1) || true
  # First dotted number on the line, e.g. 2.0.26
  if [[ "$first_line" =~ [0-9]+(\.[0-9]+)+ ]]; then
    az_cli_version=${BASH_REMATCH[0]}
  fi
  if [[ -z "$az_cli_version" ]]; then
    echo -e "${RED}Azure CLI version could not be retrieved. Is it installed correctly?"
    echo -e "Make sure you have latest version of the Azure CLI:"
    echo -e "${BLUE}https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest${NC}"
    exit 1
  fi
  # Check Azure CLI version is new-ish
  if version_gt "$MIN_CLI_VERSION" "$az_cli_version"; then
    echo -e "${RED}Your version of the Azure CLI is outdated, please upade it:"
    echo -e "${BLUE}https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest${NC}"
    exit 1
  fi
  echo -e " ${GREEN}Cool :) You have Azure CLI version: ${az_cli_version}${NC}"
}
# Run register_provider for the Batch and the Batch AI resource providers,
# in that order.
function check_providers() {
  local provider
  for provider in "Microsoft.Batch" "Microsoft.BatchAI"; do
    register_provider "$provider"
  done
}
# Check the registration state of a resource provider in the current
# subscription and request its registration when it is not "Registered".
# Arguments: $1 - provider namespace, e.g. Microsoft.Batch
function register_provider() {
  local provider=$1 state
  echo -e "${YELLOW}- Checking if provider ${provider} is registered ...${NC}"
  # Ask the CLI for the one property we need instead of relying on the
  # column order of the full TSV output.
  state=$(az provider show -n "$provider" --query registrationState -o tsv) || state=""
  if [[ "$state" == "Registered" ]]; then
    echo -e " ${GREEN}${provider} is '${state}'${NC}"
  else
    echo -e "${BLUE}${provider} is '${state}'... registering${NC}"
    az provider register -n "$provider"
  fi
}
# List the subscriptions of the logged-in account, let the user pick one
# (Enter keeps the CLI's current subscription) and activate it with
# `az account set`.
# Globals written: AZURE_SUBSCRIPTION_ID, AZURE_SUBSCRIPTION_NAME (exported),
#                  DEFAULT_ACCOUNT, DEFAULT_ACCOUNT_ID, DEFAULT_ACCOUNT_NAME,
#                  ACCOUNT_LIST, ACCOUNT_LIST_ID, ACCOUNT_LIST_NAMES,
#                  ACCOUNT_LIST_SIZE
# Exits with status 1 when not logged in or on an invalid selection.
function choose_subscription() {
  echo -e "${YELLOW}- Obtaining your Azure subsctiptions...${NC}"
  # Query exactly the two fields we need ("<id>TAB<name>"); positions in the
  # full TSV output are not stable across CLI versions.
  DEFAULT_ACCOUNT=$(az account show --query '[id,name]' -o tsv) || DEFAULT_ACCOUNT=""
  DEFAULT_ACCOUNT_ID=$(printf '%s' "$DEFAULT_ACCOUNT" | cut -f1)
  # Check Azure is logged in
  if [[ -z "$DEFAULT_ACCOUNT_ID" ]]; then
    echo -e "${RED}Your subscription couldn't be found, make sure you have logged in before re-running this script.${NC}"
    exit 1
  fi
  DEFAULT_ACCOUNT_NAME=$(printf '%s' "$DEFAULT_ACCOUNT" | cut -f2)
  export DEFAULT_ACCOUNT_NAME
  echo -e "Current subscription (selected): ${BLUE}\"${DEFAULT_ACCOUNT_NAME}\"${NC} - ${GREEN}(${DEFAULT_ACCOUNT_ID})${NC}"
  echo ""

  # One "<id>TAB<name>" row per subscription
  ACCOUNT_LIST=$(az account list --query '[].[id,name]' -o tsv) || ACCOUNT_LIST=""
  local -a sub_ids=() sub_names=()
  local sub_id sub_name
  while IFS=$'\t' read -r sub_id sub_name; do
    [[ -n "$sub_id" ]] || continue
    sub_ids+=("$sub_id")
    sub_names+=("$sub_name")
  done <<< "$ACCOUNT_LIST"
  ACCOUNT_LIST_ID=$(printf '%s' "$ACCOUNT_LIST" | cut -f1)
  ACCOUNT_LIST_NAMES=$(printf '%s' "$ACCOUNT_LIST" | cut -f2)
  ACCOUNT_LIST_SIZE=${#sub_ids[@]}
  export ACCOUNT_LIST ACCOUNT_LIST_ID ACCOUNT_LIST_NAMES ACCOUNT_LIST_SIZE

  # Ask user to select subscription
  echo "Found $ACCOUNT_LIST_SIZE enabled subscription(s) in your Azure Account:"
  echo ""
  local idx
  for ((idx = 0; idx < ACCOUNT_LIST_SIZE; idx++)); do
    echo -e "$((idx + 1))) ${sub_names[idx]} || (${sub_ids[idx]})"
  done
  echo ""
  echo -e "Select a subscription ${YELLOW}(1-${ACCOUNT_LIST_SIZE})${NC} or press [${GREEN}Enter${NC}] to continue with ${BLUE}\"${DEFAULT_ACCOUNT_NAME}\"${NC} subscription:"
  local selection
  read -r selection
  if [[ -z "$selection" ]]; then
    export AZURE_SUBSCRIPTION_ID=$DEFAULT_ACCOUNT_ID
    export AZURE_SUBSCRIPTION_NAME=$DEFAULT_ACCOUNT_NAME
  elif [[ "$selection" =~ ^[0-9]+$ ]] \
    && ((10#$selection >= 1 && 10#$selection <= ACCOUNT_LIST_SIZE)); then
    # 10# forces base 10 so that input such as "08" is not parsed as octal
    export AZURE_SUBSCRIPTION_ID=${sub_ids[$((10#$selection - 1))]}
    export AZURE_SUBSCRIPTION_NAME=${sub_names[$((10#$selection - 1))]}
  else
    echo "Incorrect selection, Cluster not created"
    exit 1
  fi
  az account set -s "${AZURE_SUBSCRIPTION_ID}"
  echo -e " ${GREEN}Selected ${AZURE_SUBSCRIPTION_NAME} - (${AZURE_SUBSCRIPTION_ID})${NC}"
}
# Export every cluster-configuration variable as an empty string so that a
# subsequently sourced config file fully defines the configuration.
function reset_variables() {
  local setting
  for setting in \
    RG LOC RG_STATUS \
    STO_ACC_NAME STO_ACC_STATUS STO_FILE_SHARE STO_DIR STO_CONN \
    STO_FILE_SHARE_STATUS STO_DIR_STATUS \
    CLUSTER_NAME CLUSTER_SKU CLUSTER_AGENT_COUNT CLUSTER_USERNAME \
    CLUSTER_SSH_KEY CLUSTER_PASSWORD CLUSTER_STATUS CLUSTER_IP \
    CLUSTER_AGENT_PORT; do
    export "${setting}="
  done
}
# Write the current cluster configuration to ${CONFIG_FILE_NAME} as
# `export NAME=value` lines grouped under comment headers; the file is
# meant to be sourced back by the other functions.
# Values are serialized with printf %q, so spaces, quotes, `$`, `;` and
# backticks (connection strings, SSH keys, passwords) are restored verbatim
# and are never executed when the file is sourced.
function save_cluster_config() {
  local entry
  for entry in \
    '# Resource group' \
    RG LOC RG_STATUS \
    '# Storage Account' \
    STO_ACC_NAME STO_ACC_STATUS STO_FILE_SHARE STO_DIR STO_CONN \
    STO_FILE_SHARE_STATUS STO_DIR_STATUS \
    '# Batch AI cluster' \
    CLUSTER_NAME CLUSTER_SKU CLUSTER_AGENT_COUNT CLUSTER_USERNAME \
    CLUSTER_SSH_KEY CLUSTER_PASSWORD CLUSTER_STATUS CLUSTER_IP \
    CLUSTER_AGENT_PORT; do
    if [[ "$entry" == '#'* ]]; then
      # Section header
      printf '%s\n' "$entry"
    else
      printf 'export %s=%q\n' "$entry" "${!entry}"
    fi
  done > "${CONFIG_FILE_NAME}"
}
# Interactively build a new cluster configuration: ask for the VM SKU and
# the node count, generate names for the Azure resources, save everything
# with save_cluster_config and show it with print_conf_file.
# Invalid or empty answers fall back to the defaults (STANDARD_NC6, 2 nodes).
# Globals written: RG, LOC, STO_*, CLUSTER_* (exported)
function create_config_file() {
  reset_variables
  echo -e "${YELLOW}- Choose one of the following VM SKUs for your cluster:${NC}"
  echo
  echo -e " ${L_BLUE}1) Basic.${NC} Standard NC6 Nodes with standard configuration"
  echo -e " Each node has 1 x K80 GPU Card (1/2 Physical Card)."
  echo -e " 6 Cores CPU. RAM 56 GB. 380 GB of SSD."
  echo
  echo -e " ${BLUE}2) Medium.${NC} Standard NC12 Nodes with standard configuration"
  echo -e " Each node has 2 x K80 GPU Card (1 Physical Card)."
  echo -e " 12 Cores CPU. RAM 112 GB. 680 GB of SSD."
  echo
  echo -e " ${PURPLE}3) Big.${NC} Standard NC24r Nodes with Infiniband"
  echo -e " Each node has 4 x K80 GPU Card (2 Physical Cards)."
  echo -e " 24 Cores CPU. RAM 224 GB. 1.44 TB of SSD."
  echo -e " ${RED}*Note:${NC} ${GRAY}NC24r Batch AI Quota needs to be available on your subscription"
  echo -e " Dedicated instances are provided through a Support Ticket.${NC}"
  echo
  echo -e "Select an option ${BLUE}(1-3)${NC} (or press [${GREEN}Enter${NC}] to continue with a ${L_BLUE}\"Basic\"${NC} cluster):"
  local selection
  read -r selection
  local vm_sku="STANDARD_NC6"
  case "$selection" in
    2) vm_sku="STANDARD_NC12" ;;
    3) vm_sku="STANDARD_NC24r" ;;
  esac
  echo -e "How many nodes ${BLUE}(1-50)${NC} (or press [${GREEN}Enter${NC}] to continue with the ${L_BLUE}\"2\"${NC} nodes):"
  read -r selection
  local nodes_qty=2
  # Accept only plain integers inside the advertised 1-50 range; 10# forces
  # base 10 so that "08" is not rejected as an invalid octal number.
  if [[ "$selection" =~ ^[0-9]+$ ]] && ((10#$selection >= 1 && 10#$selection <= 50)); then
    nodes_qty=$((10#$selection))
  fi
  # NOTE(review): names and password are derived from $RANDOM, which is not
  # a cryptographically strong source.
  export RG="batchai-rg-${RANDOM:0:4}"
  export LOC=eastus
  export STO_ACC_NAME="b4tch${RANDOM:0:4}clust3r${RANDOM:0:5}"
  export STO_FILE_SHARE=external
  export STO_DIR=storagedir
  export CLUSTER_NAME="batchaicluster${RANDOM:0:4}"
  export CLUSTER_SKU="${vm_sku}"
  export CLUSTER_AGENT_COUNT="${nodes_qty}"
  export CLUSTER_USERNAME="${USER}"
  # Same key that check_ssh verified
  export CLUSTER_SSH_KEY="$(cat "${SSH_PUB_LOCATION:-$HOME/.ssh/id_rsa.pub}")"
  export CLUSTER_PASSWORD="m1-${RANDOM:0:3}-s3cuR3-${RANDOM:0:2}-P4ssW0rd!"
  save_cluster_config
  print_conf_file
}
# Reload ${CONFIG_FILE_NAME} and print a human-readable summary of the
# cluster configuration, including the SSH command once the cluster is
# "steady" and a node IP is known.
function print_conf_file() {
  reset_variables
  source "$CONFIG_FILE_NAME"
  # Both values are always local: SSH_CONNECTION is also the name of an
  # environment variable set by sshd, which must not leak into the output.
  local status_text="${YELLOW}Not yet provisioned${NC}"
  local ssh_connection=""
  echo -e "${BLUE}*****************************************************************${NC}"
  echo -e "${BLUE}**~~-- Your Batch AI cluster configuration --~~**${NC}"
  echo -e "${BLUE}*****************************************************************${NC}"
  if [[ -n "$CLUSTER_STATUS" ]]; then
    status_text="${L_BLUE}${CLUSTER_STATUS}${NC}"
    if [[ "$CLUSTER_STATUS" == "steady" ]]; then
      status_text="${GREEN}Up and ready to go${NC}"
      if [[ -n "$CLUSTER_IP" ]]; then
        ssh_connection="${GREEN}ssh $CLUSTER_USERNAME@$CLUSTER_IP -p $CLUSTER_AGENT_PORT -i ${SSH_PRIV_LOCATION}${NC}"
      fi
    fi
  fi
  echo -e "Full Cluster eployment status: "
  echo -e "$status_text"
  echo
  echo -e "Cluster '${PURPLE}${CLUSTER_NAME}${NC}'"
  echo -e " ${YELLOW}${CLUSTER_AGENT_COUNT}${NC} '${CYAN}${CLUSTER_SKU}${NC}' nodes"
  echo -e "VMs username:'${GRAY}${CLUSTER_USERNAME}${NC}'"
  echo -e "Root password:"
  echo -e "'${GREEN}${CLUSTER_PASSWORD}${NC}'"
  echo
  echo -e "Resource Group '${BLUE}${RG}${NC}' located in '${CYAN}${LOC}${NC}'."
  echo -e "Located in '${CYAN}${LOC}${NC}'. $RG_STATUS"
  echo
  echo -e "Storage account: '${BLUE}${STO_ACC_NAME}${NC}'"
  echo "$STO_ACC_STATUS"
  echo -e "File share: '${CYAN}${STO_FILE_SHARE}${NC}'. $STO_FILE_SHARE_STATUS"
  echo -e
  echo -e "Storage Directory: '${CYAN}${STO_DIR}${NC}. $STO_DIR_STATUS"
  echo -e
  # Only show the connection string once one has been obtained
  if [[ -n "$STO_CONN" ]]; then
    echo -e "with connection string \"${GRAY}${STO_CONN}${NC}\"" | fold -w 65
  fi
  echo
  echo -e "SSH public key: '${SSH_PUB_LOCATION}'" | fold -w 65
  echo
  echo -e "$ssh_connection"
  echo -e "${BLUE}*****************************************************************${NC}"
  echo
}
# Reload ${CONFIG_FILE_NAME} and make sure every mandatory setting is
# present. Exits with status 1 when at least one of them is empty;
# otherwise prints the configuration through print_conf_file.
function check_conf_file() {
  reset_variables
  source $CONFIG_FILE_NAME
  echo -e "${YELLOW}- Checking integrity of config file ...${NC}"
  local required incomplete=0
  for required in RG LOC STO_ACC_NAME STO_FILE_SHARE STO_DIR CLUSTER_NAME \
    CLUSTER_SKU CLUSTER_AGENT_COUNT CLUSTER_SSH_KEY CLUSTER_USERNAME \
    CLUSTER_PASSWORD; do
    if [[ -z "${!required}" ]]; then
      incomplete=1
      break
    fi
  done
  if ((incomplete)); then
    echo -e "${RED} One or more variables are undefined${NC}"
    exit 1
  fi
  echo -e "${GRAY}This is the last cluster configuration:${NC}"
  echo
  print_conf_file
}
# Load the cluster configuration, creating it when necessary.
#  - Config file present: validate and show it; [Enter] reuses it, any other
#    key archives it as '<CLUSTER_NAME>-<CONFIG_FILE_NAME>' and runs
#    create_config_file. The (possibly new) file is then sourced.
#  - No config file: run create_config_file.
function configure() {
  # Checking for previous env file
  echo -e "${YELLOW}- Checking for previous configuration file '${CONFIG_FILE_NAME}'. ${NC}"
  if [[ ! -f "$CONFIG_FILE_NAME" ]]; then
    echo " No previous configuration file found"
    echo
    create_config_file
    return
  fi
  echo -e "${GREEN}Found '${CONFIG_FILE_NAME}'${NC}"
  check_conf_file
  echo -e "${YELLOW}- Would you like to use this configuration to create a new cluster?${NC}"
  local backup_name="${CLUSTER_NAME}-${CONFIG_FILE_NAME}"
  echo -e " Press [${GREEN}Enter${NC}] to proceed."
  echo -e " ${GRAY}Or press 'n' to save your current config in '${backup_name}' and start a new one${NC}"
  read -n 1 -r
  # `read -n 1` leaves REPLY empty on Enter. The value is quoted so that a
  # key such as '*' or '?' cannot glob-expand and be mistaken for Enter.
  if [[ -n "${REPLY//[[:space:]]/}" ]]; then
    echo -e "${BLUE} Saving current file '${backup_name}', and running the configuration tool${NC}"
    mv -- "$CONFIG_FILE_NAME" "$backup_name"
    echo
    create_config_file
  fi
  # Source environment
  # NOTE(review): clean_session blanks the check/subscription flags, not the
  # cluster variables; reset_variables may have been intended - confirm.
  clean_session
  source "${CONFIG_FILE_NAME}"
}
# Provision, step by step, everything described in ${CONFIG_FILE_NAME}:
# resource group, storage account, connection string, file share, directory
# and finally the Batch AI cluster. Each step is skipped when its status is
# already recorded, and the progress is persisted with save_cluster_config
# after every step so that a re-run resumes where it stopped.
# Exits with status 1 when the resource group, the storage account or the
# cluster cannot be created.
function deploy() {
  echo
  echo -e "${YELLOW}- We are ready to deploy your cluster. Do you want to continue?${NC}"
  echo -e " Press [${GREEN}Enter${NC}] to continue. Or to cancel [Ctrl] + [C]."
  read -p " You can modify ${CONFIG_FILE_NAME} manually and restart the script" -n 1 -r
  echo -e "${NC}"
  source "${CONFIG_FILE_NAME}"
  echo

  if [[ "$RG_STATUS" != "Succeeded" ]]; then
    echo -e "${YELLOW}- Creating resource group '${RG}' in '${LOC}'${NC}"
    RG_STATUS=$(az group create --name "$RG" --location "$LOC" \
      --query properties.provisioningState -o tsv) || RG_STATUS=""
    export RG_STATUS
    if [[ -z "$RG_STATUS" ]]; then
      echo -e "${RED}Error: Your resource group was not created${NC}"
      exit 1
    fi
    echo -e " ${GREEN}Created.${NC}"
    save_cluster_config
    echo
  fi

  if [[ "$STO_ACC_STATUS" != "Succeeded" ]]; then
    echo -e "${YELLOW}- Creating storage account '${STO_ACC_NAME}'${NC}"
    STO_ACC_STATUS=$(az storage account create --name "$STO_ACC_NAME" -g "$RG" \
      --sku Standard_LRS --query provisioningState -o tsv) || STO_ACC_STATUS=""
    export STO_ACC_STATUS
    if [[ -z "$STO_ACC_STATUS" ]]; then
      echo -e "${RED}Error: Your storage account was not created${NC}"
      exit 1
    fi
    echo -e " ${GREEN}Created.${NC}"
    save_cluster_config
    echo
  fi

  if [[ -z "$STO_CONN" ]]; then
    echo -e "${YELLOW}- Obtaining storage account connection string...${NC}"
    STO_CONN=$(az storage account show-connection-string -g "$RG" -n "$STO_ACC_NAME" -o tsv) || true
    export STO_CONN
    save_cluster_config
    echo -e "${GREEN} Exported storage account connection string to config file${NC}"
    echo
  fi

  if [[ -z "$STO_FILE_SHARE_STATUS" ]]; then
    echo -e "${YELLOW}- File shared '${STO_FILE_SHARE}' created?${NC}"
    STO_FILE_SHARE_STATUS=$(az storage share create --account-name "$STO_ACC_NAME" \
      --name "$STO_FILE_SHARE" --connection-string "$STO_CONN" -o tsv) || true
    export STO_FILE_SHARE_STATUS
    save_cluster_config
    echo -e " ${GREEN}${STO_FILE_SHARE_STATUS}${NC}"
    echo
  fi

  if [[ -z "$STO_DIR_STATUS" ]]; then
    echo -e "${YELLOW}- Directory '${STO_DIR}' created?${NC}"
    STO_DIR_STATUS=$(az storage directory create --share-name "$STO_FILE_SHARE" \
      --name "$STO_DIR" --connection-string "$STO_CONN" -o tsv) || true
    export STO_DIR_STATUS
    # Persist this step like the previous ones so a re-run can skip it
    save_cluster_config
    echo -e " ${GREEN}${STO_DIR_STATUS}${NC}"
    echo
  fi

  if [[ -z "$CLUSTER_STATUS" ]]; then
    echo -e "${YELLOW}- Create Batch AI cluster '$CLUSTER_NAME'${NC}"
    # NOTE(review): the password is passed on the command line and is
    # therefore visible in the process list while az runs.
    # The state is taken from the second TSV column of the CLI output.
    CLUSTER_STATUS=$(az batchai cluster create --name "$CLUSTER_NAME" --vm-size "$CLUSTER_SKU" \
      --image UbuntuLTS --min "$CLUSTER_AGENT_COUNT" --max "$CLUSTER_AGENT_COUNT" \
      --storage-account-name "$STO_ACC_NAME" \
      --afs-name "$STO_FILE_SHARE" --afs-mount-path "$STO_FILE_SHARE" \
      --user-name "$CLUSTER_USERNAME" --ssh-key "$CLUSTER_SSH_KEY" --password "$CLUSTER_PASSWORD" \
      --resource-group "$RG" --location "$LOC" -o tsv | cut -f2) || true
    export CLUSTER_STATUS
    if [[ "$CLUSTER_STATUS" == "resizing" ]]; then
      save_cluster_config
      echo
      echo -e " ${GREEN}Your cluster is being provisioned and currently is ${CLUSTER_STATUS}. You can re-run this script again in any time${NC}"
    else
      echo -e "${RED}ERROR:${NC} Cluster could not be provisioned"
      # Best effort: show whatever error details the service reports
      az batchai cluster show -n "$CLUSTER_NAME" -g "$RG" -o table --query errors || true
      echo
      echo -e "If you are getting quota errors for NC24r, make sure to create a support ticket in the portal with the amount of dedicated cores needed in your subscription"
      # Signal the failure to the caller (a bare `exit` would return 0 here)
      exit 1
    fi
  fi
}
# Poll the cluster every 5 seconds until its state is "steady", printing a
# dot per poll. Each time the state changes to a new non-empty value it is
# reported once and persisted with save_cluster_config.
# NOTE(review): there is no timeout; the loop keeps polling for as long as
# the CLI does not report "steady".
function check_cluster() {
  echo
  echo -e "${YELLOW}- Waiting for cluster to be up and running ${NC}"
  reset_variables
  source "$CONFIG_FILE_NAME"
  # Last state that was reported/saved; starts with the state on file
  local last_reported=$CLUSTER_STATUS
  CLUSTER_STATUS=$(az batchai cluster show -n "${CLUSTER_NAME}" -g "${RG}" -o tsv | cut -f2) || true
  export CLUSTER_STATUS
  echo -ne " ${CLUSTER_STATUS}"
  while [[ "$CLUSTER_STATUS" != "steady" ]]; do
    sleep 5
    echo -ne "."
    CLUSTER_STATUS=$(az batchai cluster show -n "${CLUSTER_NAME}" -g "${RG}" -o tsv | cut -f2) || true
    export CLUSTER_STATUS
    if [[ -n "$CLUSTER_STATUS" && "$CLUSTER_STATUS" != "$last_reported" ]]; then
      echo
      echo -e " ${GREEN}Status changed to: ${CLUSTER_STATUS}${NC}"
      save_cluster_config
      # Remember what was reported so the same state is not announced (and
      # saved) again on every following poll.
      last_reported=$CLUSTER_STATUS
    fi
  done
  echo
}
# Look up IP address and SSH port of the first cluster node, store them in
# the config file and print how to connect to the node and to the storage.
# Globals written: CLUSTER_IP, CLUSTER_AGENT_PORT (exported)
function get_connection_strings() {
  echo
  echo -e "${YELLOW}- Retrieving connection settings for a node in your cluster: '$CLUSTER_NAME' ${NC}"
  CLUSTER_IP=$(az batchai cluster list-nodes -g "$RG" -n "$CLUSTER_NAME" --query "[0].ipAddress" -o tsv) || true
  export CLUSTER_IP
  echo
  echo -e " Checking Nodes status (If they are still in 'preparing', you might not be able to deploy a job) ..."
  if [[ "$CLUSTER_STATUS" == "steady" ]]; then
    az batchai cluster show -g "$RG" -n "$CLUSTER_NAME" -o table
    az batchai cluster list-nodes -g "$RG" -n "$CLUSTER_NAME" -o table
    echo
  fi
  if [[ -z "$CLUSTER_IP" ]]; then
    echo -e " ${RED}ERROR: Could not retrieve the cluster public Ip${NC}"
    echo
    return
  fi
  local raw_port
  raw_port=$(az batchai cluster list-nodes -g "$RG" -n "$CLUSTER_NAME" --query "[0].port" -o tsv) || true
  # Drop a fractional part such as ".0" from the reported port
  export CLUSTER_AGENT_PORT="${raw_port%.*}"
  save_cluster_config
  echo -e " You can connect to your cluster node with the following SSH command:"
  echo -e " ${GREEN}ssh $CLUSTER_USERNAME@$CLUSTER_IP -p $CLUSTER_AGENT_PORT -i $SSH_PRIV_LOCATION${NC}"
  echo
  echo -e " Remember that the folder where the storage is mounted is the following:"
  echo -e " cd /mnt/batch/tasks/shared/LS_root/mounts/${STO_FILE_SHARE}/${STO_DIR}"
  echo
  echo -e " for Azure File Shares, you can also use Azure Storage explorer with the following connection string:"
  # Quoted: the connection string must be printed verbatim (no word
  # splitting or pathname expansion).
  echo -e " ${STO_CONN}"
  echo
}
# Ask for confirmation and, on 'y'/'yes' (case-insensitive), request the
# deletion of the whole resource group (cluster and storage account
# included), remove the local config file and end the script.
# Any other answer goes back to the cluster menu.
function delete_current_resource_group() {
  echo -e "${RED}- Are you sure to delete resource group '${RG}' containing cluster '$CLUSTER_NAME' and storage account '${STO_ACC_NAME}'"
  echo -e " ${RED}in subscription \"${AZURE_SUBSCRIPTION_NAME} - ${AZURE_SUBSCRIPTION_ID}\"?${NC}"
  local response
  read -r -p " Type 'yes' or 'y' to confirm... " response
  case "$response" in
    [yY][eE][sS] | [yY])
      # --no-wait: the deletion continues in Azure after the script ends
      az group delete -n "$RG" -y --no-wait
      rm -f -- "$CONFIG_FILE_NAME"
      echo -e " ${RED}Deleting...${NC}"
      # The request succeeded: finish with a success status
      exit 0
      ;;
    *)
      cluster_options
      ;;
  esac
}
# Generate the job-preparation script, copy it with scp into the mounted
# storage directory of the cluster node, then write the job definition
# (write_job) and submit it (run_job).
function create_sample_job() {
  clean_session
  source $CONFIG_FILE_NAME
  create_job_prep
  echo -e "${YELLOW}- Uploading job-prep.sh file to '$CLUSTER_NAME'${NC}"
  local mount_dir="/mnt/batch/tasks/shared/LS_root/mounts/${STO_FILE_SHARE}/${STO_DIR}"
  # Everything after the program name, as a single text: shown to the user
  # and then handed to scp.
  local scp_args="-i $SSH_PRIV_LOCATION -o StrictHostKeyChecking=no -P $CLUSTER_AGENT_PORT job-prep.sh $CLUSTER_USERNAME@$CLUSTER_IP:$mount_dir"
  echo "scp $scp_args "
  # Deliberately unquoted: the text is split into the individual arguments.
  scp $scp_args
  echo
  write_job
  run_job
}
# Submit job.json to the cluster under a freshly generated job name and
# print the commands that can be used to follow the job.
# Globals written: JOB_NAME (exported)
function run_job() {
  export JOB_NAME="sample-job-${RANDOM:0:5}"
  echo -e "${YELLOW}- Running job '$JOB_NAME' on '$CLUSTER_NAME'${NC}"
  az batchai job create -n "$JOB_NAME" --cluster-name "$CLUSTER_NAME" -c job.json -g "$RG" -l "$LOC" -o table
  echo
  echo -e "${GRAY} You can see the progress of your job in the portal now or run the following command"
  # Show the name of the job that was just created, not a fixed sample name
  echo -e " az batchai job show -n $JOB_NAME -g $RG -o table"
  echo
  echo -e " Also, to see the STDERR of the job you can run the following command:"
  echo -e " az batchai job file stream -n $JOB_NAME -g $RG -f stderr-job_prep.txt"
  echo
  echo -e " Same thing to see the STDOUT of the job:"
  echo -e " az batchai job file stream -n $JOB_NAME -g $RG -f stdout-job_prep.txt"
  echo
  echo
}
# Generate ./job-prep.sh, the script that downloads the sample archive and
# unpacks it into the cntk_samples folder of the mounted storage directory.
# STO_FILE_SHARE and STO_DIR are expanded now (at generation time); every
# `\$...` is written literally and expanded when job-prep.sh itself runs.
function create_job_prep() {
  cat << EOT > job-prep.sh
#!/bin/bash
echo "Installing unzip"
apt-get update
# unzip is listed explicitly: the script below depends on it
apt-get install -y zip unzip dos2unix
echo "Getting the sample files"
export MOUNT_PATH=/mnt/batch/tasks/shared/LS_root/mounts/$STO_FILE_SHARE/$STO_DIR
wget https://batchaisamples.blob.core.windows.net/samples/BatchAIQuickStart.zip\?st\=2017-09-29T18%3A29%3A00Z\&se\=2099-12-31T08%3A00%3A00Z\&sp\=rl\&sv\=2016-05-31\&sr\=b\&sig\=hrAZfbZC%2BQ%2FKccFQZ7OC4b%2FXSzCF5Myi4Cj%2BW3sVZDo%3D -O BatchAIQuickStart.zip
echo "Creating cntk_samples folder in \$MOUNT_PATH"
# -p: do not fail when the folder is left over from a previous job
mkdir -p "\$MOUNT_PATH/cntk_samples"
echo "Unzip the file"
unzip -o BatchAIQuickStart.zip -d "\$MOUNT_PATH/cntk_samples"
echo "Remove the zip file"
rm -rf BatchAIQuickStart.zip
EOT
}
# Generate ./job.json, the Batch AI job definition of the CNTK sample.
# STO_FILE_SHARE/STO_DIR from the config file are expanded now; every `\$...`
# (the AZ_BATCHAI_* placeholders and the "$schema" key) is written literally.
function write_job() {
  clean_session
  source "$CONFIG_FILE_NAME"
  # Literal $AZ_BATCHAI_MOUNT_ROOT followed by the share and directory names
  local afs_directory="\$AZ_BATCHAI_MOUNT_ROOT/$STO_FILE_SHARE/$STO_DIR"
  cat << EOT > job.json
{
  "\$schema": "https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2017-09-01-preview/job.json",
  "properties": {
    "nodeCount": 1,
    "cntkSettings": {
      "pythonScriptFilePath": "\$AZ_BATCHAI_INPUT_SCRIPT/ConvNet_MNIST.py",
      "commandLineArgs": "\$AZ_BATCHAI_INPUT_DATASET \$AZ_BATCHAI_OUTPUT_MODEL"
    },
    "stdOutErrPathPrefix": "$afs_directory",
    "inputDirectories": [{
      "id": "DATASET",
      "path": "$afs_directory/cntk_samples"
    }, {
      "id": "SCRIPT",
      "path": "$afs_directory/cntk_samples"
    }],
    "outputDirectories": [{
      "id": "MODEL",
      "pathPrefix": "$afs_directory",
      "pathSuffix": "Models"
    }],
    "containerSettings": {
      "imageSourceRegistry": {
        "image": "microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0"
      }
    },
    "jobPreparation": {
      "commandLine": "bash $afs_directory/job-prep.sh"
    }
  }
}
EOT
}
# Interactive menu for an existing cluster. The menu is shown again after
# every action; any input other than 1-5 (e.g. 'q') ends the script with a
# success status.
function cluster_options() {
  local selection backup_name
  # Loop instead of calling itself, so a long session does not keep nesting
  # function calls.
  while true; do
    echo -e "${YELLOW}*****************************************************************${NC}"
    echo -e "${YELLOW}**~~-- Batch AI Easy Cluster Menu --~~**${NC}"
    echo -e "${YELLOW}*****************************************************************${NC}"
    echo -e "${YELLOW}- Options for your cluster '$CLUSTER_NAME' ?${NC}"
    echo
    echo -e " ${L_BLUE}1)${NC} Print cluster information"
    echo -e " ${L_BLUE}2)${NC} Obtain status of nodes and refresh SSH connection string"
    echo -e " ${L_BLUE}3)${NC} Create a sample job"
    echo -e " ${L_BLUE}4)${NC} Create a new cluster"
    echo -e " ${L_BLUE}5)${NC} ${RED}Delete${NC} resource group with cluster and storage account"
    echo
    echo -e "Select an option (or type 'q' to exit):"
    read -r selection
    case "$selection" in
      1)
        print_conf_file
        ;;
      2)
        get_connection_strings
        ;;
      3)
        create_sample_job
        ;;
      4)
        # Archive the current config under the cluster name before starting
        # a new configuration
        backup_name="${CLUSTER_NAME}-${CONFIG_FILE_NAME}"
        echo -e "-${BLUE}Saving current file '${backup_name}', and running the configuration tool${NC}"
        mv -- "$CONFIG_FILE_NAME" "$backup_name"
        create_new_cluster
        ;;
      5)
        delete_current_resource_group
        ;;
      *) # anything else
        echo "See you!"
        # Leaving the menu is a normal termination
        exit 0
        ;;
    esac
  done
}
# Full provisioning workflow: subscription/provider checks, configuration,
# deployment, wait for the cluster, and retrieval of the connection data.
function create_new_cluster() {
  local step
  for step in run_sub_checks configure deploy check_cluster get_connection_strings; do
    "$step"
  done
}
# Entry point: restore the state saved by previous runs, run the general
# requirement checks, provision a cluster when none is recorded yet, then
# show the configuration and open the cluster menu.
main() {
  clean_session
  # Restore saved state: first the general checks file, then the cluster
  # configuration file (each only when it exists)
  local state_file
  for state_file in "$CHECKS_FILE" "$CONFIG_FILE_NAME"; do
    if [ -f "$state_file" ]; then
      source "$state_file"
    fi
  done
  welcome
  run_general_checks
  # No recorded cluster status means nothing has been deployed so far
  if [ -z $CLUSTER_STATUS ]; then
    create_new_cluster
  fi
  print_conf_file
  cluster_options
}
# Script entry point: run the interactive workflow defined in main().
# No command-line arguments are passed on to main.
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment