A bash script that uses the Databricks REST API to deploy an Apache Spark cluster, then opens a remote execution context to run commands on that cluster.
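The script assumes curl and jq are installed and that the Databricks credentials and shard URL are exported as environment variables. A hypothetical invocation (the file name deploy_cluster.sh is an assumption, not part of the gist; -p only prints the supported Spark versions and node types, then exits):

    export DBCLOUD_USER=user@example.com
    export DBCLOUD_PASSWORD=secret
    export DBCLOUD_SHARD=https://myshard.cloud.databricks.com
    ./deploy_cluster.sh        # deploy the cluster, run the command, tear down
    ./deploy_cluster.sh -p     # list supported versions and node types only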
#!/bin/bash
IFS=$'\n' # make newlines the only separator

while getopts ":p" opt; do
  case $opt in
    p)
      print_versions=true
      echo -e "Printing the spark versions and node types supported\n"
      ;;
    \?)
      echo -e "Deploying cluster for mwc\n"
      ;;
  esac
done
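# Note: with no flags at all the getopts loop never runs, so neither branch
# above executes and the script proceeds straight to deployment; any flag
# other than -p lands in the \? branch.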
# Define input parameters
user=$DBCLOUD_USER
pass=$DBCLOUD_PASSWORD
# Format: https://shardname.cloud.databricks.com
shard=$DBCLOUD_SHARD

if [ "$print_versions" ]; then
  node_types=`curl -s -H "Content-Type: application/json" -u $user:$pass $shard/api/2.0/clusters/list-node-types`
  spark_versions=`curl -s -H "Content-Type: application/json" -u $user:$pass $shard/api/2.0/clusters/spark-versions`
  echo $node_types | jq .
  echo $spark_versions | jq .
  exit 0
fi
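# Hedged aside: per the REST API 2.0 docs of this era, spark-versions returns
# {"versions": [{"key": ..., "name": ...}, ...]} and list-node-types returns
# {"node_types": [...]}; to list only the usable version keys, one could run:
#   curl -s -u $user:$pass $shard/api/2.0/clusters/spark-versions | jq -r '.versions[].key'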
# Memory optimized single container cluster type
node_id="r3.xlarge"
# Spark auto-updating version. Use the -p output above to choose the node types and versions
spark_version="2.0.x-ubuntu15.10"
# Number of nodes
num_nodes=1

# Define the cluster template json
cluster_template=$(cat << EOF
{
  "cluster_name": "mwc",
  "spark_version": "$spark_version",
  "node_type_id": "$node_id",
  "aws_attributes": {
    "availability": "SPOT_WITH_FALLBACK",
    "zone_id": "us-west-2a"
  },
  "num_workers": "$num_nodes"
}
EOF
)
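# Assumption from the API 2.0 docs: clusters/create responds with
# {"cluster_id": "..."}, which is parsed below. num_workers is sent as a
# quoted string here; the documented type is an integer, though the endpoint
# appears to tolerate the string form.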
# Launch a cluster
cluster=`curl -s -H "Content-Type: application/json" -X POST -d "$cluster_template" -u $user:$pass $shard/api/2.0/clusters/create`
cluster_id=`echo $cluster | jq .cluster_id | tr -d "\""`
echo -e "Launched cluster id: $cluster_id \n"
sleep 1

# Grab the cluster status
status=`curl -s -u $user:$pass "$shard/api/2.0/clusters/get?cluster_id=$cluster_id"`
state=`echo $status | jq .state | tr -d "\""`
# Print the json status
echo $status

# Loop until the cluster is online. Polling interval is 30 seconds.
# Also break out on terminal states so a failed launch does not loop forever.
while [ "$state" != "RUNNING" ] && [ "$state" != "TERMINATED" ] && [ "$state" != "ERROR" ]; do
  echo "Waiting for cluster"
  echo "State: $state"
  sleep 30
  status=`curl -s -u $user:$pass "$shard/api/2.0/clusters/get?cluster_id=$cluster_id"`
  state=`echo $status | jq .state | tr -d "\""`
done

# Verify that the cluster is in the running state
if [ "$state" == "RUNNING" ]; then
  echo "Cluster is now in running state. Continue with Execution Context"
  echo "Cluster state: $state"
else
  echo "Cluster failed to reach RUNNING state. Exiting..."
  exit 1
fi
# Create an execution context to run commands on the cluster
cid=$cluster_id
# REPL Parameters
ec_data=$(cat << EOF
{
  "language": "python",
  "clusterId": "$cid"
}
EOF
)
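# Hedged note: per the 1.2 Command Execution API, contexts/create responds
# with {"id": "..."}; that id is parsed out below as the context id.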
# Grab a remote REPL
echo -e "Get the execution context / remote REPL\n"
ec=`curl -s -H "Content-Type: application/json" -X POST -d "$ec_data" -u $user:$pass $shard/api/1.2/contexts/create`
ec_id=`echo $ec | jq .id | tr -d "\""`
echo $ec
echo $ec_id

# Run commands via execution context
command_data=$(cat << EOF
{
  "language": "python",
  "contextId": "$ec_id",
  "clusterId": "$cid",
  "command": "numOfRows = spark.table(\"amazon\").count(); print numOfRows"
}
EOF
)
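# Assumptions baked into the command above: a table named "amazon" already
# exists on the shard, and the cluster REPL runs Python 2 (hence the
# statement form "print numOfRows"); on Python 3 it would be print(numOfRows).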
# Run the command and get the command id
echo -e "Run the Spark Workflow\n"
echo $command_data
command=`curl -s -H "Content-Type: application/json" -X POST -d "$command_data" -u $user:$pass $shard/api/1.2/commands/execute`
echo "Command status: "
echo $command | jq . | tr -d "\""
command_id=`echo $command | jq .id | tr -d "\""`

# Check the status of the command in a loop, then grab the results.
# This JSON is echoed only for logging; the 1.2 status endpoint takes its
# parameters as a query string, not as a POST body.
status=$(cat << EOF
{
  "clusterId": "$cid",
  "contextId": "$ec_id",
  "commandId": "$command_id"
}
EOF
)
echo -e "Check status of the command\n"
echo $status
status=`curl -s -H "Content-Type: application/json" -u $user:$pass "$shard/api/1.2/commands/status?clusterId=$cid&contextId=$ec_id&commandId=$command_id"`
echo "status: $status"
status_id=`echo $status | jq .status | tr -d "\""`
echo "status_id: $status_id"
while [ "$status_id" == "Running" ] || [ "$status_id" == "Queued" ]; do
  echo "Command executing..."
  sleep 5
  status=`curl -s -H "Content-Type: application/json" -u $user:$pass "$shard/api/1.2/commands/status?clusterId=$cid&contextId=$ec_id&commandId=$command_id"`
  status_id=`echo $status | jq .status | tr -d "\""`
done
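# Hedged note on the response shape: a finished command reports something like
# {"id": ..., "status": "Finished", "results": {"resultType": "text", "data": ...}},
# so just the command output could be pulled with:
#   echo $status | jq -r '.results.data'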
| echo "Return: $status" | |
| echo "Returned status: $status_id" | |
| echo "Cleanup remote REPL" | |
| d_payload=$(cat << EOF | |
| { | |
| "contextId": $ec_id, | |
| "clusterId": "$cid" | |
| } | |
| EOF | |
| ) | |
| d_status=`curl -s -H "Content-Type: application/json" -X POST -d "$d_payload" -u $user:$pass $shard/api/1.2/contexts/destroy` | |
| echo -e "Destroy: $d_status\n" | |
| # Delete the cluster | |
| echo -e "Deleting cluster after sleep of 10 \n" | |
| delete_cluster=`curl -s -H "Content-Type: application/json" -X POST -d "$cluster" -u $user:$pass $shard/api/2.0/clusters/delete` | |
| echo "Delete cluster: $delete_cluster" |