#!/bin/bash
##
# Dataproc workflow template with a 'managed cluster' in Google Cloud for serverless data pre-processing using PySpark.
#
# Based on this guide: https://cloud.google.com/dataproc/docs/concepts/workflows/using-workflows
#
# Prerequisites:
# - The google/cloud-sdk Docker image (https://hub.docker.com/r/google/cloud-sdk/) pulled on the
#   machine running this script, with credentials stored in a 'gcloud-config' container
#   (the commands below mount it with --volumes-from gcloud-config).
# - The pre-processing file uploaded to Google Cloud Storage (see PREPROCESSING_GCS_FILE below).
#
# Further steps:
# - Instantiate the workflow template however you want:
#   - With an HTTP request to the Dataproc REST API (see the sketch at the end of this script)
#   - With gcloud from the same Docker image:
#       docker run --rm -ti --volumes-from gcloud-config google/cloud-sdk gcloud \
#         dataproc workflow-templates instantiate "$WORKFLOW_TEMPLATE_NAME" \
#         --region "$REGION" --project "$PROJECT"
##
USER="yourgclouduser"
PROJECT=theprojectid
CLUSTER_NAME=preprocessing-cluster
WORKFLOW_TEMPLATE_NAME=preprocessing
REGION=australia-southeast1
PREPROCESSING_GCS_FILE="gs://path/to/preprocessing/file.py"
PREPROCESSING_FILE_PATH_IN_CLUSTER="file:///home/$USER/data-lake/preprocessing.py"
# Create the workflow template
docker run --rm -ti --volumes-from gcloud-config google/cloud-sdk gcloud dataproc workflow-templates \
  create "$WORKFLOW_TEMPLATE_NAME" \
  --project "$PROJECT" --region "$REGION"
# Add a managed cluster
docker run --rm -ti --volumes-from gcloud-config google/cloud-sdk gcloud dataproc workflow-templates \
  set-managed-cluster "$WORKFLOW_TEMPLATE_NAME" \
  --project "$PROJECT" \
  --single-node \
  --cluster-name "$CLUSTER_NAME" \
  --region "$REGION"
# Copy the pre-processing file to the cluster
# TODO Copy $PREPROCESSING_GCS_FILE from Google Cloud Storage to $PREPROCESSING_FILE_PATH_IN_CLUSTER in $CLUSTER_NAME (see the sketch below)
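# One way to satisfy the TODO above (a sketch, not part of the original script): a Dataproc
# initialization action that copies the file when the managed cluster boots, assuming
# set-managed-cluster accepts the same --initialization-actions flag as
# 'gcloud dataproc clusters create'. The gs://your-bucket path below is a placeholder.
#
# 1. Create an init action that copies the file (paths hardcoded to match the variables above):
#      #!/bin/bash
#      mkdir -p /home/yourgclouduser/data-lake
#      gsutil cp "gs://path/to/preprocessing/file.py" /home/yourgclouduser/data-lake/preprocessing.py
# 2. Upload it, e.g. gsutil cp init-copy-preprocessing.sh gs://your-bucket/init-copy-preprocessing.sh
# 3. Append this flag to the set-managed-cluster command above:
#      --initialization-actions "gs://your-bucket/init-copy-preprocessing.sh"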
# Add the PySpark job to the workflow (the BigQuery connector jar goes on the Spark classpath via --jars)
docker run --rm -ti --volumes-from gcloud-config google/cloud-sdk gcloud dataproc workflow-templates \
  add-job pyspark "$PREPROCESSING_FILE_PATH_IN_CLUSTER" \
  --step-id "run_preprocessing" \
  --workflow-template "$WORKFLOW_TEMPLATE_NAME" \
  --project "$PROJECT" \
  --region "$REGION" \
  --jars "gs://spark-lib/bigquery/spark-bigquery-latest.jar"