BeatriceMoissinac · February 8, 2023 18:09
diff --git a/Create a cluster b/Create a cluster
 # vim: syntax=shell
 #
 # Notes: create an EMR Spark cluster that runs a first step, then add a
 # further step to an existing cluster. Requires a configured AWS CLI.

 # Shell assignments take no "$" on the left-hand side.
 # NOTE(review): on EMR 5.x the Spark examples jar may live under
 # /usr/lib/spark/examples/jars/ instead - confirm on the cluster.
 JAR=/usr/lib/spark/lib/spark-examples.jar
 # Used below both as the AWS CLI profile and as the EC2 key pair name.
 KEY=MoissinB

 # Each --steps value has to reach the AWS CLI as ONE shell word. Building it
 # as a quoted string avoids the word splitting caused by indented
 # continuation lines and stops the shell from treating "[...]" as a glob.
 FIRST_STEP="Type=Spark,Name=My Spark Program,ActionOnFailure=CONTINUE"
 FIRST_STEP="$FIRST_STEP,Args=[--class,main.scala.task.Task1,$JAR]"

 # Create cluster with 1st step.
 # --auto-terminate is optional: with it the cluster terminates at the end of
 # the last step; delete that line to keep the cluster running. A comment
 # cannot sit between continued lines - it would end the command early.
 aws emr create-cluster --profile "$KEY" \
                        --name "Moissinb Cluster" \
                        --release-label emr-5.10.0 \
                        --applications Name=Spark \
                        --ec2-attributes "KeyName=$KEY" \
                        --instance-type m3.xlarge \
                        --instance-count 3 \
                        --use-default-roles \
                        --auto-terminate \
                        --steps "$FIRST_STEP"

 # Add an execution step (run another class from the jar).
 # The cluster id below is a placeholder; substitute the real one.
 EXTRA_STEP="Type=Spark,Name=Spark Program,ActionOnFailure=CONTINUE"
 EXTRA_STEP="$EXTRA_STEP,Args=[--class,org.apache.spark.examples.SparkPi,$JAR,10]"

 aws emr add-steps --cluster-id j-2AXXXXXXGAPLF \
                   --steps "$EXTRA_STEP"
diff --git a/Create emr-vpc b/Create emr-vpc
 # vim: syntax=shell

 # Some clusters need a VPC in order to run - these clusters have larger
 # computing resources. See "References".
 #
 # Expects KEY to be set beforehand; it is used as the AWS CLI profile and
 # as the EC2 key pair name. The subnet id below is a placeholder.

 # Start a cluster that needs a VPC
 aws emr create-cluster --profile "$KEY" \
         --name "Test Spark Cluster with VPC" \
         --release-label emr-5.10.0 \
         --applications Name=Hadoop Name=Spark \
         --ec2-attributes "KeyName=$KEY,SubnetId=subnet-xxxxxxx" \
         --instance-type r4.4xlarge \
         --instance-count 3 \
         --use-default-roles
diff --git a/Kill a running step b/Kill a running step
 # vim: syntax=shell

 # Connect to the cluster via SSH, then run:

 # List the YARN applications to find the id of the one to stop.
 yarn application -list

 # Kill it by id (the id below is a placeholder; use one from the list).
 yarn application -kill application_15XXXX5_00XX
diff --git a/Notes b/Notes
	# vim: syntax=shell
	#
	# Notes: create an EMR Spark cluster that runs a first step, then add a
	# further step to an existing cluster. Requires a configured AWS CLI.

	# Shell assignments take no "$" on the left-hand side.
	# NOTE(review): on EMR 5.x the Spark examples jar may live under
	# /usr/lib/spark/examples/jars/ instead - confirm on the cluster.
	JAR=/usr/lib/spark/lib/spark-examples.jar
	# Used below both as the AWS CLI profile and as the EC2 key pair name.
	KEY=MoissinB

	# Each --steps value has to reach the AWS CLI as ONE shell word. Building it
	# as a quoted string avoids the word splitting caused by indented
	# continuation lines and stops the shell from treating "[...]" as a glob.
	FIRST_STEP="Type=Spark,Name=My Spark Program,ActionOnFailure=CONTINUE"
	FIRST_STEP="$FIRST_STEP,Args=[--class,main.scala.task.Task1,$JAR]"

	# Create cluster with 1st step.
	# --auto-terminate is optional: with it the cluster terminates at the end of
	# the last step; delete that line to keep the cluster running. A comment
	# cannot sit between continued lines - it would end the command early.
	aws emr create-cluster --profile "$KEY" \
		--name "Moissinb Cluster" \
		--release-label emr-5.10.0 \
		--applications Name=Spark \
		--ec2-attributes "KeyName=$KEY" \
		--instance-type m3.xlarge \
		--instance-count 3 \
		--use-default-roles \
		--auto-terminate \
		--steps "$FIRST_STEP"

	# Add an execution step (run another class from the jar).
	# The cluster id below is a placeholder; substitute the real one.
	EXTRA_STEP="Type=Spark,Name=Spark Program,ActionOnFailure=CONTINUE"
	EXTRA_STEP="$EXTRA_STEP,Args=[--class,org.apache.spark.examples.SparkPi,$JAR,10]"

	aws emr add-steps --cluster-id j-2AXXXXXXGAPLF \
		--steps "$EXTRA_STEP"
	# vim: syntax=shell

	# Some clusters need a VPC in order to run - these clusters have larger
	# computing resources. See "References".
	#
	# Expects KEY to be set beforehand; it is used as the AWS CLI profile and
	# as the EC2 key pair name. The subnet id below is a placeholder.

	# Start a cluster that needs a VPC
	aws emr create-cluster --profile "$KEY" \
		--name "Test Spark Cluster with VPC" \
		--release-label emr-5.10.0 \
		--applications Name=Hadoop Name=Spark \
		--ec2-attributes "KeyName=$KEY,SubnetId=subnet-xxxxxxx" \
		--instance-type r4.4xlarge \
		--instance-count 3 \
		--use-default-roles
	# vim: syntax=shell

	# Connect to the cluster via SSH, then run:

	# List the YARN applications to find the id of the one to stop.
	yarn application -list

	# Kill it by id (the id below is a placeholder; use one from the list).
	yarn application -kill application_15XXXX5_00XX