Last active
January 4, 2021 17:09
-
-
Save revolutionisme/bb7eb49c6f473f7a96a518303497353e to your computer and use it in GitHub Desktop.
Script to configure an EMR cluster and launch it with PySpark code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Minimal Spark cluster: 2 x m5.xlarge, JSON configuration pulled from S3.
# NOTE(review): EMR_DefaultRole / EMR_EC2_DefaultRole must already exist
# (create them once with `aws emr create-default-roles`) — confirm for your account.
aws emr create-cluster --release-label emr-5.31.0 --applications Name=Spark \
  --instance-type m5.xlarge --instance-count 2 --service-role EMR_DefaultRole \
  --ec2-attributes InstanceProfile=EMR_EC2_DefaultRole \
  --configurations https://s3.amazonaws.com/mybucket/myfolder/myConfig.json
# Instance-fleet cluster. Fleet config JSON examples:
#   1. https://github.com/awsdocs/amazon-emr-management-guide/blob/master/doc_source/emr-instance-fleet.md
#   2. https://aws.amazon.com/blogs/aws/new-amazon-emr-instance-fleets/
#   3. https://medium.com/finbox/easy-steps-to-optimise-your-aws-emr-performance-and-reduce-cost-ba4bd115973
# Multiple --applications entries are space-separated Name= pairs (not commas);
# the whole --ec2-attributes shorthand is quoted so the shell does not glob the
# [...] list, and InstanceProfile no longer swallows SubnetIds.
aws emr create-cluster --release-label emr-5.4.0 \
  --applications Name=Spark Name=Hive Name=Zeppelin \
  --service-role EMR_DefaultRole \
  --ec2-attributes 'InstanceProfile=EMR_EC2_DefaultRole,SubnetIds=[subnet-1143da3c,subnet-2e27c012]' \
  --instance-fleets file://my-fleet-config.json
# https://kulasangar.medium.com/creating-a-spark-job-using-pyspark-and-executing-it-in-aws-emr-70dba5e98a75
# NOTE(review): the original snippet used Unicode em-dashes ("—-") instead of
# "--" and had spaces after commas inside the shorthand Args=[...] list; the
# AWS CLI rejects both. Step type normalized to the documented "Spark".
aws emr add-steps --cluster-id j-3H6EATEWWRWS \
  --steps Type=Spark,Name=ParquetConversion,ActionOnFailure=CONTINUE,Args=[--deploy-mode,cluster,--master,yarn,--conf,spark.yarn.submit.waitAppCompletion=true,s3a://test/script/pyspark.py]
# 3-node cluster with JupyterHub, Spark and Hadoop; logs shipped to S3.
# --use-default-roles supplies EMR_DefaultRole / EMR_EC2_DefaultRole;
# KeyName=emr-cluster must be an existing EC2 key pair in the target region.
aws emr create-cluster --name test-emr-cluster --use-default-roles --release-label emr-5.28.0 \
  --instance-count 3 --instance-type m5.xlarge \
  --applications Name=JupyterHub Name=Spark Name=Hadoop \
  --ec2-attributes KeyName=emr-cluster --log-uri s3://s3-for-emr-cluster/
# Create a cluster and run a Pig script as a step.
# NOTE(review): --ami-version is the legacy (pre-4.x) launch parameter — prefer
# --release-label for new clusters; kept here to preserve the original example,
# but confirm AMI 2.4 actually supports m5.xlarge before using.
# The stray "$INPUT=.../$OUTPUT=..." entries from the original were dropped:
# unquoted, they expand from (unset) shell variables and corrupt the Args list;
# the -p INPUT=/-p OUTPUT= pairs already bind the Pig parameters.
aws emr create-cluster --name "Test cluster" --ami-version 2.4 --applications Name=Hive Name=Pig \
  --use-default-roles --ec2-attributes KeyName=myKey \
  --instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m5.xlarge InstanceGroupType=CORE,InstanceCount=2,InstanceType=m5.xlarge \
  --steps Type=PIG,Name="Pig Program",ActionOnFailure=CONTINUE,Args=[-f,s3://mybucket/scripts/pigscript.pig,-p,INPUT=s3://mybucket/inputdata/,-p,OUTPUT=s3://mybucket/outputdata/]
# Spark app submitted at cluster-creation time; cluster self-terminates when
# the step finishes (--auto-terminate).
#INSTANCE_TYPE=m1.small
INSTANCE_TYPE=r6g.xlarge
# Fail fast with a clear message if the AWS CLI profile is not set.
: "${PROFILE_NAME:?set PROFILE_NAME to the AWS CLI profile to use}"
# NOTE(review): DATABRICKS_INSTANCE_PROFILE looks account-specific — confirm
# the instance profile name before running.
aws emr create-cluster --release-label emr-6.2.0 --applications Name=Spark \
  --instance-type "$INSTANCE_TYPE" --instance-count 3 \
  --service-role EMR_DefaultRole --ec2-attributes InstanceProfile=DATABRICKS_INSTANCE_PROFILE \
  --log-uri s3://test-emr-jobs-xandr-b2s/logs/ \
  --steps Type=Spark,Name=ParquetConversion,ActionOnFailure=CONTINUE,Args=[--packages,org.apache.spark:spark-avro_2.12:3.0.1,--deploy-mode,cluster,--master,yarn,--conf,spark.yarn.submit.waitAppCompletion=true,s3a://test-emr-jobs-xandr-b2s/script/process_data.py] \
  --auto-terminate \
  --profile "$PROFILE_NAME"
#--configurations https://s3.amazonaws.com/mybucket/myfolder/myConfig.json
# NOTE(review): orphaned fragment — it duplicates the --steps clause of the
# "aws emr add-steps" example above and is not a standalone command (there is
# no command word, so the shell would try to execute "--steps"). Em-dashes and
# post-comma spaces fixed, and the fragment commented out so the file stays
# valid; paste it onto an `aws emr add-steps --cluster-id ...` invocation.
#  --steps Type=Spark,Name=ParquetConversion,ActionOnFailure=CONTINUE,Args=[--deploy-mode,cluster,--master,yarn,--conf,spark.yarn.submit.waitAppCompletion=true,s3a://test/script/pyspark.py]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment