- in the AWS console, under your IAM user's security credentials settings, create an access key id / secret access key
- then, from a terminal:
$ aws configure # enter at least the access key id and secret access key
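aside: once the CLI has working credentials, further access keys can also be minted from the command line instead of the console (demo-user is a hypothetical IAM user name):
$ aws iam create-access-key --user-name demo-user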
$ aws emr create-default-roles \
--region us-east-1
$ aws ec2 create-key-pair \
--region us-east-1 \
--key-name Demo \
--query 'KeyMaterial' \
--output text \
> demo.pem
$ chmod go-rwx demo.pem
$ aws emr create-cluster \
--name 'Test cluster' \
--applications \
Name=Hadoop \
Name=Hive \
--release-label emr-5.18.0 \
--instance-type m4.xlarge \
--instance-count 2 \
--scale-down-behavior TERMINATE_AT_TASK_COMPLETION \
--region us-east-1 \
--use-default-roles \
--ec2-attributes KeyName=Demo
$ CLUSTER_ID=… # paste the cluster id printed by the previous command
$ aws emr describe-cluster \
--region us-east-1 \
--cluster-id "$CLUSTER_ID" \
| jq .Cluster.Status.State -r # repeat until this prints WAITING
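hands-off variant of that polling step (a sketch; the 30-second interval is arbitrary):
$ while [ "$(
aws emr describe-cluster \
--region us-east-1 \
--cluster-id "$CLUSTER_ID" \
| jq -r .Cluster.Status.State
)" != "WAITING" ]; do sleep 30; done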
$ MASTER="$(
aws emr describe-cluster \
--region us-east-1 \
--cluster-id "$CLUSTER_ID" \
| jq -r .Cluster.MasterPublicDnsName
)"
$ ssh -i ./demo.pem -ND 8157 "hadoop@$MASTER" # stays in the foreground; then set up a socks5 proxy localhost:8157 in the browser
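one way to point a browser at that proxy, assuming Chrome (any SOCKS-capable browser or a proxy extension such as FoxyProxy works too); run from another terminal:
$ google-chrome --proxy-server="socks5://localhost:8157"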
$ open "http://$MASTER:8088/cluster" # YARN ResourceManager UI, loaded through the proxy
$ aws emr ssh \
--region us-east-1 \
--cluster-id "$CLUSTER_ID" \
--key-pair-file demo.pem
[hadoop] $ curl -L -o coursier https://git.io/coursier && chmod +x coursier
[hadoop] $ ./coursier launch com.lihaoyi:ammonite_2.11.12:1.3.2 \
org.slf4j:slf4j-nop:1.7.25 \
-M ammonite.Main \
-- --class-based
@ import $ivy.`com.sun.jersey:jersey-client:1.9.1`, $ivy.`org.apache.spark::spark-sql:2.3.1`, $ivy.`sh.almond::ammonite-spark:0.1.1`
@ import org.apache.spark.sql._
@ val spark = {
AmmoniteSparkSession.builder()
.progressBars()
.master("yarn")
.config("spark.executor.instances", "4")
.config("spark.executor.memory", "2g")
.getOrCreate()
}
@ def sc = spark.sparkContext
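quick sanity check, not part of the original session, that the session really runs on YARN:
@ sc.master // should print "yarn"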
@ val rdd = sc.parallelize(1 to 100000000, 100)
@ val sums = rdd.map(i => (i % 10, i.toLong)).reduceByKey(_ + _).collect() // sum per remainder class; Long, since the per-key sums overflow Int
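the same aggregation through the DataFrame API, as a sketch (not part of the original session; spark.range and the col/sum functions are standard Spark SQL):
@ import org.apache.spark.sql.functions.{col, sum}
@ spark.range(1L, 100000001L).groupBy((col("id") % 10).as("bucket")).agg(sum("id")).show()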
@ exit
[hadoop] $ exit
$ aws emr terminate-clusters \
--region us-east-1 \
--cluster-ids "$CLUSTER_ID"
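optional: re-run describe-cluster to watch the state move to TERMINATED:
$ aws emr describe-cluster \
--region us-east-1 \
--cluster-id "$CLUSTER_ID" \
| jq -r .Cluster.Status.State # prints TERMINATING, then TERMINATED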