Created
November 30, 2023 06:50
-
-
Save Gatsby-Lee/06806761fede814ef2c131108ab0c1fc to your computer and use it in GitHub Desktop.
Sample Dockerfile to create custom EMR Image
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##
# @note: To improve cache re-use in the image build, adding JARs goes first.
#        If "python dependencies installation" (step 3, step 4) went first, the
#        JAR-adding steps would rarely hit the intermediate cache, since those
#        steps change frequently and invalidate every later layer.
#
# references
# - https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/docker-custom-images-steps.html
# - https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/docker-custom-images-tag.html
# @note as of emr-6.9.0, the public ECR can be used "public.ecr.aws/emr-on-eks/spark/emr-6.9.0:latest"
##

# EMR 6.7.0  - Spark 3.2.1
# EMR 6.11.0 - Spark 3.3.2
# FAILED: decided to use the latest version of Spark 3.3.x to get all available patches for Spark 3.3.x
# NOTE(review): ":latest" here is the latest build of the emr-6.7.0 line; for fully
# reproducible builds prefer pinning a specific tag (ideally a digest) instead.
ARG EMR_IMAGE_VERSION=emr-6.7.0:latest
# FROM public.ecr.aws/emr-on-eks/spark/${EMR_IMAGE_VERSION}
FROM 895885662937.dkr.ecr.us-west-2.amazonaws.com/spark/${EMR_IMAGE_VERSION}

# step 0: preparation — root is required for yum and for writing under /usr/lib
USER root

# step 1: install os-level packages
# Clean the yum metadata/cache in the SAME layer so it never lands in the image.
RUN yum -y install zip \
    && yum clean all \
    && rm -rf /var/cache/yum

# step 2: Add required JARs
# @note: PERMISSION: writing to JAR_HOME requires root permission.
# @note: S3 protocol doesn't work with ADD. It should be changed to `virtual-host-style` OR `aws s3 cp`
# @note: These ARGs can't be declared before "FROM" — an ARG declared before FROM is only
#        visible in FROM lines; redeclared/declared here so the values are non-empty in this stage.
# - ref: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
ARG JAR_HOME=/usr/lib/spark/jars/
# Must match the Spark version shipped in the base EMR image (EMR 6.7.0 -> Spark 3.2.1).
ARG SPARK_VERSION=3.2.1
# Non-Spark dependency versions, parameterized so they can be bumped with --build-arg.
ARG COMMONS_POOL2_VERSION=2.12.0
ARG KAFKA_CLIENTS_VERSION=3.6.0

## REQUIRED to use Kafka as a Data Source.
# NOTE(review): bare `ADD <url>` is not integrity-verified; prefer `ADD --checksum=sha256:...`
# (BuildKit) or `curl` + sha256sum verification once the expected digests are recorded.
# Spark-version related:
# If missing: "pyspark.errors.exceptions.captured.AnalysisException: Failed to find data source: kafka. Please deploy the application as per the deployment section of Structured Streaming + Kafka Integration Guide."
ADD https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.12/${SPARK_VERSION}/spark-sql-kafka-0-10_2.12-${SPARK_VERSION}.jar $JAR_HOME
ADD https://repo1.maven.org/maven2/org/apache/spark/spark-token-provider-kafka-0-10_2.12/${SPARK_VERSION}/spark-token-provider-kafka-0-10_2.12-${SPARK_VERSION}.jar $JAR_HOME
# NOT tied to the Spark version:
# - If missing: "java.lang.NoClassDefFoundError: org/apache/commons/pool2/impl/GenericKeyedObjectPoolConfig"
ADD https://repo1.maven.org/maven2/org/apache/commons/commons-pool2/${COMMONS_POOL2_VERSION}/commons-pool2-${COMMONS_POOL2_VERSION}.jar $JAR_HOME
# - If missing: "java.lang.NoClassDefFoundError: org/apache/kafka/common/serialization/ByteArraySerializer"
ADD https://repo1.maven.org/maven2/org/apache/kafka/kafka-clients/${KAFKA_CLIENTS_VERSION}/kafka-clients-${KAFKA_CLIENTS_VERSION}.jar $JAR_HOME
# ADD creates root-owned files; ensure the JARs are readable by the non-root hadoop user.
RUN chmod -R +r ${JAR_HOME}

# step 3: Switch back to the non-root hadoop user expected by the EMR runtime
USER hadoop:hadoop
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment