Created
November 30, 2023 06:50
-
-
Save Gatsby-Lee/06806761fede814ef2c131108ab0c1fc to your computer and use it in GitHub Desktop.
Sample Dockerfile to create custom EMR Image
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##
# @note: To improve cache re-use in the image build, adding JARs goes first.
#        If "python dependencies installation" (step 3, step 4) went first, the
#        JAR-adding steps would rarely hit the intermediate cache, since those
#        steps change frequently and invalidate every later layer.
#
# references
# - https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/docker-custom-images-steps.html
# - https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/docker-custom-images-tag.html
# @note as of emr-6.9.0, the public ECR can be used "public.ecr.aws/emr-on-eks/spark/emr-6.9.0:latest"
##

# EMR 6.7.0  - Spark 3.2.1
# EMR 6.11.0 - Spark 3.3.2
# FAILED: decided to use the latest version of Spark 3.3.x to get all available patches for Spark 3.3.x
# NOTE(review): ":latest" here is the latest build of the emr-6.7.0 line; for fully
# reproducible builds prefer pinning a specific tag (ideally a digest) instead.
ARG EMR_IMAGE_VERSION=emr-6.7.0:latest
# FROM public.ecr.aws/emr-on-eks/spark/${EMR_IMAGE_VERSION}
FROM 895885662937.dkr.ecr.us-west-2.amazonaws.com/spark/${EMR_IMAGE_VERSION}

# step 0: preparation — root is required for yum and for writing under /usr/lib
USER root

# step 1: install os-level packages
# Clean the yum metadata/cache in the SAME layer so it never lands in the image.
RUN yum -y install zip \
    && yum clean all \
    && rm -rf /var/cache/yum

# step 2: Add required JARs
# @note: PERMISSION: writing to JAR_HOME requires root permission.
# @note: S3 protocol doesn't work with ADD. It should be changed to `virtual-host-style` OR `aws s3 cp`
# @note: These ARGs can't be declared before "FROM" — an ARG declared before FROM is only
#        visible in FROM lines; redeclared/declared here so the values are non-empty in this stage.
# - ref: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
ARG JAR_HOME=/usr/lib/spark/jars/
# Must match the Spark version shipped in the base EMR image (EMR 6.7.0 -> Spark 3.2.1).
ARG SPARK_VERSION=3.2.1
# Non-Spark dependency versions, parameterized so they can be bumped with --build-arg.
ARG COMMONS_POOL2_VERSION=2.12.0
ARG KAFKA_CLIENTS_VERSION=3.6.0

## REQUIRED to use Kafka as a Data Source.
# NOTE(review): bare `ADD <url>` is not integrity-verified; prefer `ADD --checksum=sha256:...`
# (BuildKit) or `curl` + sha256sum verification once the expected digests are recorded.
# Spark-version related:
# If missing: "pyspark.errors.exceptions.captured.AnalysisException: Failed to find data source: kafka. Please deploy the application as per the deployment section of Structured Streaming + Kafka Integration Guide."
ADD https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.12/${SPARK_VERSION}/spark-sql-kafka-0-10_2.12-${SPARK_VERSION}.jar $JAR_HOME
ADD https://repo1.maven.org/maven2/org/apache/spark/spark-token-provider-kafka-0-10_2.12/${SPARK_VERSION}/spark-token-provider-kafka-0-10_2.12-${SPARK_VERSION}.jar $JAR_HOME
# NOT tied to the Spark version:
# - If missing: "java.lang.NoClassDefFoundError: org/apache/commons/pool2/impl/GenericKeyedObjectPoolConfig"
ADD https://repo1.maven.org/maven2/org/apache/commons/commons-pool2/${COMMONS_POOL2_VERSION}/commons-pool2-${COMMONS_POOL2_VERSION}.jar $JAR_HOME
# - If missing: "java.lang.NoClassDefFoundError: org/apache/kafka/common/serialization/ByteArraySerializer"
ADD https://repo1.maven.org/maven2/org/apache/kafka/kafka-clients/${KAFKA_CLIENTS_VERSION}/kafka-clients-${KAFKA_CLIENTS_VERSION}.jar $JAR_HOME
# ADD creates root-owned files; ensure the JARs are readable by the non-root hadoop user.
RUN chmod -R +r ${JAR_HOME}

# step 3: Switch back to the non-root hadoop user expected by the EMR runtime
USER hadoop:hadoop
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment