Skip to content

Instantly share code, notes, and snippets.

@rjurney
Last active July 22, 2024 01:49
Show Gist options
  • Save rjurney/55827d16a4010c1c65c1bd79eade6574 to your computer and use it in GitHub Desktop.
Save rjurney/55827d16a4010c1c65c1bd79eade6574 to your computer and use it in GitHub Desktop.
Senzing Dockerfile for Python environment setup
services:
  # Workbench container built from Dockerfile.cli; the project directory is
  # bind-mounted at /app.
  cli:
    image: rjurney/pyspark-poetry:latest
    container_name: cli
    build:
      context: .
      dockerfile: Dockerfile.cli
    volumes:
      - .:/app
    networks:
      - icij
    environment:
      - PYSPARK_DRIVER_PYTHON=ipython
      # Single-quoted so YAML treats the embedded JSON as an opaque string.
      - 'SENZING_ENGINE_CONFIGURATION_JSON={"PIPELINE":{"CONFIGPATH":"/etc/opt/senzing","RESOURCEPATH":"/opt/senzing/g2/resources","SUPPORTPATH":"/opt/senzing/data/current"},"SQL":{"CONNECTION":"postgresql://postgres:senzing@postgres:5432:G2"}}'
    # Idle foreground process that keeps the container running.
    command: tail -f /dev/null
    restart: always
    tty: true
    stdin_open: true

  # PostgreSQL server holding the G2 database.
  postgres:
    image: postgres:latest
    container_name: postgres
    restart: always
    # set shared memory limit when using docker-compose
    shm_size: 256mb
    environment:
      POSTGRES_USER: postgres
      PGUSER: postgres
      POSTGRES_PASSWORD: senzing
      POSTGRES_DB: G2
    volumes:
      # NOTE(review): nothing here passes `-c config_file=...` to the server,
      # so confirm this mounted file is actually being loaded.
      - "./postgres.conf:/etc/postgresql/postgresql.conf"
    networks:
      - icij
    ports:
      - "5432:5432"
    healthcheck:
      # CMD-SHELL takes ONE command string; extra list items would become
      # shell positional parameters instead of pg_isready arguments.
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 10s
      timeout: 5s
      retries: 5

  # One-shot job that runs once postgres reports healthy (restart: "no").
  senzing-init:
    image: senzing/init-postgresql:latest
    container_name: senzing-init
    restart: "no"
    depends_on:
      postgres:
        condition: service_healthy
        restart: true
    networks:
      - icij
    environment:
      # NOTE(review): SUPPORTPATH is /opt/senzing/data here but
      # /opt/senzing/data/current for the cli service - confirm this is intended.
      - 'SENZING_ENGINE_CONFIGURATION_JSON={"PIPELINE":{"CONFIGPATH":"/etc/opt/senzing","RESOURCEPATH":"/opt/senzing/g2/resources","SUPPORTPATH":"/opt/senzing/data"},"SQL":{"CONNECTION":"postgresql://postgres:senzing@postgres:5432:G2"}}'
    command: "mandatory"
# User-defined bridge network with the fixed name "icij".
networks:
  icij:
    driver: bridge
    name: icij
# Dockerfile for a Spark environment with Python 3.10. The image is based on the miniconda3 image
# and installs OpenJDK 17, Spark 3.5.1 with Hadoop 3 and Scala 2.13, the Senzing API and Poetry.
# It then installs the Python packages specified in the pyproject.toml file.
# Base image: Miniconda 3.
FROM continuumio/miniconda3

# Install the OS packages in one layer: OpenJDK 17, download tools (curl, wget), git, a C build
# toolchain (build-essential, autotools, pkg-config) and the PostgreSQL client libraries.
# apt-get is used for every step because the `apt` front end is not a stable scripting
# interface. The package lists are deleted in the same layer so they do not bloat the image.
RUN apt-get update && \
    apt-get install -y curl apt-transport-https openjdk-17-jdk-headless wget build-essential git \
    autoconf automake libtool pkg-config libpq5 libpq-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Debian installs the JDK under /usr/lib/jvm/java-17-openjdk-<arch>, so a hard-coded "-arm64"
# path only exists on arm64 builds. Create an architecture-neutral symlink and point JAVA_HOME
# at it so the same Dockerfile works on amd64 and arm64.
RUN ln -sfn "/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)" /usr/lib/jvm/java-17-openjdk
ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk
# Coordinates of the Spark distribution: Spark release, Hadoop line and Scala build
ENV SPARK_VERSION=3.5.1 \
    HADOOP_VERSION=3 \
    SCALA_VERSION=2.13

# Fetch the matching tarball from the Apache archive, unpack it, move the unpacked directory to
# /opt/spark and delete the tarball
RUN spark_dist="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${SCALA_VERSION}" && \
    wget -O spark.tgz "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${spark_dist}.tgz" && \
    tar -xvzf spark.tgz && \
    mv "${spark_dist}" /opt/spark && \
    rm spark.tgz

ENV SPARK_HOME=/opt/spark
# Senzing settings, grouped by dependency: a variable defined inside an ENV instruction cannot be
# referenced by another variable in that same instruction, so derived values get their own tier.

# Tier 1: literal values (PATH is extended with the Senzing python and bin directories)
ENV SENZING_ACCEPT_EULA=I_ACCEPT_THE_SENZING_EULA \
    PATH=/opt/senzing/g2/python:/opt/senzing/g2/bin:${PATH} \
    SENZING_DOWNLOAD_FILE=./init-container.py \
    SENZING_VOLUME=/opt/senzing \
    SENZING_ETC_DIR=/etc/opt/senzing \
    SENZING_ENABLE_POSTGRESQL=1 \
    REFRESHED_AT=2024-03-18 \
    SENZING_DATABASE_URL="postgresql://postgres:senzing@postgres:5432/G2"

# Tier 2: directories below SENZING_VOLUME
ENV SENZING_G2_DIR=${SENZING_VOLUME}/g2 \
    SENZING_DATA_DIR=${SENZING_VOLUME}/data/current \
    SENZING_VAR_DIR=${SENZING_VOLUME}/var

# Tier 3: locations below SENZING_G2_DIR
ENV SENZING_RESOURCE_DIR=${SENZING_G2_DIR}/resources \
    LD_LIBRARY_PATH=${SENZING_G2_DIR}/lib \
    PYTHONPATH=${SENZING_G2_DIR}/sdk/python

# Tier 4: engine configuration JSON assembled from the values above
ENV SENZING_ENGINE_CONFIGURATION_JSON="{\"PIPELINE\":{\"CONFIGPATH\":\"${SENZING_ETC_DIR}\",\"RESOURCEPATH\":\"${SENZING_RESOURCE_DIR}\",\"SUPPORTPATH\":\"${SENZING_DATA_DIR}\"},\"SQL\":{\"CONNECTION\":\"${SENZING_DATABASE_URL}\",\"DATABASE\":\"G2\",\"DRIVER\":\"org.postgresql.Driver\",\"HOST\":\"postgres\",\"PASSWORD\":\"senzing\",\"PORT\":\"5432\",\"PROTOCOL\":\"postgresql\",\"SCHEMA\":\"G2\",\"USER\":\"postgres\"}}"

# Print the resolved configuration into the build log (note: it includes the database password)
RUN echo SENZING_ENGINE_CONFIGURATION_JSON=${SENZING_ENGINE_CONFIGURATION_JSON}
# Install the Senzing API: register Senzing's apt repository from its bootstrap .deb, refresh the
# package lists and install the senzingapi package. Every install passes -y so the build never
# waits on a confirmation prompt, and the .deb plus the apt lists are removed in the same layer.
WORKDIR /root
RUN wget https://senzing-production-apt.s3.amazonaws.com/senzingrepo_2.0.0-1_all.deb && \
    apt-get install -y ./senzingrepo_2.0.0-1_all.deb && \
    apt-get update && \
    apt-get install -y senzingapi && \
    rm -f senzingrepo_2.0.0-1_all.deb && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Prepare the Senzing directory tree: take ownership of the volume, create the g2/etc/data/var/
# resources directories with open permissions, then create empty placeholder files in the etc
# directory
RUN chown -R "$(id -u):$(id -g)" "${SENZING_VOLUME}" && \
    mkdir -p "${SENZING_G2_DIR}" "${SENZING_ETC_DIR}" "${SENZING_DATA_DIR}" "${SENZING_VAR_DIR}" "${SENZING_RESOURCE_DIR}" && \
    chmod 777 "${SENZING_G2_DIR}" "${SENZING_ETC_DIR}" "${SENZING_DATA_DIR}" "${SENZING_VAR_DIR}" "${SENZING_RESOURCE_DIR}" && \
    for placeholder in \
        senzing.g2.lic \
        senzing.g2config.json \
        senzing.g2module.ini \
        senzing.g2resources.ini \
        senzing.g2license.json \
        senzing.g2audit.json; \
    do \
        touch "${SENZING_ETC_DIR}/${placeholder}" || exit 1; \
    done && \
    touch "${SENZING_ETC_DIR}"
# Download Senzing's init-container.py and run its `initialize-files` command against the
# directories prepared above.
#   --fail      makes curl exit non-zero on an HTTP error, so an error page is never saved and
#               executed as if it were the script
#   --location  follows redirects
# PYTHONPATH, LD_LIBRARY_PATH, SENZING_ENABLE_POSTGRESQL and SENZING_ENGINE_CONFIGURATION_JSON are
# already exported by the ENV instructions, so they are not repeated as per-command prefixes.
RUN curl --fail --silent --show-error --location \
    --output "${SENZING_DOWNLOAD_FILE}" \
    https://raw.githubusercontent.com/Senzing/docker-init-container/main/init-container.py && \
    chmod +x "${SENZING_DOWNLOAD_FILE}" && \
    "${SENZING_DOWNLOAD_FILE}" initialize-files \
    --database-url "${SENZING_DATABASE_URL}" \
    --etc-dir "${SENZING_ETC_DIR}" \
    --g2-dir "${SENZING_G2_DIR}" \
    --data-dir "${SENZING_DATA_DIR}" \
    --var-dir "${SENZING_VAR_DIR}"
# Install Python packages via Poetry
WORKDIR /app
# The official installer places the `poetry` executable in /root/.local/bin
RUN curl -sSL https://install.python-poetry.org | python3 -
ENV PATH="/root/.local/bin:$PATH"
# Copy only the dependency manifests so this layer is rebuilt only when they change
COPY pyproject.toml poetry.lock /app/
# Install into the image's own Python (no virtualenv) and skip the project package (--no-root).
# `--without dev` replaces `--no-dev`, which is deprecated and no longer accepted by Poetry 2.x
# (the version the unpinned installer now provides).
RUN poetry config virtualenvs.create false && \
    poetry config installer.max-workers 10 && \
    poetry install --without dev --no-interaction --no-ansi --no-root -vvv && \
    poetry cache clear pypi --all -n
# Drop privileges: add an "ubuntu" account with a home directory, give it the whole Senzing tree
# (recursively) and the /app directory itself (not its contents), then switch to it
RUN useradd --create-home ubuntu \
    && chown --recursive ubuntu:ubuntu /opt/senzing \
    && chown ubuntu:ubuntu /app
USER ubuntu
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment