Last active
July 22, 2024 01:49
-
-
Save rjurney/55827d16a4010c1c65c1bd79eade6574 to your computer and use it in GitHub Desktop.
Senzing Dockerfile for Python environment setup
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compose stack: an interactive PySpark/Senzing CLI container, the PostgreSQL database that
# backs Senzing, and a one-shot job that initializes the Senzing schema in that database.
services:
  # Long-lived shell container built from Dockerfile.cli; the project directory is mounted
  # at /app and the container idles on `tail` so it can be entered with `docker exec`.
  cli:
    image: rjurney/pyspark-poetry:latest
    container_name: cli
    build:
      context: .
      dockerfile: Dockerfile.cli
    volumes:
      - .:/app
    networks:
      - icij
    environment:
      - PYSPARK_DRIVER_PYTHON=ipython
      # Quoted so YAML never tries to interpret the JSON braces/colons; the value is unchanged.
      - 'SENZING_ENGINE_CONFIGURATION_JSON={"PIPELINE":{"CONFIGPATH":"/etc/opt/senzing","RESOURCEPATH":"/opt/senzing/g2/resources","SUPPORTPATH":"/opt/senzing/data/current"},"SQL":{"CONNECTION":"postgresql://postgres:senzing@postgres:5432:G2"}}'
    command: tail -f /dev/null
    restart: always
    tty: true
    stdin_open: true

  # PostgreSQL instance holding the G2 database.
  postgres:
    image: postgres:latest
    container_name: postgres
    restart: always
    # set shared memory limit when using docker-compose
    shm_size: 256mb
    environment:
      POSTGRES_USER: postgres
      PGUSER: postgres
      POSTGRES_PASSWORD: senzing
      POSTGRES_DB: G2
    volumes:
      # NOTE(review): this file is only mounted; nothing here passes
      # `-c config_file=/etc/postgresql/postgresql.conf` to the server, so confirm
      # whether it is actually meant to be loaded.
      - "./postgres.conf:/etc/postgresql/postgresql.conf"
    networks:
      - icij
    ports:
      - "5432:5432"
    healthcheck:
      # CMD-SHELL takes exactly one command string. The previous list form passed
      # "-U" and "postgres" as positional shell parameters, not as pg_isready arguments.
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 10s
      timeout: 5s
      retries: 5

  # One-shot job that runs once PostgreSQL reports healthy; it is never restarted on exit.
  senzing-init:
    image: senzing/init-postgresql:latest
    container_name: senzing-init
    restart: "no"
    depends_on:
      postgres:
        condition: service_healthy
        restart: true
    networks:
      - icij
    environment:
      # NOTE(review): SUPPORTPATH here is /opt/senzing/data while the cli service uses
      # /opt/senzing/data/current — confirm the difference is intentional.
      - 'SENZING_ENGINE_CONFIGURATION_JSON={"PIPELINE":{"CONFIGPATH":"/etc/opt/senzing","RESOURCEPATH":"/opt/senzing/g2/resources","SUPPORTPATH":"/opt/senzing/data"},"SQL":{"CONNECTION":"postgresql://postgres:senzing@postgres:5432:G2"}}'
    command: "mandatory"

# Shared bridge network so the services can reach each other by service name.
networks:
  icij:
    name: icij
    driver: bridge
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Dockerfile for a PySpark + Senzing command-line environment. The image is based on the
# miniconda3 image and installs OpenJDK 17, Spark 3.5.1 (Hadoop 3, Scala 2.13), the Senzing API
# and Poetry, then uses Poetry to install the Python packages specified in pyproject.toml.
# The Python version is whatever the selected miniconda3 tag ships.
#
# MINICONDA_TAG defaults to "latest", which is what the previously untagged FROM resolved to.
# Pass `--build-arg MINICONDA_TAG=<tag>` to pin the base image for reproducible builds.
ARG MINICONDA_TAG=latest
FROM continuumio/miniconda3:${MINICONDA_TAG}
# System packages: OpenJDK 17, download tools, a C/C++ build toolchain, git and the
# PostgreSQL client library plus its headers. The apt lists are removed in the same layer
# so they never end up in the image.
#
# The JDK directory name contains the Debian architecture (…-amd64, …-arm64). A stable,
# architecture-independent symlink is created so JAVA_HOME is valid on every build platform
# instead of only on arm64.
RUN apt-get update && \
    apt-get install -y \
        apt-transport-https \
        autoconf \
        automake \
        build-essential \
        curl \
        git \
        libpq-dev \
        libpq5 \
        libtool \
        openjdk-17-jdk-headless \
        pkg-config \
        wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
    ln -sfn "/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)" /usr/lib/jvm/java-17-openjdk
ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk
# Spark release coordinates; together they select the archive file name downloaded below.
ENV SPARK_VERSION=3.5.1 \
    HADOOP_VERSION=3 \
    SCALA_VERSION=2.13
# Download Spark from the Apache archive and unpack it directly into /opt/spark.
# --strip-components=1 drops the versioned top-level directory, giving the same layout the
# former `tar` + `mv` produced. The tarball is staged in /tmp and removed in the same layer.
RUN wget -nv -O /tmp/spark.tgz "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${SCALA_VERSION}.tgz" && \
    mkdir -p /opt/spark && \
    tar -xzf /tmp/spark.tgz -C /opt/spark --strip-components=1 && \
    rm /tmp/spark.tgz
ENV SPARK_HOME=/opt/spark
# Senzing environment. Values that do not depend on each other are set first; derived paths
# follow in later ENV instructions because an ENV instruction cannot see variables that are
# assigned within that same instruction.
ENV SENZING_ACCEPT_EULA=I_ACCEPT_THE_SENZING_EULA \
    SENZING_DOWNLOAD_FILE=./init-container.py \
    SENZING_VOLUME=/opt/senzing \
    SENZING_ETC_DIR=/etc/opt/senzing \
    SENZING_ENABLE_POSTGRESQL=1 \
    REFRESHED_AT=2024-03-18 \
    SENZING_DATABASE_URL="postgresql://postgres:senzing@postgres:5432/G2" \
    PATH=/opt/senzing/g2/python:/opt/senzing/g2/bin:${PATH}
# Paths derived from SENZING_VOLUME.
ENV SENZING_G2_DIR=${SENZING_VOLUME}/g2 \
    SENZING_DATA_DIR=${SENZING_VOLUME}/data/current \
    SENZING_VAR_DIR=${SENZING_VOLUME}/var
# Paths derived from SENZING_G2_DIR.
ENV SENZING_RESOURCE_DIR=${SENZING_G2_DIR}/resources \
    LD_LIBRARY_PATH=${SENZING_G2_DIR}/lib \
    PYTHONPATH=${SENZING_G2_DIR}/sdk/python
# Default engine configuration. CONNECTION uses the "host:port:database" form, the same
# string the docker-compose services pass for this variable, rather than the "/G2" URL form
# held in SENZING_DATABASE_URL. The configuration is deliberately no longer echoed during
# the build: that added a layer and printed the database password into the build log.
ENV SENZING_ENGINE_CONFIGURATION_JSON="{\"PIPELINE\":{\"CONFIGPATH\":\"${SENZING_ETC_DIR}\",\"RESOURCEPATH\":\"${SENZING_RESOURCE_DIR}\",\"SUPPORTPATH\":\"${SENZING_DATA_DIR}\"},\"SQL\":{\"CONNECTION\":\"postgresql://postgres:senzing@postgres:5432:G2\",\"DATABASE\":\"G2\",\"DRIVER\":\"org.postgresql.Driver\",\"HOST\":\"postgres\",\"PASSWORD\":\"senzing\",\"PORT\":\"5432\",\"PROTOCOL\":\"postgresql\",\"SCHEMA\":\"G2\",\"USER\":\"postgres\"}}"
# Install the Senzing API: register Senzing's apt repository via its bootstrap .deb, refresh
# the package lists, then install senzingapi. Every apt step is non-interactive (-y) so the
# build cannot stall on a prompt, and the .deb plus apt lists are removed in the same layer.
# /root stays the working directory for the steps that follow.
WORKDIR /root
RUN wget -nv https://senzing-production-apt.s3.amazonaws.com/senzingrepo_2.0.0-1_all.deb && \
    apt-get install -y ./senzingrepo_2.0.0-1_all.deb && \
    apt-get update && \
    apt-get install -y senzingapi && \
    rm -f senzingrepo_2.0.0-1_all.deb && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Prepare the Senzing directory tree and placeholder configuration files.
# Directories are created first so the recursive chown always has a target and also covers
# anything just created. The directories are left world-writable (777), as before.
# The final `touch` argument is the etc directory itself, which refreshes its timestamp.
RUN mkdir -p \
        "${SENZING_G2_DIR}" \
        "${SENZING_ETC_DIR}" \
        "${SENZING_DATA_DIR}" \
        "${SENZING_VAR_DIR}" \
        "${SENZING_RESOURCE_DIR}" && \
    chown -R "$(id -u):$(id -g)" "${SENZING_VOLUME}" && \
    chmod 777 \
        "${SENZING_G2_DIR}" \
        "${SENZING_ETC_DIR}" \
        "${SENZING_DATA_DIR}" \
        "${SENZING_VAR_DIR}" \
        "${SENZING_RESOURCE_DIR}" && \
    touch \
        "${SENZING_ETC_DIR}/senzing.g2.lic" \
        "${SENZING_ETC_DIR}/senzing.g2config.json" \
        "${SENZING_ETC_DIR}/senzing.g2module.ini" \
        "${SENZING_ETC_DIR}/senzing.g2resources.ini" \
        "${SENZING_ETC_DIR}/senzing.g2license.json" \
        "${SENZING_ETC_DIR}/senzing.g2audit.json" \
        "${SENZING_ETC_DIR}"
# Fetch Senzing's init-container.py and run its `initialize-files` sub-command against the
# directories configured through the SENZING_* environment variables.
#
# SENZING_INIT_CONTAINER_REF selects the git ref to download from; it defaults to "main"
# (the previous behaviour) and can be pinned with --build-arg for reproducible builds.
# curl runs with -f so an HTTP error aborts the build instead of saving the error page as
# the script. PYTHONPATH, LD_LIBRARY_PATH, SENZING_ENABLE_POSTGRESQL and
# SENZING_ENGINE_CONFIGURATION_JSON are already exported by ENV, so they are not repeated
# inline. SENZING_DOWNLOAD_FILE is a relative path, resolved against the current WORKDIR.
ARG SENZING_INIT_CONTAINER_REF=main
RUN curl -fsSL \
        --output "${SENZING_DOWNLOAD_FILE}" \
        "https://raw.githubusercontent.com/Senzing/docker-init-container/${SENZING_INIT_CONTAINER_REF}/init-container.py" && \
    chmod +x "${SENZING_DOWNLOAD_FILE}" && \
    "${SENZING_DOWNLOAD_FILE}" initialize-files \
        --database-url "${SENZING_DATABASE_URL}" \
        --etc-dir "${SENZING_ETC_DIR}" \
        --g2-dir "${SENZING_G2_DIR}" \
        --data-dir "${SENZING_DATA_DIR}" \
        --var-dir "${SENZING_VAR_DIR}"
# Install Python packages via Poetry.
# The installer is downloaded to a file and then executed, so a failed download stops the
# build; piping curl into python without pipefail would hide that failure.
WORKDIR /app
RUN curl -fsSL https://install.python-poetry.org -o /tmp/install-poetry.py && \
    python3 /tmp/install-poetry.py && \
    rm /tmp/install-poetry.py
# NOTE(review): Poetry lands under /root/.local; confirm whether a non-root runtime user
# is expected to be able to run `poetry` itself.
ENV PATH="/root/.local/bin:$PATH"
# Only the dependency manifests are copied, so this layer is rebuilt only when they change.
COPY pyproject.toml poetry.lock /app/
# Packages go straight into the image's Python (no virtualenv). `--only main` replaces
# `--no-dev`, which newer Poetry releases no longer accept; the installer above always
# fetches the latest Poetry, so the old flag would break the build.
RUN poetry config virtualenvs.create false && \
    poetry config installer.max-workers 10 && \
    poetry install --only main --no-interaction --no-ansi --no-root -vvv && \
    poetry cache clear pypi --all -n
# Create an unprivileged "ubuntu" user with a home directory, hand it the Senzing tree and
# the /app working directory, and make it the default user for containers started from
# this image.
RUN useradd --create-home ubuntu && \
    chown -R ubuntu:ubuntu /opt/senzing && \
    chown ubuntu:ubuntu /app
USER ubuntu
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment