-
-
Save bskaggs/fc3c8d0d553be54e2645616236fdc8c6 to your computer and use it in GitHub Desktop.
# Builds Apache Arrow C++ (Parquet, Python and Plasma enabled) and pyarrow from
# source on Alpine, installing both under /usr/local.
FROM python:3.7-alpine3.8

# Toolchain and development headers used by the Arrow C++ / pyarrow build below.
RUN apk add --no-cache \
        autoconf \
        bash \
        bison \
        boost-dev \
        build-base \
        cmake \
        flex \
        jemalloc-dev \
        zlib-dev

# Python packages installed before pyarrow is built; pandas gets its own layer.
RUN pip install --no-cache-dir six pytest numpy cython
RUN pip install --no-cache-dir pandas

# ARROW_SHA1 must be updated together with ARROW_VERSION: the build now fails
# when the downloaded tarball does not match it.
ARG ARROW_VERSION=0.12.0
ARG ARROW_SHA1=2ede75769e12df972f0acdfddd53ab15d11e0ac2
ARG ARROW_BUILD_TYPE=release

ENV ARROW_HOME=/usr/local \
    PARQUET_HOME=/usr/local

# Download and build apache-arrow.
# - curl -f makes an HTTP error abort the step instead of saving the error page.
# - sha1sum needs -c to compare digests; without it the tool only prints the
#   hash of the file and never looks at the expected value on stdin.
# - The source tree and tarball are removed in this same RUN so they never end
#   up in an image layer.
RUN mkdir /arrow \
    && apk add --no-cache curl \
    && curl -fSL -o /tmp/apache-arrow.tar.gz https://github.com/apache/arrow/archive/apache-arrow-${ARROW_VERSION}.tar.gz \
    && echo "${ARROW_SHA1}  /tmp/apache-arrow.tar.gz" | sha1sum -c - \
    && tar -xvf /tmp/apache-arrow.tar.gz -C /arrow --strip-components 1 \
    && mkdir -p /arrow/cpp/build \
    && cd /arrow/cpp/build \
    && cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \
        -DCMAKE_INSTALL_LIBDIR=lib \
        -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
        -DARROW_PARQUET=on \
        -DARROW_PYTHON=on \
        -DARROW_PLASMA=on \
        -DARROW_BUILD_TESTS=OFF \
        .. \
    && make -j$(nproc) \
    && make install \
    && cd /arrow/python \
    && python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --with-parquet \
    && python setup.py install \
    && rm -rf /arrow /tmp/apache-arrow.tar.gz
For Arrow 3.0.0 with Python 3.8:
# Builds Apache Arrow C++ (Parquet, Python, Plasma and the common compression
# codecs enabled) and pyarrow from source on Alpine, installing under /usr/local.
FROM python:3.8-alpine

# Upgrade the base packages, then add the toolchain and development headers
# used by the build below. --no-cache on both commands keeps the apk index out
# of the layer (a separate `apk update` would leave it in /var/cache/apk).
RUN apk upgrade --no-cache \
    && apk add --no-cache \
        autoconf \
        bash \
        bison \
        boost-dev \
        build-base \
        cmake \
        flex \
        libressl-dev \
        zlib-dev

# Python packages installed before pyarrow is built; pandas gets its own layer.
RUN pip install --no-cache-dir six pytest numpy cython
RUN pip install --no-cache-dir pandas

# ARROW_SHA1 must be updated together with ARROW_VERSION: the build now fails
# when the downloaded tarball does not match it.
ARG ARROW_VERSION=3.0.0
ARG ARROW_SHA1=c1fed962cddfab1966a0e03461376ebb28cf17d3
ARG ARROW_BUILD_TYPE=release

ENV ARROW_HOME=/usr/local \
    PARQUET_HOME=/usr/local

# Download and build apache-arrow.
# - sha1sum needs -c to compare digests; without it the tool only prints the
#   hash of the file and never looks at the expected value on stdin.
# - The source tree and tarball are removed in this same RUN so they never end
#   up in an image layer.
RUN mkdir /arrow \
    && wget -q https://github.com/apache/arrow/archive/apache-arrow-${ARROW_VERSION}.tar.gz -O /tmp/apache-arrow.tar.gz \
    && echo "${ARROW_SHA1}  /tmp/apache-arrow.tar.gz" | sha1sum -c - \
    && tar -xvf /tmp/apache-arrow.tar.gz -C /arrow --strip-components 1 \
    && mkdir -p /arrow/cpp/build \
    && cd /arrow/cpp/build \
    && cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \
        -DOPENSSL_ROOT_DIR=/usr/local/ssl \
        -DCMAKE_INSTALL_LIBDIR=lib \
        -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
        -DARROW_WITH_BZ2=ON \
        -DARROW_WITH_ZLIB=ON \
        -DARROW_WITH_ZSTD=ON \
        -DARROW_WITH_LZ4=ON \
        -DARROW_WITH_SNAPPY=ON \
        -DARROW_PARQUET=ON \
        -DARROW_PYTHON=ON \
        -DARROW_PLASMA=ON \
        -DARROW_BUILD_TESTS=OFF \
        .. \
    && make -j$(nproc) \
    && make install \
    && cd /arrow/python \
    && python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --with-parquet \
    && python setup.py install \
    && rm -rf /arrow /tmp/apache-arrow.tar.gz
Can someone try this on python3.9-alpine?
Doesn't work for me.
I've tried a few different configurations:
- Alpine with Python 3.9.14
- without the pandas package
- with ARROW_VERSION 9.0.0
-> failed
With pandas it also failed. Do you have a solution for newer releases of Arrow?
#12 164.8 -- stderr output is:
#12 164.8 In file included from /arrow/cpp/build/thrift_ep-prefix/src/thrift_ep/lib/cpp/src/thrift/transport/TSocket.cpp:37:
#12 164.8 /usr/include/sys/poll.h:1:2: warning: #warning redirecting incorrect #include <sys/poll.h> to <poll.h> [-Wcpp]
#12 164.8 1 | #warning redirecting incorrect #include <sys/poll.h> to <poll.h>
#12 164.8 | ^~~~~~~
#12 164.8 In file included from /arrow/cpp/build/thrift_ep-prefix/src/thrift_ep/lib/cpp/src/thrift/transport/TServerSocket.cpp:33:
#12 164.8 /usr/include/sys/poll.h:1:2: warning: #warning redirecting incorrect #include <sys/poll.h> to <poll.h> [-Wcpp]
#12 164.8 1 | #warning redirecting incorrect #include <sys/poll.h> to <poll.h>
#12 164.8 | ^~~~~~~
#12 164.8 In file included from /arrow/cpp/build/thrift_ep-prefix/src/thrift_ep/lib/cpp/src/thrift/transport/TSSLSocket.cpp:34:
#12 164.8 /usr/include/sys/poll.h:1:2: warning: #warning redirecting incorrect #include <sys/poll.h> to <poll.h> [-Wcpp]
#12 164.8 1 | #warning redirecting incorrect #include <sys/poll.h> to <poll.h>
#12 164.8 | ^~~~~~~
#12 164.8 /arrow/cpp/build/thrift_ep-prefix/src/thrift_ep/lib/cpp/src/thrift/transport/TSSLSocket.cpp: In function 'void apache::thrift::transport::cleanupOpenSSL()':
#12 164.8 /arrow/cpp/build/thrift_ep-prefix/src/thrift_ep/lib/cpp/src/thrift/transport/TSSLSocket.cpp:157:3: error: 'OPENSSL_thread_stop' was not declared in this scope; did you mean 'OPENSSL_realloc'?
#12 164.8 157 | OPENSSL_thread_stop();
#12 164.8 | ^~~~~~~~~~~~~~~~~~~
#12 164.8 | OPENSSL_realloc
#12 164.8 /arrow/cpp/build/thrift_ep-prefix/src/thrift_ep/lib/cpp/src/thrift/transport/TSSLSocket.cpp: In member function 'virtual void apache::thrift::transport::TSSLSocket::close()':
#12 164.8 /arrow/cpp/build/thrift_ep-prefix/src/thrift_ep/lib/cpp/src/thrift/transport/TSSLSocket.cpp:395:5: error: 'OPENSSL_thread_stop' was not declared in this scope; did you mean 'OPENSSL_realloc'?
#12 164.8 395 | OPENSSL_thread_stop();
#12 164.8 | ^~~~~~~~~~~~~~~~~~~
#12 164.8 | OPENSSL_realloc
#12 164.8 make[5]: *** [lib/cpp/CMakeFiles/thrift.dir/build.make:566: lib/cpp/CMakeFiles/thrift.dir/src/thrift/transport/TSSLSocket.cpp.o] Error 1
#12 164.8 make[5]: *** Waiting for unfinished jobs....
#12 164.8 make[4]: *** [CMakeFiles/Makefile2:125: lib/cpp/CMakeFiles/thrift.dir/all] Error 2
#12 164.8 make[3]: *** [Makefile:156: all] Error 2
#12 164.8
#12 164.8 CMake Error at /arrow/cpp/build/thrift_ep-prefix/src/thrift_ep-stamp/thrift_ep-build-RELEASE.cmake:47 (message):
#12 164.8 Stopping after outputting logs.
#12 164.8
#12 164.8
#12 164.8 make[2]: *** [CMakeFiles/thrift_ep.dir/build.make:86: thrift_ep-prefix/src/thrift_ep-stamp/thrift_ep-build] Error 1
#12 164.8 make[1]: *** [CMakeFiles/Makefile2:940: CMakeFiles/thrift_ep.dir/all] Error 2
#12 164.8 make[1]: *** Waiting for unfinished jobs....
#12 187.8 -- re2_ep build command succeeded. See also /arrow/cpp/build/re2_ep-prefix/src/re2_ep-stamp/re2_ep-build-*.log
#12 187.8 [ 20%] Performing install step for 're2_ep'
#12 188.7 -- re2_ep install command succeeded. See also /arrow/cpp/build/re2_ep-prefix/src/re2_ep-stamp/re2_ep-install-*.log
#12 188.7 [ 20%] Completed 're2_ep'
#12 188.7 [ 20%] Built target re2_ep
#12 212.4 -- jemalloc_ep build command succeeded. See also /arrow/cpp/build/jemalloc_ep-prefix/src/jemalloc_ep-stamp/jemalloc_ep-build-*.log
#12 212.4 [ 20%] Performing install step for 'jemalloc_ep'
#12 212.5 -- jemalloc_ep install command succeeded. See also /arrow/cpp/build/jemalloc_ep-prefix/src/jemalloc_ep-stamp/jemalloc_ep-install-*.log
#12 212.5 [ 20%] Completed 'jemalloc_ep'
#12 212.5 [ 20%] Built target jemalloc_ep
#12 212.5 make: *** [Makefile:146: all] Error 2
------
executor failed running [/bin/sh -c mkdir /arrow && wget -q https://github.com/apache/arrow/archive/apache-arrow-${ARROW_VERSION}.tar.gz -O /tmp/apache-arrow.tar.gz && tar -xvf /tmp/apache-arrow.tar.gz -C /arrow --strip-components 1 && mkdir -p /arrow/cpp/build && cd /arrow/cpp/build && cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE -DOPENSSL_ROOT_DIR=/usr/local/ssl -DCMAKE_INSTALL_LIBDIR=lib -DCMAKE_INSTALL_PREFIX=$ARROW_HOME -DARROW_WITH_BZ2=ON -DARROW_WITH_ZLIB=ON -DARROW_WITH_ZSTD=ON -DARROW_WITH_LZ4=ON -DARROW_WITH_SNAPPY=ON -DARROW_PARQUET=ON -DARROW_PYTHON=ON -DARROW_PLASMA=ON -DARROW_BUILD_TESTS=OFF .. && make -j$(nproc) && make install && cd /arrow/python && python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --with-parquet && python setup.py install && rm -rf /arrow /tmp/apache-arrow.tar.gz]: exit code: 2
No, I don't. I tried a few solutions that I found by Googling, but none of them work. I'm currently totally lost.
# Installs the project's requirements.txt on Alpine with a native build toolchain.
FROM python:3.7.15-alpine3.16

# Packages that stay in the image.
RUN apk add --no-cache \
        autoconf \
        bash \
        bison \
        boost-dev \
        build-base \
        flex \
        gcc \
        gettext \
        musl-dev \
        postgresql-dev \
        zlib-dev

WORKDIR /code
COPY . /code

# cmake, g++, git and make are only wanted while pip builds the requirements.
# They are installed as a virtual package and removed in the SAME layer:
# deleting packages in a later RUN does not shrink the image, because the
# earlier layer still contains them. Packages that build-base depends on
# (g++, make) remain installed after the delete.
RUN apk add --no-cache --virtual .build-deps \
        cmake \
        g++ \
        git \
        make \
    && pip install --no-cache-dir --upgrade pip wheel \
    && pip install --no-cache-dir -r requirements.txt \
    && apk del .build-deps
-- Could NOT find Arrow (missing: Arrow_DIR)
-- Checking for module 'arrow'
-- Package 'arrow', required by 'virtual:world', not found
CMake Error at /usr/share/cmake/Modules/FindPackageHandleStandardArgs.cmake:230 (message):
Could NOT find Arrow (missing: ARROW_INCLUDE_DIR ARROW_LIB_DIR
ARROW_FULL_SO_VERSION ARROW_SO_VERSION)
Call Stack (most recent call first):
/usr/share/cmake/Modules/FindPackageHandleStandardArgs.cmake:594 (_FPHSA_FAILURE_MESSAGE)
cmake_modules/FindArrow.cmake:419 (find_package_handle_standard_args)
cmake_modules/FindArrowPython.cmake:46 (find_package)
CMakeLists.txt:218 (find_package)
I can't make this work either. If someone has knowledge about what the underlying problem is I will gladly put in some time and effort and try to make this work. Unfortunately, my knowledge about this thus far is quite limited. I would really like to work with the Alpine base image as it is a safe and small starting point. I am using python:3.11-alpine
as a base.
I can't make this work either. If someone has knowledge about what the underlying problem is I will gladly put in some time and effort and try to make this work. Unfortunately, my knowledge about this thus far is quite limited. I would really like to work with the Alpine base image as it is a safe and small starting point. I am using
python:3.11-alpine
as a base.
I finally managed to build pyarrow together with Apache Arrow, but the resulting image is 3.5 GB
and the build takes about 30 minutes. Here is the confirmed Dockerfile:
# Two stages: "base" compiles Apache Arrow C++ from source and installs the
# Pipfile dependencies into /.venv; "runtime" adds the application on top.
FROM --platform=linux/amd64 python:3.12-alpine AS base

# Setup env
ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONFAULTHANDLER=1 \
    ACCEPT_EULA=Y

# Toolchain, development headers and Python build helpers.
# `apk add --no-cache` fetches a fresh index itself, so no separate
# `apk update` (which would leave the index cached in the layer) is needed.
RUN apk add --no-cache \
        autoconf \
        bash \
        boost-dev \
        brotli-dev \
        build-base \
        bzip2-dev \
        ca-certificates \
        cargo \
        cmake \
        curl \
        flex \
        g++ \
        gcc \
        git \
        libc-dev \
        libffi-dev \
        libgcc \
        libjpeg-turbo-dev \
        libstdc++ \
        libxml2-dev \
        libxslt-dev \
        linux-headers \
        lz4-dev \
        musl-dev \
        ninja \
        openssl-dev \
        postgresql-dev \
        rust \
        snappy-dev \
        unixodbc-dev \
        xz-dev \
        zlib-dev \
        zstd-dev \
    && pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir pipenv cython numpy

# ARROW_SHA256 must be updated together with ARROW_VERSION: the build fails
# when the downloaded tarball does not match it.
ARG ARROW_VERSION=17.0.0
ARG ARROW_SHA256=8379554d89f19f2c8db63620721cabade62541f47a4e706dfb0a401f05a713ef
ARG ARROW_BUILD_TYPE=release

ENV ARROW_HOME=/usr/local \
    PARQUET_HOME=/usr/local

# Download, patch, build and install Apache Arrow in ONE layer. The source tree
# is several layers' worth of data; removing it in a later RUN would not shrink
# the image because the earlier layers would still contain it.
# Steps, in order:
#   1. download the tarball and verify it (sha256sum needs -c to compare
#      digests; without it the tool only prints the hash of the file)
#   2. write the re2 patch that adds `#include <cstdint>` to util/pcre.h
#   3. configure with the ninja-release-python preset
#   4. build the re2_ep target so its sources are fetched; a failure here is
#      tolerated on purpose (|| true) because the patch is applied afterwards
#   5. apply the patch, then build and install everything
#   6. remove the source tree and tarball
# NOTE(review): the hunk header announces 6/7 lines but only three context
# lines and one addition follow — confirm that `patch` accepts it as written.
RUN mkdir /arrow \
    && wget -q https://github.com/apache/arrow/archive/apache-arrow-${ARROW_VERSION}.tar.gz -O /tmp/apache-arrow.tar.gz \
    && echo "${ARROW_SHA256}  /tmp/apache-arrow.tar.gz" | sha256sum -c - \
    && tar -xvf /tmp/apache-arrow.tar.gz -C /arrow --strip-components 1 \
    && printf '%s\n' \
        'diff --git a/util/pcre.h b/util/pcre.h' \
        'index e69de29..b6f3e31 100644' \
        '--- a/util/pcre.h' \
        '+++ b/util/pcre.h' \
        '@@ -21,6 +21,7 @@' \
        ' #include "re2/filtered_re2.h"' \
        ' #include "re2/pod_array.h"' \
        ' #include "re2/stringpiece.h"' \
        '+#include <cstdint>' \
        > /arrow/re2_patch.diff \
    && cd /arrow/cpp \
    && cmake --preset ninja-release-python \
    && (cmake --build . --target re2_ep -- -j1 || true) \
    && cd /arrow/cpp/re2_ep-prefix/src/re2_ep \
    && patch -p1 < /arrow/re2_patch.diff \
    && cd /arrow/cpp \
    && cmake --build . --target install \
    && rm -rf /arrow /tmp/apache-arrow.tar.gz

# No WORKDIR is set in this stage, so the Pipfiles land in / and
# PIPENV_VENV_IN_PROJECT=1 makes pipenv create the environment at /.venv.
COPY Pipfile Pipfile.lock ./
RUN PIPENV_VENV_IN_PROJECT=1 pipenv install --deploy

# Final Stage
# NOTE(review): this stage starts FROM base, so it inherits the complete build
# stage (toolchain included) and the COPY below copies /.venv onto itself. To
# get a small runtime image, start from a plain python:3.12-alpine instead and
# copy only /.venv, the installed Arrow libraries and the needed runtime
# packages — the exact set depends on the Pipfile, so it is left unchanged here.
FROM base AS runtime
COPY --from=base /.venv /.venv
ENV PATH="/.venv/bin:$PATH"
WORKDIR /app
COPY src .
CMD ["python3", "main.py"]
I use pipenv to install the dependencies, so you can customize the image for your needs,
or just add RUN pip install pyarrow,
or update these lines from the Dockerfile above like this:
# Install the already-configured Arrow C++ build, then build and install
# pyarrow from the same source tree, and finally remove the sources and the
# downloaded tarball. `set -e` stops the step at the first failing command.
RUN set -e; \
    cd /arrow/cpp; \
    cmake --build . --target install; \
    cd /arrow/python; \
    python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --with-parquet; \
    python setup.py install; \
    rm -rf /arrow /tmp/apache-arrow.tar.gz
There is also a problem with pcre.h in the Apache Arrow source code, so I apply a patch inside the Docker image. Maybe it's not a bug, but I reported it here: #43350
I think necessary
I can't make this work either. If someone has knowledge about what the underlying problem is I will gladly put in some time and effort and try to make this work. Unfortunately, my knowledge about this thus far is quite limited. I would really like to work with the Alpine base image as it is a safe and small starting point. I am using
python:3.11-alpine
as a base.I've managed to build pyarrow with apache arrow finally, but the resulted image is 3,5GB And building lasts about 30 min. Here is the confirmed docker file:
FROM --platform=linux/amd64 python:3.12-alpine AS base # Setup env ENV LANG=C.UTF-8 ENV LC_ALL=C.UTF-8 ENV PYTHONDONTWRITEBYTECODE=1 ENV PYTHONFAULTHANDLER=1 ENV ACCEPT_EULA=Y RUN apk update && apk add --no-cache \ gcc \ g++ \ curl \ unixodbc-dev \ bash \ libffi-dev \ openssl-dev \ cargo \ musl-dev \ postgresql-dev \ cmake \ rust \ linux-headers \ libc-dev \ libgcc \ libstdc++ \ ca-certificates \ zlib-dev \ bzip2-dev \ xz-dev \ lz4-dev \ zstd-dev \ snappy-dev \ brotli-dev \ build-base \ autoconf \ boost-dev \ flex \ libxml2-dev \ libxslt-dev \ libjpeg-turbo-dev \ ninja \ git \ && pip install --upgrade pip && pip install pipenv cython numpy ARG ARROW_VERSION=17.0.0 ARG ARROW_SHA256=8379554d89f19f2c8db63620721cabade62541f47a4e706dfb0a401f05a713ef ARG ARROW_BUILD_TYPE=release ENV ARROW_HOME=/usr/local \ PARQUET_HOME=/usr/local RUN mkdir /arrow \ && wget -q https://github.com/apache/arrow/archive/apache-arrow-${ARROW_VERSION}.tar.gz -O /tmp/apache-arrow.tar.gz \ && echo "${ARROW_SHA256} *apache-arrow.tar.gz" | sha256sum /tmp/apache-arrow.tar.gz \ && tar -xvf /tmp/apache-arrow.tar.gz -C /arrow --strip-components 1 # Create the patch file for re2 RUN echo "diff --git a/util/pcre.h b/util/pcre.h" > /arrow/re2_patch.diff \ && echo "index e69de29..b6f3e31 100644" >> /arrow/re2_patch.diff \ && echo "--- a/util/pcre.h" >> /arrow/re2_patch.diff \ && echo "+++ b/util/pcre.h" >> /arrow/re2_patch.diff \ && echo "@@ -21,6 +21,7 @@" >> /arrow/re2_patch.diff \ && echo " #include \"re2/filtered_re2.h\"" >> /arrow/re2_patch.diff \ && echo " #include \"re2/pod_array.h\"" >> /arrow/re2_patch.diff \ && echo " #include \"re2/stringpiece.h\"" >> /arrow/re2_patch.diff \ && echo "+#include <cstdint>" >> /arrow/re2_patch.diff # Configure the build using CMake RUN cd /arrow/cpp \ && cmake --preset ninja-release-python # Pre-fetch dependencies without building RUN cd /arrow/cpp \ && cmake --build . 
--target re2_ep -- -j1 || true # Apply the patch to re2 after the dependencies are fetched but before the build RUN cd /arrow/cpp/re2_ep-prefix/src/re2_ep \ && patch -p1 < /arrow/re2_patch.diff # Continue with the build and install Apache Arrow RUN cd /arrow/cpp \ && cmake --build . --target install \ && rm -rf /arrow /tmp/apache-arrow.tar.gz COPY Pipfile . COPY Pipfile.lock . RUN PIPENV_VENV_IN_PROJECT=1 pipenv install --deploy # Final Stage FROM base AS runtime COPY --from=base /.venv /.venv ENV PATH="/.venv/bin:$PATH" WORKDIR /app COPY src . CMD ["python3", "main.py"]I use pipenv to build dependencies, so you can customize the image for your needs. or just add
RUN pip install pyarrow
or update these lines from original Dockerfile above like:RUN cd /arrow/cpp \ && cmake --build . --target install \ && cd /arrow/python \ && python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --with-parquet \ && python setup.py install \ && rm -rf /arrow /tmp/apache-arrow.tar.gz
there is also a bug with pcre.h in apache arrow source code, so I applied patch within the Dockerimage. maybe it's not bug though, but I reported it here: #43350
I felt I had to reply here, because that Dockerfile helped me so much. Thanks!
This works to build arrow 0.15.1