Last active
April 18, 2020 21:09
-
-
Save jlinoff/e31fb4af23d17f8cc216f6245a8c9b23 to your computer and use it in GitHub Desktop.
Docker image that builds parquet-tools and allows you to run parquet-tools
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Docker image that will allow one to run parquet-tools on any platform that supports docker.
#
# Build like this:
# $ docker build -f Dockerfile -t jlinoff/parquet:latest .
#
# Run it like this in the directory that contains your parquet file.
# $ docker run -it --rm --init -h jpt --name jpt -v $(pwd):/mnt jlinoff/parquet bash -i
# $ docker run -it --rm --init -h jpt --name jpt -v $(pwd):/mnt jlinoff/parquet jpt --help
# $ docker run -it --rm --init -h jpt --name jpt -v $(pwd):/mnt jlinoff/parquet jpt column-sizes test.parquet
#
# Note that jpt is just a wrapper around the parquet-tools program.
# Pin the base image instead of tracking ":latest", which can silently change
# the JDK (and the underlying distribution) between builds. The steps below
# rely on yum being available and on a JDK that accepts "java --version".
# NOTE(review): 14 is assumed to be the JDK that ":latest" resolved to when
# this file was last updated — confirm before relying on it.
FROM openjdk:14
# Initial setup: compiler toolchain, download/VCS utilities and the boost
# packages, all installed before the thrift build further down.
# The yum metadata and package caches are removed in the same layer so they
# never become part of the image.
RUN yum update -y && \
    yum group install -y "Development Tools" && \
    yum install -y \
        boost \
        boost-devel \
        curl \
        git \
        tar \
        tree \
        wget && \
    yum clean all && \
    rm -rf /var/cache/yum
# Setup the build area. WORKDIR creates the directory when it does not exist,
# so a separate "RUN mkdir -p" layer is not needed.
WORKDIR /opt/parquet
# Build and install thrift.
# Alternative download: wget -nv https://archive.apache.org/dist/thrift/0.12.0/thrift-0.12.0.tar.gz
#
# - curl -f makes an HTTP error fail the build instead of saving an error page
#   as the tarball; the download uses https rather than plain http.
# - --disable-libs configures thrift without the per-language runtime
#   libraries.
# - The tarball and the unpacked source tree are deleted in this same layer;
#   "make install" has already installed the result, so neither needs to stay
#   in the image.
RUN curl -fsSL https://archive.apache.org/dist/thrift/0.12.0/thrift-0.12.0.tar.gz --output /tmp/thrift.tar.gz && \
    tar xzf /tmp/thrift.tar.gz && \
    rm -f /tmp/thrift.tar.gz && \
    cd thrift-0.12.0 && \
    ./bootstrap.sh && \
    chmod +x ./configure && \
    ./configure --disable-libs && \
    make install && \
    cd .. && \
    rm -rf thrift-0.12.0
# Install maven.
# The tarball is fetched from archive.apache.org, which keeps every released
# version. A version-pinned URL on a download mirror stops resolving once that
# release is rotated off the mirror.
# curl -f turns an HTTP error into a build failure, and the archive is
# unpacked explicitly into /opt/parquet, where the final "ls" (and the PATH
# entry set afterwards) expect to find mvn.
RUN curl -fsSL https://archive.apache.org/dist/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz --output /tmp/maven.tar.gz && \
    tar xzf /tmp/maven.tar.gz -C /opt/parquet && \
    rm -f /tmp/maven.tar.gz && \
    ls -l /opt/parquet/apache-maven-3.6.3/bin/mvn
# Update the path environment variable so that mvn can be found.
# The maven bin directory is appended, so existing PATH entries keep priority.
ENV PATH="${PATH}:/opt/parquet/apache-maven-3.6.3/bin"
# Verify the installation of thrift and maven, and print the JDK version.
# Any command that fails here stops the image build before the parquet build
# is attempted; the directory listings are only there for the build log.
RUN ls -l /opt && \
    ls -l /opt/parquet && \
    mvn --version && \
    thrift --version && \
    java --version
# Build parquet tools.
# Skipping tests is not a great idea but they take a really
# long time to run. They can be re-enabled once this image
# is stabilized.
#
# The clone is shallow (--depth 1) so the repository history is not stored in
# the image; only the checked-out tree is used by the build.
# NOTE(review): the default branch is cloned unpinned, while the jpt wrapper
# script hard-codes parquet-tools-1.12.0-SNAPSHOT.jar. Pinning a known-good
# commit here would keep the two in step — TODO confirm which ref to use.
RUN git clone --depth 1 https://github.com/apache/parquet-mr.git && \
    cd parquet-mr && \
    LC_ALL=C mvn -pl :parquet-tools -am -Plocal -DskipTests clean install
# Create the jpt (joe's parquet tool interface) script.
# The wrapper forwards its arguments with "$@" so each one reaches
# parquet-tools unchanged; an unquoted $* would re-split arguments such as
# file names that contain spaces. exec replaces the wrapper shell with the
# java process instead of leaving the shell as its parent.
RUN echo '#!/usr/bin/env bash' > /usr/local/bin/jpt && \
    echo 'exec java -jar /opt/parquet/parquet-mr/parquet-tools/target/parquet-tools-1.12.0-SNAPSHOT.jar "$@"' >> /usr/local/bin/jpt && \
    chmod a+x /usr/local/bin/jpt
# Set the default work directory. The usage examples at the top of this file
# bind-mount the host directory at /mnt, so relative parquet file names given
# to jpt resolve against it.
WORKDIR /mnt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment