Last active
August 18, 2024 09:53
-
-
Save jitsejan/f3991e5be9495e17aedc16b6512bd209 to your computer and use it in GitHub Desktop.
PySpark, Docker and S3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark import SparkContext, SparkConf, SQLContext

# Spark configuration for reading from S3 through the Hadoop s3a connector.
# `profile_info` must already hold the AWS profile settings (access key,
# secret key and region) when this statement runs.
conf = (
    SparkConf()
    # Option values are given as explicit lowercase strings rather than
    # Python booleans, so the stored value is "true" and not "True".
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.access.key", profile_info.get('aws_access_key_id'))
    .set("spark.hadoop.fs.s3a.secret.key", profile_info.get('aws_secret_access_key'))
    # Use the standard regional endpoint form "s3.<region>.amazonaws.com";
    # the legacy dash form "s3-<region>" is only supported by some older regions.
    .set("spark.hadoop.fs.s3a.endpoint", f"s3.{profile_info.get('region')}.amazonaws.com")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .set("com.amazonaws.services.s3.enableV4", "true")
    # Enable V4 request signing in the AWS SDK on the driver and on the
    # executors, since both sides talk to S3.
    .set("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
    .set("spark.executor.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from configparser import ConfigParser

# Load the "prod" profile from the AWS credentials file.
credentials_path = "/home/jovyan/.aws/credentials"

config_object = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file was not read.
# Fail here with a clear message instead of a bare KeyError on the next line.
if not config_object.read(credentials_path):
    raise FileNotFoundError(
        f"AWS credentials file not found or unreadable: {credentials_path}"
    )
profile_info = config_object["prod"]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[prod] | |
aws_access_key_id = xxxxxxyyyyyyy | |
aws_secret_access_key = zzzzzzzzyyyyyyy | |
region = eu-west-2 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compose definition for the PySpark notebook container.
version: '3'
services:
  jitsejan-pyspark:
    # NOTE(review): running as root in privileged mode with GRANT_SUDO gives
    # the container very broad access to the host; confirm all three are
    # really required before using this outside a local sandbox.
    user: root
    privileged: true
    image: jitsejan/pyspark-notebook
    restart: always
    volumes:
      - ./notebooks:/opt/notebooks
      - ./data:/opt/data
      # AWS credentials are mounted read-only into the notebook user's home.
      - $HOME/.aws/credentials:/home/jovyan/.aws/credentials:ro
    environment:
      - GRANT_SUDO=yes
    ports:
      - "8488:8488"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
FROM jupyter/pyspark-notebook

USER root

# Add essential packages and generate the en_US.UTF-8 locale in a single
# layer, then remove the apt package lists so they are not kept in the image.
RUN apt-get update \
    && apt-get install -y build-essential curl git gnupg2 nano apt-transport-https software-properties-common locales \
    && echo "en_US.UTF-8 UTF-8" > /etc/locale.gen \
    && locale-gen \
    && rm -rf /var/lib/apt/lists/*

# Add config to Jupyter notebook
COPY jupyter/jupyter_notebook_config.py /home/jovyan/.jupyter/
# NOTE(review): this makes the whole home directory world-writable; consider
# a narrower permission fix if the image is used outside a local sandbox.
RUN chmod -R 777 /home/jovyan/

# Spark libraries for S3 access.
# NOTE(review): hadoop-aws must match the Hadoop version bundled with the
# Spark build in the base image, and aws-java-sdk must match what that
# hadoop-aws release was built against; verify these pins against the image.
RUN wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar -P $SPARK_HOME/jars/ \
    && wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.3/hadoop-aws-2.7.3.jar -P $SPARK_HOME/jars/

USER $NB_USER

# Install Python requirements (no pip cache, to keep the layer small)
COPY requirements.txt /home/jovyan/
RUN pip install --no-cache-dir -r /home/jovyan/requirements.txt

# Install NLTK data
RUN python -c "import nltk; nltk.download('popular')"

# Custom styling
RUN mkdir -p /home/jovyan/.jupyter/custom
COPY custom/custom.css /home/jovyan/.jupyter/custom/

# NB extensions
RUN jupyter contrib nbextension install --user
RUN jupyter nbextensions_configurator enable --user

# Run the notebook
CMD ["/opt/conda/bin/jupyter", "lab", "--allow-root"]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Jupyter configuration file; `get_config()` is provided by Jupyter when it
# loads this file.
c = get_config()

# Network settings
c.NotebookApp.ip = '*'
c.NotebookApp.port = 8488
c.NotebookApp.allow_origin = '*'
c.NotebookApp.open_browser = False

# Login password hash
# NOTE(review): the value below looks like a placeholder; replace it with a
# real hash before use.
c.NotebookApp.password = 'sha1:a123:345345'

# Directory the notebook server starts in
c.NotebookApp.notebook_dir = '/opt/notebooks/'

# Interactive shell behaviour
c.InteractiveShell.ast_node_interactivity = "all"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# getOrCreate is a classmethod: call it on the class so that an already
# running SparkContext is reused. Constructing SparkContext(conf=conf) first
# would raise if a context is already active (e.g. when re-running the cell).
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)
# Read the Parquet dataset from S3 through the s3a connector.
df = sqlContext.read.parquet("s3a://datalake/warehouse/platform/company_list/")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @jamesnos, you should use the following to get the password you want.