PySpark script for downloading a single parquet file from Amazon S3 via the s3a protocol. It reads the credentials from ~/.aws/credentials, so they don't need to be hardcoded. See also https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html.
#
# Some constants
#
aws_profile = "your_profile"
aws_region = "your_region"
s3_bucket = "your_bucket"

#
# Reading the credentials from the AWS credentials file
#
import os
import configparser

config = configparser.ConfigParser()
config.read(os.path.expanduser("~/.aws/credentials"))
access_id = config.get(aws_profile, "aws_access_key_id")
access_key = config.get(aws_profile, "aws_secret_access_key")

#
# Configuring pyspark
#
# see https://github.com/jupyter/docker-stacks/issues/127#issuecomment-214594895
# and https://github.com/radanalyticsio/pyspark-s3-notebook/blob/master/s3-source-example.ipynb
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages=org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell"
# If this doesn't work, you might have to delete your ~/.ivy2 directory to reset your package cache
# (see https://github.com/databricks/spark-redshift/issues/244#issuecomment-239950148).
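# Note: the hadoop-aws version must match the Hadoop version your Spark
# distribution was built against (2.7.3 here, i.e. a Hadoop 2.7.x build).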
import pyspark

sc = pyspark.SparkContext()
# see https://github.com/databricks/spark-redshift/issues/298#issuecomment-271834485
sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")

# see https://stackoverflow.com/questions/28844631/how-to-set-hadoop-configuration-values-from-pyspark
hadoop_conf = sc._jsc.hadoopConfiguration()
# see https://stackoverflow.com/questions/43454117/how-do-you-use-s3a-with-spark-2-1-0-on-aws-us-east-2
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("com.amazonaws.services.s3.enableV4", "true")
hadoop_conf.set("fs.s3a.access.key", access_id)
hadoop_conf.set("fs.s3a.secret.key", access_key)
# see http://blog.encomiabile.it/2015/10/29/apache-spark-amazon-s3-and-apache-mesos/
hadoop_conf.set("fs.s3a.connection.maximum", "100000")
# see https://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region
hadoop_conf.set("fs.s3a.endpoint", "s3." + aws_region + ".amazonaws.com")
#
# Downloading the parquet file
#
sql = pyspark.sql.SparkSession(sc)
path = s3_bucket + "your_path"
dataS3 = sql.read.parquet("s3a://" + path)
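Once loaded, the DataFrame can be inspected or written back out; a minimal sketch (the local output path is just an example):

# Inspect the schema and a sample of rows
dataS3.printSchema()
dataS3.show(5)
# Write a local copy of the data (example path)
dataS3.write.mode("overwrite").parquet("/tmp/local_copy.parquet")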
Try to use "s3a" and not "s3".
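For example, with the configuration above only the "s3a" filesystem is set up, so the first line works while the second would not go through the S3A connector at all:

dataS3 = sql.read.parquet("s3a://" + path)   # uses the S3A connector configured above
# dataS3 = sql.read.parquet("s3://" + path)  # would fall back to the legacy "s3" filesystem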