PySpark script for downloading a single Parquet file from Amazon S3 via the s3a protocol. It also reads the credentials from ~/.aws/credentials, so we don't need to hardcode them. See also https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html .
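For reference, the script expects a named profile section in ~/.aws/credentials; a minimal sketch of that file (the profile name "your_profile" and the key values below are placeholders, the keys shown are AWS's documented example values):

[your_profile]
aws_access_key_id = AKIAIOSFODNN7EXAMPLE
aws_secret_access_key = wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY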
#
# Some constants
#
aws_profile = "your_profile"
aws_region = "your_region"
s3_bucket = "your_bucket"

#
# Reading environment variables from aws credential file
#
import os
import configparser

config = configparser.ConfigParser()
config.read(os.path.expanduser("~/.aws/credentials"))
access_id = config.get(aws_profile, "aws_access_key_id")
access_key = config.get(aws_profile, "aws_secret_access_key")

#
# Configuring pyspark
#
# see https://github.com/jupyter/docker-stacks/issues/127#issuecomment-214594895
# and https://github.com/radanalyticsio/pyspark-s3-notebook/blob/master/s3-source-example.ipynb
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages=org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell"
# If this doesn't work you might have to delete your ~/.ivy2 directory to reset your package cache.
# (see https://github.com/databricks/spark-redshift/issues/244#issuecomment-239950148)
import pyspark

sc = pyspark.SparkContext()
# see https://github.com/databricks/spark-redshift/issues/298#issuecomment-271834485
sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
# see https://stackoverflow.com/questions/28844631/how-to-set-hadoop-configuration-values-from-pyspark
hadoop_conf = sc._jsc.hadoopConfiguration()
# see https://stackoverflow.com/questions/43454117/how-do-you-use-s3a-with-spark-2-1-0-on-aws-us-east-2
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("com.amazonaws.services.s3.enableV4", "true")
hadoop_conf.set("fs.s3a.access.key", access_id)
hadoop_conf.set("fs.s3a.secret.key", access_key)
# see http://blog.encomiabile.it/2015/10/29/apache-spark-amazon-s3-and-apache-mesos/
hadoop_conf.set("fs.s3a.connection.maximum", "100000")
# see https://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region
hadoop_conf.set("fs.s3a.endpoint", "s3." + aws_region + ".amazonaws.com")

#
# Downloading the parquet file
#
sql = pyspark.sql.SparkSession(sc)
path = s3_bucket + "your_path"
dataS3 = sql.read.parquet("s3a://" + path)
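As a quick sanity check (assuming the read above succeeds), you can inspect the loaded DataFrame:

# print the Parquet schema and the first few rows of the downloaded data
dataS3.printSchema()
dataS3.show(5)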
Try to use "s3a" and not "s3".
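For example (a sketch with placeholder bucket and key names), the URI scheme decides which Hadoop filesystem implementation handles the read:

# "s3a://" is served by the org.apache.hadoop.fs.s3a.S3AFileSystem configured above
df = sql.read.parquet("s3a://your_bucket/your_path")
# "s3://" would go through a different (legacy) filesystem class and ignore the fs.s3a.* settings
# df = sql.read.parquet("s3://your_bucket/your_path")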
Hello Techmates,
I have created my AWS free account and uploaded a weather file to a bucket (region: sa-east-1, South America).
Afterwards, I have been trying to read the file from the AWS S3 bucket with pyspark as below:
from pyspark import SparkConf, SparkContext
ak = ''
sk = ''
sc._jsc.hadoopConfiguration().set("fs.s3.impl", "org.apache.hadoop.fs.s3.S3FileSystem")
sc._jsc.hadoopConfiguration().set("fs.s3.awsAccessKeyId", ak)
sc._jsc.hadoopConfiguration().set("fs.s3.awsSecretAccessKey", sk)
a = sc.textFile("s3://bucket_name/weatherhistory.txt")
a.collect()
But it shows the error: /weatherhistory.txt does not exists.
However, when I try the same using Python (boto), I can easily read the file.
import boto
import boto.s3.connection
access_key = ''
secret_key = ''
conn = boto.connect_s3(bucket_name,
                       aws_access_key_id=access_key,
                       aws_secret_access_key=secret_key)
.....
.....
I have even listed the keys in spark-default.conf as well:
[default]
aws_access_key_id=*****
aws_secret_access_key=*****
But the error still appears: /weatherhistory.txt does not exists.
I have tried this approach as well, but the error is the same:
conf = (SparkConf()
        .setAppName("S3 Configuration Test")
        .set("spark.executor.instances", "1")
        .set("spark.executor.cores", 1)
        .set("spark.executor.memory", "2g")
        .set("fs.s3.awsAccessKeyId", "")
        .set("fs.s3.awsSecretAccessKey", "")
        .set("fs.s3.endpoint", "s3-sa-east-1.amazonaws.com")
        .set("com.amazonaws.services.s3.enableV4", "true")
        .set("fs.s3.impl", "org.apache.hadoop.fs.s3.S3FileSystem"))
sc.conf = conf
a = sc.textFile("s3://bucketname/weatherhistory.txt")
I have even tried to write a file, thinking that my directory pointing was not correct and that, if the write succeeded, I could pinpoint the path it is actually using, but there is still no progress and it says no path exists.
If you could please guide us in this regard, it would really be helpful. Thanks in advance.