@joshyorko
Created July 8, 2024 18:46
Spark 3.5 conf for Iceberg with Nessie and MinIO
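The conf below expects a few names to already be in scope. A minimal setup sketch, assuming a local Nessie and MinIO deployment; the endpoint values, credentials, and driver path here are placeholders, not part of the original gist:

import os
import pyspark

# Placeholder values -- adjust to your deployment.
NESSIE_URI = "http://nessie:19120/api/v1"             # Nessie REST endpoint (example)
AWS_S3_ENDPOINT = "http://minio:9000"                  # MinIO S3 API endpoint (example)
WAREHOUSE = "s3a://warehouse/"                         # Iceberg warehouse bucket/prefix (example)
AWS_REGION = "us-east-1"                               # Region value expected by the S3A client
jdbc_driver_path = "/opt/spark/jars/postgresql.jar"    # Hypothetical JDBC driver location
os.environ.setdefault("AWS_ACCESS_KEY_ID", "minioadmin")       # Example MinIO credentials
os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "minioadmin")   # Example MinIO credentials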
conf = (
    pyspark.SparkConf()
    .setAppName("Iceberg Partitioned Data Write")
    .set("spark.jars", jdbc_driver_path)  # Include the JDBC driver on the classpath
    .set(
        "spark.jars.packages",
        "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.2,"
        "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.91.3,"
        "software.amazon.awssdk:bundle:2.17.81,"
        "org.apache.hadoop:hadoop-aws:3.3.1",
    )  # Pull in Iceberg, Nessie, the AWS SDK bundle, and Hadoop AWS
    .set(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,"
        "org.projectnessie.spark.extensions.NessieSparkSessionExtensions",
    )  # Enable the Iceberg and Nessie SQL extensions
    # Nessie catalog backed by Iceberg, storing data in S3-compatible (MinIO) storage
    .set("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
    .set("spark.sql.catalog.nessie.uri", NESSIE_URI)
    .set("spark.sql.catalog.nessie.ref", "main")
    .set("spark.sql.catalog.nessie.authentication.type", "NONE")
    .set("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
    .set("spark.sql.catalog.nessie.s3.endpoint", AWS_S3_ENDPOINT)
    .set("spark.sql.catalog.nessie.warehouse", WAREHOUSE)
    .set("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    # s3a settings so Hadoop-level reads/writes also target MinIO
    .set("spark.hadoop.fs.s3a.endpoint", AWS_S3_ENDPOINT)
    .set("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    .set("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    .set("spark.hadoop.fs.s3a.endpoint.region", AWS_REGION)
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)
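To use the conf, build a session from it and exercise the nessie catalog. A minimal smoke test, assuming the Nessie and MinIO services are reachable; the demo.events namespace and table names are hypothetical, not from the gist:

from pyspark.sql import SparkSession

# Create the session from the conf above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Create a namespace and a partitioned Iceberg table in the Nessie catalog.
spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.demo")
spark.sql("""
    CREATE TABLE IF NOT EXISTS nessie.demo.events (
        id BIGINT,
        category STRING,
        ts TIMESTAMP
    ) USING iceberg
    PARTITIONED BY (category)
""")

# Append a row and read it back.
df = spark.createDataFrame(
    [(1, "clicks", None)], "id BIGINT, category STRING, ts TIMESTAMP"
)
df.writeTo("nessie.demo.events").append()
spark.table("nessie.demo.events").show()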