Add packages to the Spark session from within Python by setting `PYSPARK_SUBMIT_ARGS` before the session is created. They'll be downloaded automatically if they're not already available locally.
import os
from pyspark.sql import SparkSession

# Using the Hive `JsonSerDe` as an example. `--packages` takes Maven coordinates
# (group:artifact:version); the SerDe class lives in hive-hcatalog-core.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    '--packages "org.apache.hive.hcatalog:hive-hcatalog-core:2.3.9" '
    'pyspark-shell'
)
spark = SparkSession.builder.getOrCreate()
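For context, a minimal sketch of what that package is for: with `hive-hcatalog-core` on the classpath (and Hive support enabled on the session, as in the test fixture below), Hive DDL that references the SerDe class can be run through `spark.sql`. The table and column names here are made up.

# Hypothetical table; requires a session built with .enableHiveSupport().
spark.sql("""
    CREATE TABLE IF NOT EXISTS events (id STRING, payload STRING)
    ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
    STORED AS TEXTFILE
""")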
Initialize a Spark session for testing, with S3 and Hive support enabled, pointing at a mock S3 server (moto in this example).
import os

import pytest
from pyspark import SparkConf
from pyspark.sql import SparkSession
@pytest.fixture(scope="session")
def spark():
    # Hostname of the mock S3 (moto) server.
    s3_endpoint = "moto"
    # https://stackoverflow.com/questions/50183915/how-can-i-read-from-s3-in-pyspark-running-in-local-mode
    os.environ["PYSPARK_SUBMIT_ARGS"] = (
        '--packages "org.apache.hadoop:hadoop-aws:2.7.3" '
        'pyspark-shell'
    )
    conf = (
        SparkConf()
        .setMaster("local[*]")
        .set("spark.sql.catalogImplementation", "in-memory")
    )
    s = (
        SparkSession
        .builder
        .config(conf=conf)
        .enableHiveSupport()
        .getOrCreate()
    )
    hadoop_conf = s.sparkContext._jsc.hadoopConfiguration()
    # Route s3:// URLs through the S3A filesystem.
    hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Access key/secret values don't matter, just as long as they're set.
    hadoop_conf.set("fs.s3a.access.key", "mock")
    hadoop_conf.set("fs.s3a.secret.key", "mock")
    hadoop_conf.set("fs.s3a.endpoint", s3_endpoint)
    yield s
    s.stop()
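Tests then just take the `spark` fixture as an argument. A minimal usage sketch (the data and names are made up); any `s3://` path would be routed through S3A to the mock endpoint configured above.

def test_creates_dataframe(spark):
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
    assert df.count() == 2
    # A write such as df.write.parquet("s3://some-bucket/out") would hit
    # the mock S3 endpoint rather than real AWS.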