Created
April 5, 2023 18:59
-
-
Save rmoff/1d86204b559f8ffce83be4b3206b1fa0 to your computer and use it in GitHub Desktop.
SparkContext & SparkSession
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "45ba6754",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Kernel: /opt/conda/bin/python\n",
"Python version: 3.9.7 | packaged by conda-forge | (default, Oct 10 2021, 15:08:54) \n",
"[GCC 9.4.0]\n",
"PySpark version: 3.2.0\n"
]
}
],
"source": [
"import sys\n",
"import pyspark\n",
"print(\"Kernel:\", sys.executable)\n",
"print(\"Python version:\", sys.version)\n",
"print(\"PySpark version:\", pyspark.__version__)\n"
]
},
{
"cell_type": "markdown",
"id": "c1e3db35",
"metadata": {},
"source": [
"## Spark Context and Session - no config to pick up"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "d40d2671",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.context import SparkContext\n",
"from pyspark import SparkFiles\n",
"from pyspark.sql.session import SparkSession"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6d3747db",
"metadata": {},
"outputs": [],
"source": [
"sc = SparkContext('local')\n",
"\n",
"spark = SparkSession(sc)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4bc0ab5c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('spark.master', 'local'),\n",
" ('spark.app.startTime', '1680720996903'),\n",
" ('spark.executor.id', 'driver'),\n",
" ('spark.app.name', 'pyspark-shell'),\n",
" ('spark.driver.extraJavaOptions',\n",
"  '-Dio.netty.tryReflectionSetAccessible=true'),\n",
" ('spark.driver.port', '33339'),\n",
" ('spark.driver.host', '358d949974bd'),\n",
" ('spark.rdd.compress', 'True'),\n",
" ('spark.serializer.objectStreamReset', '100'),\n",
" ('spark.app.id', 'local-1680720997412'),\n",
" ('spark.submit.pyFiles', ''),\n",
" ('spark.submit.deployMode', 'client'),\n",
" ('spark.executor.extraJavaOptions',\n",
"  '-Dio.netty.tryReflectionSetAccessible=true'),\n",
" ('spark.ui.showConsoleProgress', 'true')]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"spark.sparkContext.getConf().getAll()"
]
},
{
"cell_type": "markdown",
"id": "0d378bbc",
"metadata": {},
"source": [
"_Now restart the kernel_\n",
"\n",
"---"
]
},
{
"cell_type": "markdown",
"id": "f3edc135",
"metadata": {},
"source": [
"## No explicit Spark Context - picks up config as expected"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "acb1c9c3",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.context import SparkContext\n",
"from pyspark import SparkFiles\n",
"from pyspark.sql.session import SparkSession"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "91015532",
"metadata": {},
"outputs": [],
"source": [
"spark = (\n",
"    SparkSession.builder.master(\"local[*]\")\n",
"    .config(\"spark.jars.packages\", \"io.delta:delta-core_2.12:2.2.0\")\n",
"    .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\")\n",
"    .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\")\n",
"    .getOrCreate()\n",
") "
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7afbb832",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('spark.repl.local.jars',\n",
"  'file:///home/jovyan/.ivy2/jars/io.delta_delta-core_2.12-2.2.0.jar,file:///home/jovyan/.ivy2/jars/io.delta_delta-storage-2.2.0.jar,file:///home/jovyan/.ivy2/jars/org.antlr_antlr4-runtime-4.8.jar'),\n",
" ('spark.app.id', 'local-1680721007128'),\n",
" ('spark.app.startTime', '1680721006667'),\n",
" ('spark.files',\n",
"  'file:///home/jovyan/.ivy2/jars/io.delta_delta-core_2.12-2.2.0.jar,file:///home/jovyan/.ivy2/jars/io.delta_delta-storage-2.2.0.jar,file:///home/jovyan/.ivy2/jars/org.antlr_antlr4-runtime-4.8.jar'),\n",
" ('spark.app.initial.file.urls',\n",
"  'file:///home/jovyan/.ivy2/jars/org.antlr_antlr4-runtime-4.8.jar,file:///home/jovyan/.ivy2/jars/io.delta_delta-core_2.12-2.2.0.jar,file:///home/jovyan/.ivy2/jars/io.delta_delta-storage-2.2.0.jar'),\n",
" ('spark.executor.id', 'driver'),\n",
" ('spark.app.name', 'pyspark-shell'),\n",
" ('spark.driver.extraJavaOptions',\n",
"  '-Dio.netty.tryReflectionSetAccessible=true'),\n",
" ('spark.app.initial.jar.urls',\n",
"  'spark://358d949974bd:41145/jars/io.delta_delta-core_2.12-2.2.0.jar,spark://358d949974bd:41145/jars/io.delta_delta-storage-2.2.0.jar,spark://358d949974bd:41145/jars/org.antlr_antlr4-runtime-4.8.jar'),\n",
" ('spark.jars.packages', 'io.delta:delta-core_2.12:2.2.0'),\n",
" ('spark.driver.host', '358d949974bd'),\n",
" ('spark.sql.warehouse.dir', 'file:/home/jovyan/spark-warehouse'),\n",
" ('spark.sql.extensions', 'io.delta.sql.DeltaSparkSessionExtension'),\n",
" ('spark.rdd.compress', 'True'),\n",
" ('spark.submit.pyFiles',\n",
"  '/home/jovyan/.ivy2/jars/io.delta_delta-core_2.12-2.2.0.jar,/home/jovyan/.ivy2/jars/io.delta_delta-storage-2.2.0.jar,/home/jovyan/.ivy2/jars/org.antlr_antlr4-runtime-4.8.jar'),\n",
" ('spark.driver.port', '41145'),\n",
" ('spark.jars',\n",
"  'file:///home/jovyan/.ivy2/jars/io.delta_delta-core_2.12-2.2.0.jar,file:///home/jovyan/.ivy2/jars/io.delta_delta-storage-2.2.0.jar,file:///home/jovyan/.ivy2/jars/org.antlr_antlr4-runtime-4.8.jar'),\n",
" ('spark.serializer.objectStreamReset', '100'),\n",
" ('spark.master', 'local[*]'),\n",
" ('spark.submit.deployMode', 'client'),\n",
" ('spark.executor.extraJavaOptions',\n",
"  '-Dio.netty.tryReflectionSetAccessible=true'),\n",
" ('spark.ui.showConsoleProgress', 'true'),\n",
" ('spark.sql.catalog.spark_catalog',\n",
"  'org.apache.spark.sql.delta.catalog.DeltaCatalog')]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"spark.sparkContext.getConf().getAll()"
]
},
{
"cell_type": "markdown",
"id": "3c9f2679",
"metadata": {},
"source": [
"---\n",
"\n",
"_Now restart the kernel_\n",
"\n",
"---"
]
},
{
"cell_type": "markdown",
"id": "477705da",
"metadata": {},
"source": [
"## Existing Spark Context with attempted config for the Session 💀\n",
"\n",
"_SparkContext gets implicitly reused by the Spark Session and so config is ignored_"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e742546a",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.context import SparkContext\n",
"from pyspark import SparkFiles\n",
"from pyspark.sql.session import SparkSession"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "74e51278",
"metadata": {},
"outputs": [],
"source": [
"sc = SparkContext('local')\n",
"\n",
"spark = (\n",
"    SparkSession.builder.master(\"local[*]\")\n",
"    .config(\"spark.jars.packages\", \"io.delta:delta-core_2.12:2.2.0\")\n",
"    .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\")\n",
"    .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\")\n",
"    .getOrCreate()\n",
") "
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "050fde8d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('spark.master', 'local'),\n",
" ('spark.app.startTime', '1680721019537'),\n",
" ('spark.executor.id', 'driver'),\n",
" ('spark.app.name', 'pyspark-shell'),\n",
" ('spark.app.id', 'local-1680721020036'),\n",
" ('spark.driver.extraJavaOptions',\n",
"  '-Dio.netty.tryReflectionSetAccessible=true'),\n",
" ('spark.driver.host', '358d949974bd'),\n",
" ('spark.sql.warehouse.dir', 'file:/home/jovyan/spark-warehouse'),\n",
" ('spark.rdd.compress', 'True'),\n",
" ('spark.serializer.objectStreamReset', '100'),\n",
" ('spark.submit.pyFiles', ''),\n",
" ('spark.driver.port', '46397'),\n",
" ('spark.submit.deployMode', 'client'),\n",
" ('spark.executor.extraJavaOptions',\n",
"  '-Dio.netty.tryReflectionSetAccessible=true'),\n",
" ('spark.ui.showConsoleProgress', 'true')]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"spark.sparkContext.getConf().getAll()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
See also https://rmoff.net/2023/04/05/using-delta-from-pyspark-java.lang.classnotfoundexception-delta.defaultsource/