# Jupyter kernel.json configuration for PySpark
There are a few ways to use a Jupyter notebook with PySpark:

1. Invoking `pyspark`, which adds a SparkContext and HiveContext automatically
2. Hardcoding `SPARK_HOME` and `PYTHONPATH` in the kernel spec, but creating your own context in the notebook (my preferred method)
3. Defining everything in the notebook (UNTESTED!)
# 1

````
{
 "display_name": "PySpark (Python X.Y.Z)",
 "language": "python",
 "argv": [
  "/path/to/python",
  "-m",
  "IPython.kernel",
  "-f",
  "{connection_file}"
 ],
 "env": {
  "SPARK_HOME": "<spark_dir>",
  "PYTHONPATH": "<spark_dir>/python/:<spark_dir>/python/lib/py4j-<version>-src.zip",
  "PYTHONSTARTUP": "<spark_dir>/python/pyspark/shell.py",
  "PYSPARK_SUBMIT_ARGS": "--master local[2] pyspark-shell"
 }
}
````
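The spec above goes into a Jupyter kernels directory as `kernel.json` (e.g. `~/.local/share/jupyter/kernels/pyspark/kernel.json` on Linux; `jupyter kernelspec list` shows the paths in use). Since `PYTHONSTARTUP` points at Spark's `shell.py`, the kernel should come up with `sc` and `sqlContext` already defined. A minimal sanity-check cell, assuming the spec above with Spark 1.6:

````
# shell.py (run via PYTHONSTARTUP) should have created sc and sqlContext
print(sc.version)        # e.g. '1.6.0'
print(sc.master)         # 'local[2]', taken from PYSPARK_SUBMIT_ARGS
print(type(sqlContext))  # HiveContext if Hive classes are available

# Run a tiny job to confirm the executors actually work
print(sc.parallelize(range(10)).sum())  # 45
````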
# 2

* The following example has SPARK_HOME and PYTHONPATH hardcoded
* Change the path in `argv` to point to the correct python

````
{
 "display_name": "PySpark (Spark 1.6.0)",
 "language": "python",
 "argv": [
  "/Users/ltsai/Documents/workspace/venv/smu/bin/python",
  "-m",
  "IPython.kernel",
  "-f",
  "{connection_file}"
 ],
 "env": {
  "SPARK_HOME": "/Users/ltsai/Documents/workspace/spark-1.6.0-bin-hadoop2.6",
  "PYTHONPATH": "/Users/ltsai/Documents/workspace/spark-1.6.0-bin-hadoop2.6/python:/Users/ltsai/Documents/workspace/spark-1.6.0-bin-hadoop2.6/python/lib/py4j-0.9-src.zip"
 }
}
````

````
# PYSPARK_SUBMIT_ARGS can be left out if not providing any Maven packages
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages com.databricks:spark-csv_2.10:1.3.0 pyspark-shell"

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext

import py4j

conf = SparkConf().setAppName("SparkJupyter") \
                  .setMaster("local[2]")
# conf = SparkConf().setAppName("SparkJupyter") \
#                   .setMaster("yarn-client") \
#                   .set("spark.executor.memory", "512m") \
#                   .set("spark.executor.cores", 1) \
#                   .set("spark.executor.instances", 2)
sc = SparkContext(conf=conf)

try:
    # Try to access HiveConf; it raises an exception if Hive is not available
    sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
    sqlContext = HiveContext(sc)
except (py4j.protocol.Py4JError, TypeError):
    sqlContext = SQLContext(sc)
sc
````
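Because `PYSPARK_SUBMIT_ARGS` pulls in the spark-csv package before the context starts, `sqlContext` can read CSV straight away. A short usage sketch, assuming a hypothetical sample file `/tmp/people.csv` with a header row:

````
# /tmp/people.csv is a hypothetical sample file
df = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true', inferSchema='true') \
    .load('/tmp/people.csv')
df.printSchema()
df.show(5)
````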
# 3

````
{
 "display_name": "PySpark",
 "language": "python",
 "argv": [
  "/path/to/python",
  "-m",
  "IPython.kernel",
  "-f",
  "{connection_file}"
 ]
}
````

````
# PYSPARK_SUBMIT_ARGS can be left out if not providing any Maven packages
import os
import sys
os.environ['SPARK_HOME'] = '/path/to/spark'

# Note: sys.path.insert takes an index as its first argument, and the joined
# components must not start with a slash (os.path.join discards everything
# before an absolute component)
sys.path.insert(0, os.path.join(os.environ['SPARK_HOME'], 'python'))
sys.path.insert(0, os.path.join(os.environ['SPARK_HOME'], 'python/lib/py4j-0.9-src.zip'))
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages com.databricks:spark-csv_2.10:1.3.0 pyspark-shell"


from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext

import py4j

conf = SparkConf().setAppName("SparkJupyter") \
                  .setMaster("local[2]")
# conf = SparkConf().setAppName("SparkJupyter") \
#                   .setMaster("yarn-client") \
#                   .set("spark.executor.memory", "512m") \
#                   .set("spark.executor.cores", 1) \
#                   .set("spark.executor.instances", 2)
sc = SparkContext(conf=conf)

try:
    # Try to access HiveConf; it raises an exception if Hive is not available
    sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
    sqlContext = HiveContext(sc)
except (py4j.protocol.Py4JError, TypeError):
    sqlContext = SQLContext(sc)
sc
````
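Hardcoding `py4j-0.9-src.zip` breaks as soon as a Spark distribution bundles a different py4j version. A variant of the path setup above that discovers the zip instead, using only the standard library:

````
import glob
import os
import sys

spark_home = os.environ['SPARK_HOME']
sys.path.insert(0, os.path.join(spark_home, 'python'))

# Pick up whichever py4j version this Spark distribution ships with
py4j_zip = glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip'))[0]
sys.path.insert(0, py4j_zip)
````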