- Linux環境でMambaforgeを使って環境を構築する
- クラスター管理などはおいておいて、学習用の最低限の環境を手軽に作る
  - python 3.11
  - pyspark 3.4.0
  - openjdk 17
  - pandas 2.0.1
    - pandasとの連携を確認するため
  - その他、jupyterなど

# use mambaforge
mamba env create -f pyspark_env.yaml
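
なお、環境定義のyamlは、ノートブック内に記載しているものと同じ以下の内容を想定している(ファイル名は`pyspark_env.yaml`を想定):

```yaml
# pyspark_env.yaml
name: pyspark-intro
channels:
  - conda-forge
dependencies:
  - python=3.11
  - pyspark>=3.4
  - openjdk=17
  - pandas>=2.0.1
  - pyarrow
  # for testdata
  - numpy
  - seaborn
  # jupyter
  - jupyterlab
```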
testdata
# Created by https://www.toptal.com/developers/gitignore/api/jupyternotebooks,visualstudiocode,python,pycharm+all | |
# Edit at https://www.toptal.com/developers/gitignore?templates=jupyternotebooks,visualstudiocode,python,pycharm+all | |
### JupyterNotebooks ### | |
# gitignore template for Jupyter Notebooks | |
# website: http://jupyter.org/ | |
.ipynb_checkpoints | |
*/.ipynb_checkpoints/* | |
# IPython | |
profile_default/ | |
ipython_config.py | |
# Remove previous ipynb_checkpoints | |
# git rm -r .ipynb_checkpoints/ | |
### PyCharm+all ### | |
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider | |
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 | |
# User-specific stuff | |
.idea/**/workspace.xml | |
.idea/**/tasks.xml | |
.idea/**/usage.statistics.xml | |
.idea/**/dictionaries | |
.idea/**/shelf | |
# AWS User-specific | |
.idea/**/aws.xml | |
# Generated files | |
.idea/**/contentModel.xml | |
# Sensitive or high-churn files | |
.idea/**/dataSources/ | |
.idea/**/dataSources.ids | |
.idea/**/dataSources.local.xml | |
.idea/**/sqlDataSources.xml | |
.idea/**/dynamic.xml | |
.idea/**/uiDesigner.xml | |
.idea/**/dbnavigator.xml | |
# Gradle | |
.idea/**/gradle.xml | |
.idea/**/libraries | |
# Gradle and Maven with auto-import | |
# When using Gradle or Maven with auto-import, you should exclude module files, | |
# since they will be recreated, and may cause churn. Uncomment if using | |
# auto-import. | |
# .idea/artifacts | |
# .idea/compiler.xml | |
# .idea/jarRepositories.xml | |
# .idea/modules.xml | |
# .idea/*.iml | |
# .idea/modules | |
# *.iml | |
# *.ipr | |
# CMake | |
cmake-build-*/ | |
# Mongo Explorer plugin | |
.idea/**/mongoSettings.xml | |
# File-based project format | |
*.iws | |
# IntelliJ | |
out/ | |
# mpeltonen/sbt-idea plugin | |
.idea_modules/ | |
# JIRA plugin | |
atlassian-ide-plugin.xml | |
# Cursive Clojure plugin | |
.idea/replstate.xml | |
# SonarLint plugin | |
.idea/sonarlint/ | |
# Crashlytics plugin (for Android Studio and IntelliJ) | |
com_crashlytics_export_strings.xml | |
crashlytics.properties | |
crashlytics-build.properties | |
fabric.properties | |
# Editor-based Rest Client | |
.idea/httpRequests | |
# Android studio 3.1+ serialized cache file | |
.idea/caches/build_file_checksums.ser | |
### PyCharm+all Patch ### | |
# Ignore everything but code style settings and run configurations | |
# that are supposed to be shared within teams. | |
.idea/* | |
!.idea/codeStyles | |
!.idea/runConfigurations | |
### Python ### | |
# Byte-compiled / optimized / DLL files | |
__pycache__/ | |
*.py[cod] | |
*$py.class | |
# C extensions | |
*.so | |
# Distribution / packaging | |
.Python | |
build/ | |
develop-eggs/ | |
dist/ | |
downloads/ | |
eggs/ | |
.eggs/ | |
lib/ | |
lib64/ | |
parts/ | |
sdist/ | |
var/ | |
wheels/ | |
share/python-wheels/ | |
*.egg-info/ | |
.installed.cfg | |
*.egg | |
MANIFEST | |
# PyInstaller | |
# Usually these files are written by a python script from a template | |
# before PyInstaller builds the exe, so as to inject date/other infos into it. | |
*.manifest | |
*.spec | |
# Installer logs | |
pip-log.txt | |
pip-delete-this-directory.txt | |
# Unit test / coverage reports | |
htmlcov/ | |
.tox/ | |
.nox/ | |
.coverage | |
.coverage.* | |
.cache | |
nosetests.xml | |
coverage.xml | |
*.cover | |
*.py,cover | |
.hypothesis/ | |
.pytest_cache/ | |
cover/ | |
# Translations | |
*.mo | |
*.pot | |
# Django stuff: | |
*.log | |
local_settings.py | |
db.sqlite3 | |
db.sqlite3-journal | |
# Flask stuff: | |
instance/ | |
.webassets-cache | |
# Scrapy stuff: | |
.scrapy | |
# Sphinx documentation | |
docs/_build/ | |
# PyBuilder | |
.pybuilder/ | |
target/ | |
# Jupyter Notebook | |
# IPython | |
# pyenv | |
# For a library or package, you might want to ignore these files since the code is | |
# intended to run in multiple environments; otherwise, check them in: | |
# .python-version | |
# pipenv | |
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. | |
# However, in case of collaboration, if having platform-specific dependencies or dependencies | |
# having no cross-platform support, pipenv may install dependencies that don't work, or not | |
# install all needed dependencies. | |
#Pipfile.lock | |
# poetry | |
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. | |
# This is especially recommended for binary packages to ensure reproducibility, and is more | |
# commonly ignored for libraries. | |
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control | |
#poetry.lock | |
# pdm | |
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. | |
#pdm.lock | |
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it | |
# in version control. | |
# https://pdm.fming.dev/#use-with-ide | |
.pdm.toml | |
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm | |
__pypackages__/ | |
# Celery stuff | |
celerybeat-schedule | |
celerybeat.pid | |
# SageMath parsed files | |
*.sage.py | |
# Environments | |
.env | |
.venv | |
env/ | |
venv/ | |
ENV/ | |
env.bak/ | |
venv.bak/ | |
# Spyder project settings | |
.spyderproject | |
.spyproject | |
# Rope project settings | |
.ropeproject | |
# mkdocs documentation | |
/site | |
# mypy | |
.mypy_cache/ | |
.dmypy.json | |
dmypy.json | |
# Pyre type checker | |
.pyre/ | |
# pytype static type analyzer | |
.pytype/ | |
# Cython debug symbols | |
cython_debug/ | |
# PyCharm | |
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can | |
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore | |
# and can be added to the global gitignore or merged into this file. For a more nuclear | |
# option (not recommended) you can uncomment the following to ignore the entire idea folder. | |
#.idea/ | |
### Python Patch ### | |
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration | |
poetry.toml | |
# ruff | |
.ruff_cache/ | |
# LSP config files | |
pyrightconfig.json | |
### VisualStudioCode ### | |
.vscode/* | |
!.vscode/settings.json | |
!.vscode/tasks.json | |
!.vscode/launch.json | |
!.vscode/extensions.json | |
!.vscode/*.code-snippets | |
# Local History for Visual Studio Code | |
.history/ | |
# Built Visual Studio Code Extensions | |
*.vsix | |
### VisualStudioCode Patch ### | |
# Ignore all local history of files | |
.history | |
.ionide | |
# End of https://www.toptal.com/developers/gitignore/api/jupyternotebooks,visualstudiocode,python,pycharm+all |
{ | |
"cells": [ | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "e40569e7", | |
"metadata": {}, | |
"source": [ | |
"# Pyspark入門\n", | |
"\n", | |
"pandasなどは小規模~中規模くらいのデータセットの処理や分析には有効だが、\n", | |
"メモリに乗り切らないレベルのデータ量は処理できない。\n", | |
"また、GILによって基本的に1スレッドでの動作になるため、マシンスペックを最大限活かし切るのは難しい。\n", | |
"\n", | |
"遅延評価、並列分散処理によって所謂ビッグデータといわれるレベルのテーブルデータの処理・分析に使うことができ、\n", | |
"更にpandasとの連携・併用ができるツールとして[pyspark](https://www.databricks.com/jp/glossary/pyspark)が存在するため紹介する。\n", | |
"\n", | |
"なお、詳細は[PySpark Documentation](https://spark.apache.org/docs/latest/api/python/)なども参照のこと\n" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "c612301e", | |
"metadata": {}, | |
"source": [ | |
"## 環境の作成\n", | |
"\n", | |
"Mambaforgeをインストール済みのLinux環境を前提とする。\n", | |
"\n", | |
"- conda 23.1.0\n", | |
"- mamba 1.4.1\n", | |
"\n", | |
"以下のようなconda仮想環境作成用のyamlを用意する:\n", | |
"\n", | |
"```yaml:pyspark_env.yaml\n", | |
"# pyspark_env.yaml\n", | |
"name: pyspark-intro\n", | |
"channels:\n", | |
" - conda-forge\n", | |
"dependencies:\n", | |
" - python=3.11\n", | |
" - pyspark>=3.4\n", | |
" - openjdk=17\n", | |
" - pandas>=2.0.1\n", | |
" - pyarrow\n", | |
" # for testdata\n", | |
" - numpy\n", | |
" - seaborn\n", | |
" # jupyter\n", | |
" - jupyterlab\n", | |
"```\n", | |
"\n", | |
"※~~先日[pandasの2.0がリリースされた](https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html)が、\n", | |
"deprecatedになっていた[`iteritems`](https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#removal-of-prior-version-deprecations-changes)が削除された影響で、\n", | |
"pandasからpysparkのDataFrameに変換する部分が一部動作しなくなる現象が起こるため、当面は1.5系を使う。~~\n", | |
"`pyspark>=3.4.0`で修正済みなので、pandasも2系を使う\n", | |
"\n", | |
"\n", | |
"mambaコマンドで仮想環境を作成(condaだと依存関係の解決が遅いのでmambaの使用を推奨)\n", | |
"\n", | |
"```sh\n", | |
"mamba env create -f pyspark_env.yaml\n", | |
"\n", | |
"# 出来上がった環境(pyspark-intro)をactivate\n", | |
"conda activate pyspark-intro\n", | |
"# or `mamba activate pyspark-intro`\n", | |
"```" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "4290e154", | |
"metadata": {}, | |
"source": [ | |
"## 実行\n", | |
"\n", | |
"- spark sessionの起動\n", | |
"- データの読み込みと保存\n", | |
"- pandasとの連携\n", | |
"\n", | |
"あたりを簡単に確認する。\n", | |
"\n", | |
"データ処理周りの詳細は省略するが、Sparkのデータフレーム機能が[Spark SQL](https://www.databricks.com/jp/glossary/what-is-spark-sql)と対応していることから、\n", | |
"概ねSQLで出来ることようなことを実行出来ると考えればOK。(というかSQLでも書ける)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "17452c65", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# テストデータ格納ディレクトリのセットアップ(Linuxコマンドを呼び出して実行している)\n", | |
"\n", | |
"!rm -rf testdata && mkdir testdata" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "04ef6a3d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"your 131072x1 screen size is bogus. expect trouble\n", | |
"23/04/25 00:12:59 WARN Utils: Your hostname, XXXXXXX-XXXXXXX resolves to a loopback address: 127.0.1.1; using 172.24.210.1 instead (on interface eth0)\n", | |
"23/04/25 00:12:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", | |
"Setting default log level to \"WARN\".\n", | |
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", | |
"23/04/25 00:12:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
" <div>\n", | |
" <p><b>SparkSession - in-memory</b></p>\n", | |
" \n", | |
" <div>\n", | |
" <p><b>SparkContext</b></p>\n", | |
"\n", | |
" <p><a href=\"http://172.24.210.1:4040\">Spark UI</a></p>\n", | |
"\n", | |
" <dl>\n", | |
" <dt>Version</dt>\n", | |
" <dd><code>v3.4.0</code></dd>\n", | |
" <dt>Master</dt>\n", | |
" <dd><code>local[*]</code></dd>\n", | |
" <dt>AppName</dt>\n", | |
" <dd><code>LocalTest</code></dd>\n", | |
" </dl>\n", | |
" </div>\n", | |
" \n", | |
" </div>\n", | |
" " | |
], | |
"text/plain": [ | |
"<pyspark.sql.session.SparkSession at 0x7ff1d5736cd0>" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from pyspark.sql.session import SparkSession\n", | |
"\n", | |
"# spark sessionの立ち上げ\n", | |
"spark = (SparkSession.builder\n", | |
" .master(\"local[*]\")\n", | |
" .appName(\"LocalTest\")\n", | |
" .config(\"spark.sql.execution.arrow.pyspark.enabled\", \"true\")\n", | |
" .config(\"spark.executor.memory\", \"4g\") # TMP (実際はハードコートはあまり良くない)\n", | |
" .getOrCreate()\n", | |
")\n", | |
"spark" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "71bf72b6", | |
"metadata": {}, | |
"source": [ | |
"↑`config`部分について、pandasとの連携のため、configを調整しておく(pyarrowをインストールしておく必要あり)\n", | |
"\n", | |
"参考: https://learn.microsoft.com/ja-jp/azure/databricks/pandas/pyspark-pandas-conversion\n", | |
"\n", | |
"なお、詳細かつ環境依存性のあるconfigは別途configファイル(`$SPARK_HOME/conf/spark-defaults.conf`)を作成するなどして設定すると良い\n", | |
"\n", | |
"コンフィグ設定の詳細: https://spark.apache.org/docs/latest/configuration.html\n", | |
"\n", | |
"また、コンフィグファイルの置き場所の設定などに必要な環境変数`SPARK_HOME`について、condaでインストールした場合の設定方法などは[ここ](https://qiita.com/junkor-1011/items/7ec9bfaaf76568ce4a05#spark_home%E3%81%AE%E8%A8%AD%E5%AE%9A)などを参照" | |
] | |
}, | |
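  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "a0b1c2d3",
   "metadata": {},
   "source": [
    "(補足)builderやconfファイルで与えた設定が実際に反映されているかは、例えば以下のようにSparkSessionから確認出来る(確認方法の一例)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0b1c2d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL系のconfはspark.confから取得できる\n",
    "print(spark.conf.get(\"spark.sql.execution.arrow.pyspark.enabled\"))\n",
    "\n",
    "# SparkContext側に渡されたconfはgetConf()経由で確認できる\n",
    "print(spark.sparkContext.getConf().get(\"spark.executor.memory\"))"
   ]
  },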
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "d6ac5ce7", | |
"metadata": {}, | |
"source": [ | |
"Spark Sessionを起動すると、デフォルトでは[http://localhost:4040](http://localhost:4040)に[Spark UI](http://localhost:4040)が立ち上がり、\n", | |
"実行状態などをモニタリングすることが出来る(デバッグなどにも便利)" | |
] | |
}, | |
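  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "b1c2d3e4",
   "metadata": {},
   "source": [
    "(参考)実際にバインドされたSpark UIのURLは、以下のようにSparkContextから取得することも出来る(4040が使用中の場合は4041などに変わる)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1c2d3e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spark UIのURLを確認\n",
    "print(spark.sparkContext.uiWebUrl)"
   ]
  },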
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "733b9529", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sepal_length</th>\n", | |
" <th>sepal_width</th>\n", | |
" <th>petal_length</th>\n", | |
" <th>petal_width</th>\n", | |
" <th>species</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>5.1</td>\n", | |
" <td>3.5</td>\n", | |
" <td>1.4</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>4.9</td>\n", | |
" <td>3.0</td>\n", | |
" <td>1.4</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>4.7</td>\n", | |
" <td>3.2</td>\n", | |
" <td>1.3</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4.6</td>\n", | |
" <td>3.1</td>\n", | |
" <td>1.5</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5.0</td>\n", | |
" <td>3.6</td>\n", | |
" <td>1.4</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>145</th>\n", | |
" <td>6.7</td>\n", | |
" <td>3.0</td>\n", | |
" <td>5.2</td>\n", | |
" <td>2.3</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>146</th>\n", | |
" <td>6.3</td>\n", | |
" <td>2.5</td>\n", | |
" <td>5.0</td>\n", | |
" <td>1.9</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>147</th>\n", | |
" <td>6.5</td>\n", | |
" <td>3.0</td>\n", | |
" <td>5.2</td>\n", | |
" <td>2.0</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>148</th>\n", | |
" <td>6.2</td>\n", | |
" <td>3.4</td>\n", | |
" <td>5.4</td>\n", | |
" <td>2.3</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>149</th>\n", | |
" <td>5.9</td>\n", | |
" <td>3.0</td>\n", | |
" <td>5.1</td>\n", | |
" <td>1.8</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>150 rows × 5 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sepal_length sepal_width petal_length petal_width species\n", | |
"0 5.1 3.5 1.4 0.2 setosa\n", | |
"1 4.9 3.0 1.4 0.2 setosa\n", | |
"2 4.7 3.2 1.3 0.2 setosa\n", | |
"3 4.6 3.1 1.5 0.2 setosa\n", | |
"4 5.0 3.6 1.4 0.2 setosa\n", | |
".. ... ... ... ... ...\n", | |
"145 6.7 3.0 5.2 2.3 virginica\n", | |
"146 6.3 2.5 5.0 1.9 virginica\n", | |
"147 6.5 3.0 5.2 2.0 virginica\n", | |
"148 6.2 3.4 5.4 2.3 virginica\n", | |
"149 5.9 3.0 5.1 1.8 virginica\n", | |
"\n", | |
"[150 rows x 5 columns]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# テストデータの作成\n", | |
"import seaborn as sns\n", | |
"\n", | |
"iris = sns.load_dataset(\"iris\")\n", | |
"display(iris)\n", | |
"\n", | |
"iris.to_csv(\"testdata/iris.csv\", index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "60b743f0", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# pysparkでcsvを読み込み\n", | |
"\n", | |
"df_iris = spark.read.csv(\n", | |
" \"testdata/iris.csv\",\n", | |
" header=True,\n", | |
" inferSchema=True,\n", | |
")\n", | |
"\n", | |
"# 遅延評価するので、カラムのスキーマのみ表示される\n", | |
"display(df_iris)" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "ebf1496e", | |
"metadata": {}, | |
"source": [ | |
"(↑csvから読み込んでいるため、ヘッダーの有無の指定やスキーマの自動推定をオプションで与えている)\n", | |
"\n", | |
"\n", | |
"※(補足):\n", | |
"\n", | |
"- 遅延評価を行う関係で、オンメモリに展開するpandasなどと異なりSparkの処理のパフォーマンスはデータソースの形式の影響をもろに受ける\n", | |
" - データ形式(csv/json/parquet/orc/RDBテーブル/...etc)や、圧縮形式(gzip/xz/snappy/zlib/...etc)、カラムに対するパーティションなど\n", | |
"- データ分析・データ処理用途であれば行指向形式であるcsvを読み込んで処理を実行するのは一般に遅く、列指向形式のparquetやorcなどの方が有利" | |
] | |
}, | |
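  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "c2d3e4f5",
   "metadata": {},
   "source": [
    "(参考)`inferSchema`による自動推定に頼らず、以下のようにスキーマを明示的に指定して読み込むことも出来る(カラム構成は上で作成したiris.csvを想定した一例)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2d3e4f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.types import DoubleType, StringType, StructField, StructType\n",
    "\n",
    "# iris.csvのカラム構成に合わせたスキーマ定義\n",
    "iris_schema = StructType([\n",
    "    StructField(\"sepal_length\", DoubleType()),\n",
    "    StructField(\"sepal_width\", DoubleType()),\n",
    "    StructField(\"petal_length\", DoubleType()),\n",
    "    StructField(\"petal_width\", DoubleType()),\n",
    "    StructField(\"species\", StringType()),\n",
    "])\n",
    "\n",
    "# スキーマ推定のための余分なスキャンを避けつつ、型を固定して読み込める\n",
    "df_iris_typed = spark.read.csv(\"testdata/iris.csv\", header=True, schema=iris_schema)\n",
    "df_iris_typed.printSchema()"
   ]
  },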
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "e89db683", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+------------+-----------+------------+-----------+-------+\n", | |
"|sepal_length|sepal_width|petal_length|petal_width|species|\n", | |
"+------------+-----------+------------+-----------+-------+\n", | |
"| 5.1| 3.5| 1.4| 0.2| setosa|\n", | |
"| 4.9| 3.0| 1.4| 0.2| setosa|\n", | |
"| 4.7| 3.2| 1.3| 0.2| setosa|\n", | |
"| 4.6| 3.1| 1.5| 0.2| setosa|\n", | |
"| 5.0| 3.6| 1.4| 0.2| setosa|\n", | |
"+------------+-----------+------------+-----------+-------+\n", | |
"only showing top 5 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# showなどによって初めて評価が行われる\n", | |
"df_iris.show(5) # 5件を表示" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "3d4c8b6e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+------------+-----------+\n", | |
"|sepal_length|sepal_width|\n", | |
"+------------+-----------+\n", | |
"| 5.1| 3.5|\n", | |
"| 4.9| 3.0|\n", | |
"| 4.7| 3.2|\n", | |
"+------------+-----------+\n", | |
"only showing top 3 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# selectによるカラムの選択\n", | |
"(df_iris\n", | |
" .select([\"sepal_length\", \"sepal_width\"])\n", | |
" .show(3)\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "0f5df129", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+------------+-----------+------------+-----------+---------+\n", | |
"|sepal_length|sepal_width|petal_length|petal_width| species|\n", | |
"+------------+-----------+------------+-----------+---------+\n", | |
"| 6.3| 3.3| 6.0| 2.5|virginica|\n", | |
"| 5.8| 2.7| 5.1| 1.9|virginica|\n", | |
"| 7.1| 3.0| 5.9| 2.1|virginica|\n", | |
"+------------+-----------+------------+-----------+---------+\n", | |
"only showing top 3 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# filterによる簡単なクエリ\n", | |
"(df_iris\n", | |
" .filter(df_iris.species == \"virginica\")\n", | |
" .filter(df_iris.sepal_length > 5.0)\n", | |
" .show(3)\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "77abd749", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----------+-----+\n", | |
"| species|count|\n", | |
"+----------+-----+\n", | |
"| virginica| 50|\n", | |
"|versicolor| 50|\n", | |
"| setosa| 50|\n", | |
"+----------+-----+\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# groupByによる集約(1)\n", | |
"(df_iris\n", | |
" .groupBy('species')\n", | |
" .count()\n", | |
" .show() \n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "a998da4d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----------+------------------+------------------+-----------------+------------------+\n", | |
"| species| avg(sepal_width)| avg(petal_width)|avg(sepal_length)| avg(petal_length)|\n", | |
"+----------+------------------+------------------+-----------------+------------------+\n", | |
"| virginica|2.9739999999999998| 2.026|6.587999999999998| 5.552|\n", | |
"|versicolor|2.7700000000000005|1.3259999999999998| 5.936| 4.26|\n", | |
"| setosa| 3.428000000000001|0.2459999999999999|5.005999999999999|1.4620000000000002|\n", | |
"+----------+------------------+------------------+-----------------+------------------+\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# groupByによる集約(2)\n", | |
"\n", | |
"(df_iris\n", | |
" .groupBy('species')\n", | |
" .agg({\n", | |
" 'sepal_length': 'mean',\n", | |
" 'sepal_width': 'mean',\n", | |
" 'petal_length': 'mean',\n", | |
" 'petal_width': 'mean',\n", | |
" })\n", | |
" .show()\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "30e590f4", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"23/04/25 00:13:09 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+-------+------------------+-------------------+------------------+------------------+---------+\n", | |
"|summary| sepal_length| sepal_width| petal_length| petal_width| species|\n", | |
"+-------+------------------+-------------------+------------------+------------------+---------+\n", | |
"| count| 150| 150| 150| 150| 150|\n", | |
"| mean| 5.843333333333335| 3.057333333333334|3.7580000000000027| 1.199333333333334| null|\n", | |
"| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467| null|\n", | |
"| min| 4.3| 2.0| 1.0| 0.1| setosa|\n", | |
"| max| 7.9| 4.4| 6.9| 2.5|virginica|\n", | |
"+-------+------------------+-------------------+------------------+------------------+---------+\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# DataFrameのサマリー\n", | |
"df_iris.describe().show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "b086ea18", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+---------+---------+-----------+\n", | |
"|namespace|tableName|isTemporary|\n", | |
"+---------+---------+-----------+\n", | |
"| | iris| true|\n", | |
"+---------+---------+-----------+\n", | |
"\n", | |
"+------------+-----------+------------+-----------+-------+\n", | |
"|sepal_length|sepal_width|petal_length|petal_width|species|\n", | |
"+------------+-----------+------------+-----------+-------+\n", | |
"| 5.1| 3.5| 1.4| 0.2| setosa|\n", | |
"| 4.9| 3.0| 1.4| 0.2| setosa|\n", | |
"| 4.7| 3.2| 1.3| 0.2| setosa|\n", | |
"| 4.6| 3.1| 1.5| 0.2| setosa|\n", | |
"| 5.0| 3.6| 1.4| 0.2| setosa|\n", | |
"| 5.4| 3.9| 1.7| 0.4| setosa|\n", | |
"| 4.6| 3.4| 1.4| 0.3| setosa|\n", | |
"| 5.0| 3.4| 1.5| 0.2| setosa|\n", | |
"| 4.4| 2.9| 1.4| 0.2| setosa|\n", | |
"| 4.9| 3.1| 1.5| 0.1| setosa|\n", | |
"| 5.4| 3.7| 1.5| 0.2| setosa|\n", | |
"| 4.8| 3.4| 1.6| 0.2| setosa|\n", | |
"| 4.8| 3.0| 1.4| 0.1| setosa|\n", | |
"| 4.3| 3.0| 1.1| 0.1| setosa|\n", | |
"| 5.8| 4.0| 1.2| 0.2| setosa|\n", | |
"| 5.7| 4.4| 1.5| 0.4| setosa|\n", | |
"| 5.4| 3.9| 1.3| 0.4| setosa|\n", | |
"| 5.1| 3.5| 1.4| 0.3| setosa|\n", | |
"| 5.7| 3.8| 1.7| 0.3| setosa|\n", | |
"| 5.1| 3.8| 1.5| 0.3| setosa|\n", | |
"+------------+-----------+------------+-----------+-------+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# SQLでクエリを書くことも可能\n", | |
"df_iris.createOrReplaceTempView('iris')\n", | |
"\n", | |
"spark.sql(\"show tables\").show()\n", | |
"\n", | |
"tmp = spark.sql(\"SELECT * FROM iris WHERE species = 'setosa'\")\n", | |
"tmp.show()" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "37f513cd", | |
"metadata": {}, | |
"source": [ | |
"詳細は[User Guide](https://spark.apache.org/docs/3.3.2/api/python/user_guide/index.html), [API Reference](https://spark.apache.org/docs/3.3.2/api/python/reference/index.html),\n", | |
"および[Spark SQLのリファレンス](https://spark.apache.org/docs/3.3.2/api/sql/index.html)を参照" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "2ffc3fa2", | |
"metadata": {}, | |
"source": [ | |
"次に、データの[Read/Write](https://spark.apache.org/docs/latest/sql-data-sources.html)を確認する" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "e2ee6092", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"pandas.core.frame.DataFrame" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+------------+-----------+------------+-----------+-------+\n", | |
"|sepal_length|sepal_width|petal_length|petal_width|species|\n", | |
"+------------+-----------+------------+-----------+-------+\n", | |
"| 5.1| 3.5| 1.4| 0.2| setosa|\n", | |
"| 4.9| 3.0| 1.4| 0.2| setosa|\n", | |
"| 4.7| 3.2| 1.3| 0.2| setosa|\n", | |
"| 4.6| 3.1| 1.5| 0.2| setosa|\n", | |
"| 5.0| 3.6| 1.4| 0.2| setosa|\n", | |
"| 5.4| 3.9| 1.7| 0.4| setosa|\n", | |
"| 4.6| 3.4| 1.4| 0.3| setosa|\n", | |
"| 5.0| 3.4| 1.5| 0.2| setosa|\n", | |
"| 4.4| 2.9| 1.4| 0.2| setosa|\n", | |
"| 4.9| 3.1| 1.5| 0.1| setosa|\n", | |
"| 5.4| 3.7| 1.5| 0.2| setosa|\n", | |
"| 4.8| 3.4| 1.6| 0.2| setosa|\n", | |
"| 4.8| 3.0| 1.4| 0.1| setosa|\n", | |
"| 4.3| 3.0| 1.1| 0.1| setosa|\n", | |
"| 5.8| 4.0| 1.2| 0.2| setosa|\n", | |
"| 5.7| 4.4| 1.5| 0.4| setosa|\n", | |
"| 5.4| 3.9| 1.3| 0.4| setosa|\n", | |
"| 5.1| 3.5| 1.4| 0.3| setosa|\n", | |
"| 5.7| 3.8| 1.7| 0.3| setosa|\n", | |
"| 5.1| 3.8| 1.5| 0.3| setosa|\n", | |
"+------------+-----------+------------+-----------+-------+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# さっきは便宜上csvから読み込んだが、型情報が失われる + pandasとの連携を見るため、pandasのデータフレームから直接pysparkに変換する\n", | |
"\n", | |
"display(iris.__class__) # irisはpandas dataframe\n", | |
"df = spark.createDataFrame(iris)\n", | |
"\n", | |
"display(df)\n", | |
"\n", | |
"df.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "0dc198ac", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"pyspark.sql.dataframe.DataFrame" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sepal_length</th>\n", | |
" <th>sepal_width</th>\n", | |
" <th>petal_length</th>\n", | |
" <th>petal_width</th>\n", | |
" <th>species</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>5.1</td>\n", | |
" <td>3.5</td>\n", | |
" <td>1.4</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>4.9</td>\n", | |
" <td>3.0</td>\n", | |
" <td>1.4</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>4.7</td>\n", | |
" <td>3.2</td>\n", | |
" <td>1.3</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4.6</td>\n", | |
" <td>3.1</td>\n", | |
" <td>1.5</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5.0</td>\n", | |
" <td>3.6</td>\n", | |
" <td>1.4</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>145</th>\n", | |
" <td>6.7</td>\n", | |
" <td>3.0</td>\n", | |
" <td>5.2</td>\n", | |
" <td>2.3</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>146</th>\n", | |
" <td>6.3</td>\n", | |
" <td>2.5</td>\n", | |
" <td>5.0</td>\n", | |
" <td>1.9</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>147</th>\n", | |
" <td>6.5</td>\n", | |
" <td>3.0</td>\n", | |
" <td>5.2</td>\n", | |
" <td>2.0</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>148</th>\n", | |
" <td>6.2</td>\n", | |
" <td>3.4</td>\n", | |
" <td>5.4</td>\n", | |
" <td>2.3</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>149</th>\n", | |
" <td>5.9</td>\n", | |
" <td>3.0</td>\n", | |
" <td>5.1</td>\n", | |
" <td>1.8</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>150 rows × 5 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sepal_length sepal_width petal_length petal_width species\n", | |
"0 5.1 3.5 1.4 0.2 setosa\n", | |
"1 4.9 3.0 1.4 0.2 setosa\n", | |
"2 4.7 3.2 1.3 0.2 setosa\n", | |
"3 4.6 3.1 1.5 0.2 setosa\n", | |
"4 5.0 3.6 1.4 0.2 setosa\n", | |
".. ... ... ... ... ...\n", | |
"145 6.7 3.0 5.2 2.3 virginica\n", | |
"146 6.3 2.5 5.0 1.9 virginica\n", | |
"147 6.5 3.0 5.2 2.0 virginica\n", | |
"148 6.2 3.4 5.4 2.3 virginica\n", | |
"149 5.9 3.0 5.1 1.8 virginica\n", | |
"\n", | |
"[150 rows x 5 columns]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# spark dataframeからpandas dataframeへの変換\n", | |
"\n", | |
"display(df.__class__) # pyspark.sql.dataframe.DataFrame\n", | |
"\n", | |
"pdf = df.toPandas()\n", | |
"display(pdf)" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "ee45272f", | |
"metadata": {}, | |
"source": [ | |
"↑pysparkで大規模~中規模くらいのデータを処理してサイズダウンし、オンメモリに乗るくらいになったら`toPandas`をして扱いやすいpandasに変換する、という使い方が出来る。" | |
] | |
}, | |
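  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "d3e4f5a6",
   "metadata": {},
   "source": [
    "例えば以下のように、Spark側で集約して小さくした結果だけをpandasに変換する、といった流れになる(イメージをつかむための一例)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3e4f5a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spark側でspeciesごとに集約してから、小さくなった結果をpandasのDataFrameに変換する\n",
    "agg_pdf = (df_iris\n",
    "    .groupBy(\"species\")\n",
    "    .agg({\"sepal_length\": \"mean\", \"petal_length\": \"mean\"})\n",
    "    .toPandas()\n",
    ")\n",
    "\n",
    "display(agg_pdf.__class__)  # pandas.core.frame.DataFrame\n",
    "agg_pdf"
   ]
  },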
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "9d02c4e4", | |
"metadata": {}, | |
"source": [ | |
"最後に色々な形式でデータの書き込みと読み込みを行う\n", | |
"\n", | |
"先述の通り、sparkはデータ保存形式が処理パフォーマンスにダイレクトに効いてくるため、この辺りの取り扱いは肝になってくる。" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "661fc7b0", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"_SUCCESS\n", | |
"part-00000-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
"part-00001-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
"part-00002-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
"part-00003-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
"part-00004-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
"part-00005-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
"part-00006-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
"part-00007-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n" | |
] | |
} | |
], | |
"source": [ | |
"# dfはspark dataframe\n", | |
"\n", | |
"# csv.gz\n", | |
"df.write.save(\"testdata/iris_csv\", format=\"csv\")\n", | |
"\n", | |
"# formatオプションを使う代わりに、↓のように書いてもOK\n", | |
"# df.write.csv(\"testdata/iris_csv\", compression=\"gzip\")\n", | |
"\n", | |
"!ls testdata/iris_csv" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "a0508de1", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"5.1,3.5,1.4,0.2,setosa\n", | |
"4.9,3.0,1.4,0.2,setosa\n", | |
"4.7,3.2,1.3,0.2,setosa\n", | |
"4.6,3.1,1.5,0.2,setosa\n", | |
"5.0,3.6,1.4,0.2,setosa\n", | |
"5.4,3.9,1.7,0.4,setosa\n", | |
"4.6,3.4,1.4,0.3,setosa\n", | |
"5.0,3.4,1.5,0.2,setosa\n", | |
"4.4,2.9,1.4,0.2,setosa\n", | |
"4.9,3.1,1.5,0.1,setosa\n" | |
] | |
} | |
], | |
"source": [ | |
"!head testdata/iris_csv/part-00000-*.csv" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "5a2cbc62", | |
"metadata": {}, | |
"source": [ | |
"↑pandasのように単一のファイルを指定する感じではなく、保存するディレクトリを指定する。\n", | |
"(ファイルが分散して生成される)\n", | |
"\n", | |
"読み込む際は個々のファイルを指定することもできる(←Spark以外で作ったファイルの読み込みなど)が、\n", | |
"**基本は保存したディレクトリを指定して読み込む**" | |
] | |
}, | |
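  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "e4f5a6b7",
   "metadata": {},
   "source": [
    "(参考)個々のファイルを指定して読む場合は、以下のようにパス(ワイルドカードも可)を直接与えればよい(一例)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4f5a6b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 分散して生成されたファイルのうち1つだけをパス指定で読み込む例\n",
    "df_single = spark.read.csv(\"testdata/iris_csv/part-00000-*.csv\")\n",
    "df_single.show(3)"
   ]
  },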
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "5953ef8e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+---+---+---+---+----------+\n", | |
"|_c0|_c1|_c2|_c3| _c4|\n", | |
"+---+---+---+---+----------+\n", | |
"|6.3|3.3|4.7|1.6|versicolor|\n", | |
"|4.9|2.4|3.3|1.0|versicolor|\n", | |
"|6.6|2.9|4.6|1.3|versicolor|\n", | |
"+---+---+---+---+----------+\n", | |
"only showing top 3 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"df_csv = spark.read.load(\"testdata/iris_csv\", format=\"csv\")\n", | |
"\n", | |
"# formatを指定せず、↓のようにすることも可能\n", | |
"# df_csv = spark.read.csv(\"testdata/iris_csv\")\n", | |
"\n", | |
"df_csv.show(3)" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "f7c66c47", | |
"metadata": {}, | |
"source": [ | |
"(↑csvだとカラム名などの情報が欠落しがちで使いづらい)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "65039664", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"_SUCCESS\n", | |
"part-00000-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
"part-00001-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
"part-00002-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
"part-00003-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
"part-00004-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
"part-00005-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
"part-00006-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
"part-00007-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n" | |
] | |
} | |
], | |
"source": [ | |
"# テキスト形式だとcsvの他にjsonなども指定可能\n", | |
"# compressionで圧縮形式を指定することも出来る\n", | |
"\n", | |
"df.write.json(\"testdata/iris_json\", compression='gzip')\n", | |
"\n", | |
"!ls testdata/iris_json" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "6245160d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"DataFrame[petal_length: double, petal_width: double, sepal_length: double, sepal_width: double, species: string]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+------------+-----------+------------+-----------+----------+\n", | |
"|petal_length|petal_width|sepal_length|sepal_width| species|\n", | |
"+------------+-----------+------------+-----------+----------+\n", | |
"| 3.3| 1.0| 5.0| 2.3|versicolor|\n", | |
"| 4.2| 1.3| 5.6| 2.7|versicolor|\n", | |
"| 4.2| 1.2| 5.7| 3.0|versicolor|\n", | |
"+------------+-----------+------------+-----------+----------+\n", | |
"only showing top 3 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# 読み込みも同様\n", | |
"\n", | |
"df_json = spark.read.json(\"testdata/iris_json\")\n", | |
"# or df_json = spark.read.load(\"testdata/iris_json\", format=\"json\")\n", | |
"\n", | |
"display(df_json)\n", | |
"df_json.show(3)" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "04a92179", | |
"metadata": {}, | |
"source": [ | |
"(↑jsonだとcsvよりは型情報が保持される)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "3151765f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"23/04/25 00:13:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory\n", | |
"Scaling row group sizes to 95.00% for 8 writers\n", | |
" \r" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"_SUCCESS\n", | |
"part-00000-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
"part-00001-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
"part-00002-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
"part-00003-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
"part-00004-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
"part-00005-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
"part-00006-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
"part-00007-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
"_SUCCESS\n", | |
"part-00000-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
"part-00001-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
"part-00002-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
"part-00003-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
"part-00004-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
"part-00005-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
"part-00006-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
"part-00007-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n" | |
] | |
} | |
], | |
"source": [ | |
"# データ分析・処理用途として適しているsnappy.parquetやzlib.orc形式など\n", | |
"\n", | |
"df.write.parquet(\"testdata/iris_parquet\", compression=\"snappy\")\n", | |
"!ls testdata/iris_parquet\n", | |
"\n", | |
"df.write.orc(\"testdata/iris_orc\", compression=\"zlib\")\n", | |
"!ls testdata/iris_orc" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "4cb05940", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"parquet:\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+------------+-----------+------------+-----------+----------+\n", | |
"|sepal_length|sepal_width|petal_length|petal_width| species|\n", | |
"+------------+-----------+------------+-----------+----------+\n", | |
"| 5.0| 2.3| 3.3| 1.0|versicolor|\n", | |
"| 5.6| 2.7| 4.2| 1.3|versicolor|\n", | |
"| 5.7| 3.0| 4.2| 1.2|versicolor|\n", | |
"+------------+-----------+------------+-----------+----------+\n", | |
"only showing top 3 rows\n", | |
"\n", | |
"\n", | |
"------------------\n", | |
"\n", | |
"orc:\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+------------+-----------+------------+-----------+----------+\n", | |
"|sepal_length|sepal_width|petal_length|petal_width| species|\n", | |
"+------------+-----------+------------+-----------+----------+\n", | |
"| 5.0| 2.3| 3.3| 1.0|versicolor|\n", | |
"| 5.6| 2.7| 4.2| 1.3|versicolor|\n", | |
"| 5.7| 3.0| 4.2| 1.2|versicolor|\n", | |
"+------------+-----------+------------+-----------+----------+\n", | |
"only showing top 3 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# 読み込み\n", | |
"\n", | |
"# parquet\n", | |
"print(\"parquet:\")\n", | |
"df_parquet = spark.read.parquet(\"testdata/iris_parquet\")\n", | |
"display(df_parquet)\n", | |
"df_parquet.show(3)\n", | |
"\n", | |
"print(\"\\n------------------\\n\")\n", | |
"\n", | |
"# orc\n", | |
"print(\"orc:\")\n", | |
"df_orc = spark.read.orc(\"testdata/iris_orc\")\n", | |
"display(df_orc)\n", | |
"df_orc.show(3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"id": "e2c0ff8b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\u001b[00;38;5;33m.\u001b[0m\n", | |
"├── _SUCCESS\n", | |
"├── \u001b[00;38;5;33mspecies=setosa\u001b[0m\n", | |
"│ ├── part-00000-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
"│ ├── part-00001-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
"│ └── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
"├── \u001b[00;38;5;33mspecies=versicolor\u001b[0m\n", | |
"│ ├── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
"│ ├── part-00003-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
"│ ├── part-00004-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
"│ └── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
"└── \u001b[00;38;5;33mspecies=virginica\u001b[0m\n", | |
" ├── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
" ├── part-00006-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
" └── part-00007-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
"\n", | |
"4 directories, 11 files\n" | |
] | |
} | |
], | |
"source": [ | |
"# partitionを切って保存することも可能\n", | |
"\n", | |
"df.write.save(\n", | |
" \"testdata/iris_with_partition\",\n", | |
" format=\"parquet\",\n", | |
" compression=\"snappy\",\n", | |
" partitionBy=\"species\"\n", | |
" )\n", | |
"\n", | |
"!cd testdata/iris_with_partition && tree" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "51fcefd8", | |
"metadata": {}, | |
"source": [ | |
"カラム\"species\"の値ごとに別ディレクトリが切られてデータが保存される\n", | |
"\n", | |
"RDBのインデックスと同様、データアクセスの仕方に応じて適切に設定することでパフォーマンスを良くすることが出来る(し、不適切だと悪化する)\n", | |
"\n", | |
"↑の例だと、speciesを指定して分析を行うような場合、興味のあるspeciesしか見ないためアクセスするデータ量を限定することができる。\n", | |
"一方、かけるクエリの多くが複数のspeciesにまたがるような場合はパフォーマンスの向上は見込めず、悪化する可能性もある。" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"id": "4a458cb4", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+------------+-----------+------------+-----------+---------+\n", | |
"|sepal_length|sepal_width|petal_length|petal_width| species|\n", | |
"+------------+-----------+------------+-----------+---------+\n", | |
"| 7.9| 3.8| 6.4| 2.0|virginica|\n", | |
"| 6.4| 2.8| 5.6| 2.2|virginica|\n", | |
"| 6.3| 2.8| 5.1| 1.5|virginica|\n", | |
"+------------+-----------+------------+-----------+---------+\n", | |
"only showing top 3 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# パーティションがある場合でも読み込み方法は同様\n", | |
"\n", | |
"df_partition = spark.read.load(\"testdata/iris_with_partition\")\n", | |
"\n", | |
"display(df_partition)\n", | |
"df_partition.show(3)" | |
] | |
}, | |
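  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "f5a6b7c8",
   "metadata": {},
   "source": [
    "例えば以下のようにパーティション列(species)で絞り込むと、該当の`species=...`ディレクトリだけを読めばよくなる(パーティションプルーニング)ため、アクセスするデータ量を抑えられる(動作イメージの一例)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5a6b7c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# パーティション列で絞り込むクエリの例\n",
    "(df_partition\n",
    "    .filter(df_partition.species == \"setosa\")\n",
    "    .show(3)\n",
    ")"
   ]
  },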
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "376cf4e6", | |
"metadata": {}, | |
"source": [ | |
"他にも[saveAsTable](https://spark.apache.org/docs/3.2.0/api/python/reference/api/pyspark.sql.DataFrameWriter.saveAsTable.html)で永続化したテーブルとして書き込みを行ったりも出来る。\n", | |
"\n", | |
"扱えるデータ保存形式も[Data Sources](https://spark.apache.org/docs/latest/sql-data-sources.html)のように、\n", | |
"他にもRDB(JDBCを使う)や、Hive Tableなど色々なものに対応している。" | |
] | |
}, | |
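  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "a6b7c8d9",
   "metadata": {},
   "source": [
    "(参考)`saveAsTable`の最小限の使用例(デフォルト設定では、カレントディレクトリ配下の`spark-warehouse`にparquet形式で書き込まれる想定)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6b7c8da",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 永続化テーブルとして保存(既に存在する場合は上書き)\n",
    "df.write.mode(\"overwrite\").saveAsTable(\"iris_table\")\n",
    "\n",
    "# 保存したテーブルはSQLやspark.tableから参照できる\n",
    "spark.sql(\"SELECT species, count(*) AS n FROM iris_table GROUP BY species\").show()"
   ]
  },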
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"id": "77eb272b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", | |
" warnings.warn(\n" | |
] | |
} | |
], | |
"source": [ | |
"import pyspark.pandas as ps" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "7990e97a", | |
"metadata": {}, | |
"source": [ | |
"## (補足)pysparkにおけるpandas API\n", | |
"\n", | |
"参考: https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/index.html\n", | |
"\n", | |
"最近はかなりpandasに近いAPIを使ってsparkを扱う方法が整備されつつある様子\n", | |
"\n", | |
"なお、[Koalas](https://github.com/databricks/koalas)という、sparkをpandasっぽく使えるツールが存在するが、\n", | |
"[ここ](https://learn.microsoft.com/ja-jp/azure/databricks/archive/legacy/koalas)などを見るとこれが正式にpysparkに取り込まれた?様子。" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"id": "27553982", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: If `index_col` is not specified for `read_csv`, the default index is attached which can cause additional overhead.\n", | |
" warnings.warn(message, PandasAPIOnSparkAdviceWarning)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"pyspark.pandas.frame.DataFrame" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sepal_length</th>\n", | |
" <th>sepal_width</th>\n", | |
" <th>petal_length</th>\n", | |
" <th>petal_width</th>\n", | |
" <th>species</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>5.1</td>\n", | |
" <td>3.5</td>\n", | |
" <td>1.4</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>4.9</td>\n", | |
" <td>3.0</td>\n", | |
" <td>1.4</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>4.7</td>\n", | |
" <td>3.2</td>\n", | |
" <td>1.3</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4.6</td>\n", | |
" <td>3.1</td>\n", | |
" <td>1.5</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5.0</td>\n", | |
" <td>3.6</td>\n", | |
" <td>1.4</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sepal_length sepal_width petal_length petal_width species\n", | |
"0 5.1 3.5 1.4 0.2 setosa\n", | |
"1 4.9 3.0 1.4 0.2 setosa\n", | |
"2 4.7 3.2 1.3 0.2 setosa\n", | |
"3 4.6 3.1 1.5 0.2 setosa\n", | |
"4 5.0 3.6 1.4 0.2 setosa" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import pyspark.pandas as ps\n", | |
"\n", | |
"# pandas on spark DataFrameを作成する\n", | |
"psdf = ps.read_csv(\"testdata/iris.csv\")\n", | |
"\n", | |
"display(psdf.__class__) # pyspark.pandas.frame.DataFrame\n", | |
"\n", | |
"psdf.head(5) # pandas DataFrameのようにheadで先頭を表示できる" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"id": "5dda2325", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sepal_length</th>\n", | |
" <th>sepal_width</th>\n", | |
" <th>petal_length</th>\n", | |
" <th>petal_width</th>\n", | |
" <th>species</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>105</th>\n", | |
" <td>7.6</td>\n", | |
" <td>3.0</td>\n", | |
" <td>6.6</td>\n", | |
" <td>2.1</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>117</th>\n", | |
" <td>7.7</td>\n", | |
" <td>3.8</td>\n", | |
" <td>6.7</td>\n", | |
" <td>2.2</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>118</th>\n", | |
" <td>7.7</td>\n", | |
" <td>2.6</td>\n", | |
" <td>6.9</td>\n", | |
" <td>2.3</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>122</th>\n", | |
" <td>7.7</td>\n", | |
" <td>2.8</td>\n", | |
" <td>6.7</td>\n", | |
" <td>2.0</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>131</th>\n", | |
" <td>7.9</td>\n", | |
" <td>3.8</td>\n", | |
" <td>6.4</td>\n", | |
" <td>2.0</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>135</th>\n", | |
" <td>7.7</td>\n", | |
" <td>3.0</td>\n", | |
" <td>6.1</td>\n", | |
" <td>2.3</td>\n", | |
" <td>virginica</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sepal_length sepal_width petal_length petal_width species\n", | |
"105 7.6 3.0 6.6 2.1 virginica\n", | |
"117 7.7 3.8 6.7 2.2 virginica\n", | |
"118 7.7 2.6 6.9 2.3 virginica\n", | |
"122 7.7 2.8 6.7 2.0 virginica\n", | |
"131 7.9 3.8 6.4 2.0 virginica\n", | |
"135 7.7 3.0 6.1 2.3 virginica" | |
] | |
}, | |
"execution_count": 25, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# filterや表示も通常のpandasのようにできている\n", | |
"psdf[psdf['sepal_length'] > 7.5]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"id": "a33ed92b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+-----+------------+-----------+------------+-----------+-------+\n", | |
"|index|sepal_length|sepal_width|petal_length|petal_width|species|\n", | |
"+-----+------------+-----------+------------+-----------+-------+\n", | |
"| 0| 5.1| 3.5| 1.4| 0.2| setosa|\n", | |
"| 1| 4.9| 3.0| 1.4| 0.2| setosa|\n", | |
"| 2| 4.7| 3.2| 1.3| 0.2| setosa|\n", | |
"| 3| 4.6| 3.1| 1.5| 0.2| setosa|\n", | |
"| 4| 5.0| 3.6| 1.4| 0.2| setosa|\n", | |
"+-----+------------+-----------+------------+-----------+-------+\n", | |
"only showing top 5 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# pysparkのDataFrameに変換\n", | |
"\n", | |
"sdf = psdf.to_spark(index_col=[\"index\"])\n", | |
"\n", | |
"sdf.show(5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"id": "69a33023", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: `to_pandas` loads all data into the driver's memory. It should only be used if the resulting pandas DataFrame is expected to be small.\n", | |
" warnings.warn(message, PandasAPIOnSparkAdviceWarning)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"pandas.core.frame.DataFrame" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>sepal_length</th>\n", | |
" <th>sepal_width</th>\n", | |
" <th>petal_length</th>\n", | |
" <th>petal_width</th>\n", | |
" <th>species</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>5.1</td>\n", | |
" <td>3.5</td>\n", | |
" <td>1.4</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>4.9</td>\n", | |
" <td>3.0</td>\n", | |
" <td>1.4</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>4.7</td>\n", | |
" <td>3.2</td>\n", | |
" <td>1.3</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4.6</td>\n", | |
" <td>3.1</td>\n", | |
" <td>1.5</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5.0</td>\n", | |
" <td>3.6</td>\n", | |
" <td>1.4</td>\n", | |
" <td>0.2</td>\n", | |
" <td>setosa</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" sepal_length sepal_width petal_length petal_width species\n", | |
"0 5.1 3.5 1.4 0.2 setosa\n", | |
"1 4.9 3.0 1.4 0.2 setosa\n", | |
"2 4.7 3.2 1.3 0.2 setosa\n", | |
"3 4.6 3.1 1.5 0.2 setosa\n", | |
"4 5.0 3.6 1.4 0.2 setosa" | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# 普通のpandasに変換\n", | |
"pdf = psdf.to_pandas()\n", | |
"\n", | |
"display(pdf.__class__) # pandas.core.frame.DataFrame\n", | |
"\n", | |
"pdf.head()" | |
] | |
}, | |
{ | |
"attachments": {}, | |
"cell_type": "markdown", | |
"id": "cfb09aa9", | |
"metadata": {}, | |
"source": [ | |
"## (参考) [dask](https://www.dask.org/)\n", | |
"\n", | |
"pandasおよびnumpyと互換性および連携性の高いAPIを持ち、遅延評価・並列分散処理を行うことが出来るライブラリ。\n", | |
"\n", | |
"例えばデータサイズが大きいテーブルデータに対してはdaskのデータフレームで遅延評価・並列分散処理を行っておき、\n", | |
"データサイズが小さくなるタイミングでpandasのデータフレームに変換する、といった今回pysparkでやろうとしていることとほぼ同じことが出来る。\n", | |
"\n", | |
"pysparkはdaskに比べると、\n", | |
"\n", | |
"- クラスターなどを組める分、より本格的なビッグデータにも対応出来る\n", | |
" - (今回紹介するようなやり方で、ライトに使うことも可能)\n", | |
"- Javaをバックエンドに動くことに加え、本格的に動かす場合はクラスター構築などが必要になるため、そのレベルでやると動作環境の構築のハードルは高い\n", | |
"- pandas、numpyとの親和性はdaskの方が~~高い~~高かった\n", | |
" - daskは遅延評価される以外は同じメソッドをそのまま使えるケースが多い\n", | |
" - sparkはむしろSQLと互換性がある\n", | |
" - ただ、先述の通りpandas APIが整備されつつあるので、pysparkもある程度pandasの使い方そのままで使えるようになっている模様" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
pandasなどは小規模~中規模くらいのデータセットの処理や分析には有効だが、 メモリに乗り切らないレベルのデータ量は処理できない。 また、GILによって基本的に1スレッドでの動作になるため、マシンスペックを最大限活かし切るのは難しい。
遅延評価、並列分散処理によって所謂ビッグデータといわれるレベルのテーブルデータの処理・分析に使うことができ、 更にpandasとの連携・併用ができるツールとしてpysparkが存在するため紹介する。
なお、詳細はPySpark Documentationなども参照のこと
Mambaforgeをインストール済みのLinux環境を前提とする。
以下のようなconda仮想環境作成用のyamlを用意する:
# pyspark_env.yaml
name: pyspark-intro
channels:
- conda-forge
dependencies:
- python=3.11
- pyspark>=3.4
- openjdk=17
- pandas>=2.0.1
- pyarrow
# for testdata
- numpy
- seaborn
# jupyter
- jupyterlab
※先日pandasの2.0がリリースされたが、
deprecatedになっていた
iteritems
が削除された影響で、
pandasからpysparkのDataFrameに変換する部分が一部動作しなくなる現象が起こるため、当面は1.5系を使う。pyspark>=3.4.0
で修正済みなので、pandasも2系を使う
Create the virtual environment with the mamba command (conda's dependency resolution is slow, so using mamba is recommended):
mamba env create -f pyspark-env.yaml
# activate the created environment (pyspark-intro)
conda activate pyspark-intro
# or `mamba activate pyspark-intro`
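After activation, it can be worth a quick check that the intended versions were picked up; a minimal sketch:
import sys
import pandas as pd
import pyspark
print(sys.version)          # expect 3.11.x
print(pd.__version__)       # expect 2.0.x
print(pyspark.__version__)  # expect 3.4.x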
Below, the basics are checked briefly: basic DataFrame operations, reading and writing data, and the pandas API on Spark.
Details of data processing are omitted here, but since Spark's DataFrame functionality corresponds to Spark SQL, you can roughly assume that anything you can do in SQL can be done here (and in fact you can also just write SQL).
# set up the directory for test data (running Linux commands from the notebook)
!rm -rf testdata && mkdir testdata
from pyspark.sql.session import SparkSession
# start the Spark session
spark = (SparkSession.builder
.master("local[*]")
.appName("LocalTest")
.config("spark.sql.execution.arrow.pyspark.enabled", "true")
.config("spark.executor.memory", "4g")  # temporary; hard-coding values like this is not ideal in practice
.getOrCreate()
)
spark
your 131072x1 screen size is bogus. expect trouble
23/04/25 00:12:59 WARN Utils: Your hostname, XXXXXXX-XXXXXXX resolves to a loopback address: 127.0.1.1; using 172.24.210.1 instead (on interface eth0)
23/04/25 00:12:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/25 00:12:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
SparkSession - in-memory
SparkContext
Spark UI: http://172.24.210.1:4040
Version: v3.4.0
Master: local[*]
AppName: LocalTest
↑ About the config part: the spark.sql.execution.arrow.pyspark.enabled setting is adjusted for pandas interoperability (pyarrow must be installed).
Reference: https://learn.microsoft.com/ja-jp/azure/databricks/pandas/pyspark-pandas-conversion
Detailed, environment-specific settings are better placed in a separate config file ($SPARK_HOME/conf/spark-defaults.conf) or similar.
Configuration details: https://spark.apache.org/docs/latest/configuration.html
As for the SPARK_HOME environment variable, which is needed among other things to locate the config file, see the linked reference for how to set it when Spark was installed via conda.
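To see which settings are actually in effect, the SparkConf of the session created above can be inspected; a minimal sketch:
# list the effective configuration key/value pairs of the active session
for key, value in spark.sparkContext.getConf().getAll():
    print(key, "=", value)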
When a Spark session is started, the Spark UI comes up at http://localhost:4040 by default, where the execution state can be monitored (also handy for debugging).
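The UI address of the current session can also be obtained programmatically; a small sketch:
# URL of the Spark UI for this session (http://localhost:4040 by default)
print(spark.sparkContext.uiWebUrl)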
# create the test data
import seaborn as sns
iris = sns.load_dataset("iris")
display(iris)
iris.to_csv("testdata/iris.csv", index=False)
|     | sepal_length | sepal_width | petal_length | petal_width | species   |
|-----|--------------|-------------|--------------|-------------|-----------|
| 0   | 5.1          | 3.5         | 1.4          | 0.2         | setosa    |
| 1   | 4.9          | 3.0         | 1.4          | 0.2         | setosa    |
| 2   | 4.7          | 3.2         | 1.3          | 0.2         | setosa    |
| 3   | 4.6          | 3.1         | 1.5          | 0.2         | setosa    |
| 4   | 5.0          | 3.6         | 1.4          | 0.2         | setosa    |
| ... | ...          | ...         | ...          | ...         | ...       |
| 145 | 6.7          | 3.0         | 5.2          | 2.3         | virginica |
| 146 | 6.3          | 2.5         | 5.0          | 1.9         | virginica |
| 147 | 6.5          | 3.0         | 5.2          | 2.0         | virginica |
| 148 | 6.2          | 3.4         | 5.4          | 2.3         | virginica |
| 149 | 5.9          | 3.0         | 5.1          | 1.8         | virginica |
150 rows × 5 columns
# read the csv with pyspark
df_iris = spark.read.csv(
"testdata/iris.csv",
header=True,
inferSchema=True,
)
# because of lazy evaluation, only the column schema is displayed
display(df_iris)
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
(↑ Since we are reading from csv, options for the header and for automatic schema inference are being passed.)
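Instead of relying on inferSchema, the schema can also be declared explicitly; a minimal sketch (the variable names are made up for this example, the column names follow iris.csv):
from pyspark.sql.types import StructType, StructField, DoubleType, StringType

iris_schema = StructType([
    StructField("sepal_length", DoubleType()),
    StructField("sepal_width", DoubleType()),
    StructField("petal_length", DoubleType()),
    StructField("petal_width", DoubleType()),
    StructField("species", StringType()),
])
df_iris_typed = spark.read.csv("testdata/iris.csv", header=True, schema=iris_schema)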
※ Note:
# evaluation only actually happens when an action such as show is called
df_iris.show(5)  # show the first 5 rows
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
| 5.1| 3.5| 1.4| 0.2| setosa|
| 4.9| 3.0| 1.4| 0.2| setosa|
| 4.7| 3.2| 1.3| 0.2| setosa|
| 4.6| 3.1| 1.5| 0.2| setosa|
| 5.0| 3.6| 1.4| 0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows
# select columns with select
(df_iris
.select(["sepal_length", "sepal_width"])
.show(3)
)
+------------+-----------+
|sepal_length|sepal_width|
+------------+-----------+
| 5.1| 3.5|
| 4.9| 3.0|
| 4.7| 3.2|
+------------+-----------+
only showing top 3 rows
# a simple query with filter
(df_iris
.filter(df_iris.species == "virginica")
.filter(df_iris.sepal_length > 5.0)
.show(3)
)
+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+---------+
| 6.3| 3.3| 6.0| 2.5|virginica|
| 5.8| 2.7| 5.1| 1.9|virginica|
| 7.1| 3.0| 5.9| 2.1|virginica|
+------------+-----------+------------+-----------+---------+
only showing top 3 rows
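The same filter can also be written with pyspark.sql.functions.col, which avoids repeating the DataFrame variable; a sketch:
from pyspark.sql import functions as F

(df_iris
 .filter((F.col("species") == "virginica") & (F.col("sepal_length") > 5.0))
 .show(3)
)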
# aggregation with groupBy (1)
(df_iris
.groupBy('species')
.count()
.show()
)
+----------+-----+
| species|count|
+----------+-----+
| virginica| 50|
|versicolor| 50|
| setosa| 50|
+----------+-----+
# aggregation with groupBy (2)
(df_iris
.groupBy('species')
.agg({
'sepal_length': 'mean',
'sepal_width': 'mean',
'petal_length': 'mean',
'petal_width': 'mean',
})
.show()
)
+----------+------------------+------------------+-----------------+------------------+
| species| avg(sepal_width)| avg(petal_width)|avg(sepal_length)| avg(petal_length)|
+----------+------------------+------------------+-----------------+------------------+
| virginica|2.9739999999999998| 2.026|6.587999999999998| 5.552|
|versicolor|2.7700000000000005|1.3259999999999998| 5.936| 4.26|
| setosa| 3.428000000000001|0.2459999999999999|5.005999999999999|1.4620000000000002|
+----------+------------------+------------------+-----------------+------------------+
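To control the names of the aggregated columns, the same aggregation can be written with pyspark.sql.functions and alias; a sketch:
from pyspark.sql import functions as F

(df_iris
 .groupBy("species")
 .agg(
     F.mean("sepal_length").alias("sepal_length_mean"),
     F.mean("petal_length").alias("petal_length_mean"),
 )
 .show()
)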
# summary of the DataFrame
df_iris.describe().show()
23/04/25 00:13:09 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+-------+------------------+-------------------+------------------+------------------+---------+
|summary| sepal_length| sepal_width| petal_length| petal_width| species|
+-------+------------------+-------------------+------------------+------------------+---------+
| count| 150| 150| 150| 150| 150|
| mean| 5.843333333333335| 3.057333333333334|3.7580000000000027| 1.199333333333334| null|
| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467| null|
| min| 4.3| 2.0| 1.0| 0.1| setosa|
| max| 7.9| 4.4| 6.9| 2.5|virginica|
+-------+------------------+-------------------+------------------+------------------+---------+
# queries can also be written in SQL
df_iris.createOrReplaceTempView('iris')
spark.sql("show tables").show()
tmp = spark.sql("SELECT * FROM iris WHERE species = 'setosa'")
tmp.show()
+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
| | iris| true|
+---------+---------+-----------+
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
| 5.1| 3.5| 1.4| 0.2| setosa|
| 4.9| 3.0| 1.4| 0.2| setosa|
| 4.7| 3.2| 1.3| 0.2| setosa|
| 4.6| 3.1| 1.5| 0.2| setosa|
| 5.0| 3.6| 1.4| 0.2| setosa|
| 5.4| 3.9| 1.7| 0.4| setosa|
| 4.6| 3.4| 1.4| 0.3| setosa|
| 5.0| 3.4| 1.5| 0.2| setosa|
| 4.4| 2.9| 1.4| 0.2| setosa|
| 4.9| 3.1| 1.5| 0.1| setosa|
| 5.4| 3.7| 1.5| 0.2| setosa|
| 4.8| 3.4| 1.6| 0.2| setosa|
| 4.8| 3.0| 1.4| 0.1| setosa|
| 4.3| 3.0| 1.1| 0.1| setosa|
| 5.8| 4.0| 1.2| 0.2| setosa|
| 5.7| 4.4| 1.5| 0.4| setosa|
| 5.4| 3.9| 1.3| 0.4| setosa|
| 5.1| 3.5| 1.4| 0.3| setosa|
| 5.7| 3.8| 1.7| 0.3| setosa|
| 5.1| 3.8| 1.5| 0.3| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 20 rows
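For example, the earlier groupBy aggregation can equally be expressed as SQL against the temp view; a sketch:
spark.sql("""
    SELECT species, COUNT(*) AS n, AVG(sepal_length) AS avg_sepal_length
    FROM iris
    GROUP BY species
""").show()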
For details, see the User Guide, the API Reference, and the Spark SQL reference.
Next, let's look at reading and writing data.
# we read from csv above for convenience, but type information gets lost that way; also, to check the pandas interop, convert a pandas DataFrame directly into pyspark
display(iris.__class__)  # iris is a pandas DataFrame
df = spark.createDataFrame(iris)
display(df)
df.show()
pandas.core.frame.DataFrame
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
| 5.1| 3.5| 1.4| 0.2| setosa|
| 4.9| 3.0| 1.4| 0.2| setosa|
| 4.7| 3.2| 1.3| 0.2| setosa|
| 4.6| 3.1| 1.5| 0.2| setosa|
| 5.0| 3.6| 1.4| 0.2| setosa|
| 5.4| 3.9| 1.7| 0.4| setosa|
| 4.6| 3.4| 1.4| 0.3| setosa|
| 5.0| 3.4| 1.5| 0.2| setosa|
| 4.4| 2.9| 1.4| 0.2| setosa|
| 4.9| 3.1| 1.5| 0.1| setosa|
| 5.4| 3.7| 1.5| 0.2| setosa|
| 4.8| 3.4| 1.6| 0.2| setosa|
| 4.8| 3.0| 1.4| 0.1| setosa|
| 4.3| 3.0| 1.1| 0.1| setosa|
| 5.8| 4.0| 1.2| 0.2| setosa|
| 5.7| 4.4| 1.5| 0.4| setosa|
| 5.4| 3.9| 1.3| 0.4| setosa|
| 5.1| 3.5| 1.4| 0.3| setosa|
| 5.7| 3.8| 1.7| 0.3| setosa|
| 5.1| 3.8| 1.5| 0.3| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 20 rows
# convert a Spark DataFrame into a pandas DataFrame
display(df.__class__) # pyspark.sql.dataframe.DataFrame
pdf = df.toPandas()
display(pdf)
pyspark.sql.dataframe.DataFrame
|     | sepal_length | sepal_width | petal_length | petal_width | species   |
|-----|--------------|-------------|--------------|-------------|-----------|
| 0   | 5.1          | 3.5         | 1.4          | 0.2         | setosa    |
| 1   | 4.9          | 3.0         | 1.4          | 0.2         | setosa    |
| 2   | 4.7          | 3.2         | 1.3          | 0.2         | setosa    |
| 3   | 4.6          | 3.1         | 1.5          | 0.2         | setosa    |
| 4   | 5.0          | 3.6         | 1.4          | 0.2         | setosa    |
| ... | ...          | ...         | ...          | ...         | ...       |
| 145 | 6.7          | 3.0         | 5.2          | 2.3         | virginica |
| 146 | 6.3          | 2.5         | 5.0          | 1.9         | virginica |
| 147 | 6.5          | 3.0         | 5.2          | 2.0         | virginica |
| 148 | 6.2          | 3.4         | 5.4          | 2.3         | virginica |
| 149 | 5.9          | 3.0         | 5.1          | 1.8         | virginica |
150 rows × 5 columns
↑ A typical workflow: process large-to-medium data with pyspark to reduce its size, and once it fits in memory, call toPandas to convert it into an easier-to-handle pandas DataFrame.
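Sketched as code, the pattern looks like this (aggregate on Spark, then bring only the small result into pandas):
agg_pdf = (
    df.groupBy("species")
    .count()
    .toPandas()  # only the aggregated (small) result is collected to the driver
)
print(agg_pdf)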
Finally, let's write and read data in various formats.
As mentioned earlier, the storage format has a direct impact on Spark's processing performance, so how this is handled really matters.
# df is a Spark DataFrame
# csv
df.write.save("testdata/iris_csv", format="csv")
# instead of the format option, this can also be written as below (gzip-compressed)
# df.write.csv("testdata/iris_csv", compression="gzip")
!ls testdata/iris_csv
_SUCCESS
part-00000-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00001-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00002-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00003-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00004-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00005-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00006-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00007-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
!head testdata/iris_csv/part-00000-*.csv
5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5.0,3.6,1.4,0.2,setosa
5.4,3.9,1.7,0.4,setosa
4.6,3.4,1.4,0.3,setosa
5.0,3.4,1.5,0.2,setosa
4.4,2.9,1.4,0.2,setosa
4.9,3.1,1.5,0.1,setosa
↑ Unlike pandas, you do not point at a single file; you specify a directory to save into (the files are generated in a distributed fashion).
When reading, individual files can also be specified (e.g., to read files created outside Spark), but the basic pattern is to point at the directory that was saved.
df_csv = spark.read.load("testdata/iris_csv", format="csv")
# it is also possible to omit format and write it as below
# df_csv = spark.read.csv("testdata/iris_csv")
df_csv.show(3)
+---+---+---+---+----------+
|_c0|_c1|_c2|_c3| _c4|
+---+---+---+---+----------+
|6.3|3.3|4.7|1.6|versicolor|
|4.9|2.4|3.3|1.0|versicolor|
|6.6|2.9|4.6|1.3|versicolor|
+---+---+---+---+----------+
only showing top 3 rows
(↑ With csv, information such as column names tends to be lost, which makes it awkward to use.)
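If the column names should be kept, header=True can be passed when writing and reading the csv; a sketch (the output directory name is made up for this example):
df.write.csv("testdata/iris_csv_with_header", header=True)
df_csv_h = spark.read.csv("testdata/iris_csv_with_header", header=True, inferSchema=True)
df_csv_h.show(3)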
# besides csv, other text formats such as json can also be used
# the compression option selects the compression codec
df.write.json("testdata/iris_json", compression='gzip')
!ls testdata/iris_json
_SUCCESS
part-00000-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00001-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00002-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00003-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00004-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00005-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00006-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00007-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
# reading works the same way
df_json = spark.read.json("testdata/iris_json")
# or df_json = spark.read.load("testdata/iris_json", format="json")
display(df_json)
df_json.show(3)
DataFrame[petal_length: double, petal_width: double, sepal_length: double, sepal_width: double, species: string]
+------------+-----------+------------+-----------+----------+
|petal_length|petal_width|sepal_length|sepal_width| species|
+------------+-----------+------------+-----------+----------+
| 3.3| 1.0| 5.0| 2.3|versicolor|
| 4.2| 1.3| 5.6| 2.7|versicolor|
| 4.2| 1.2| 5.7| 3.0|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows
(↑ With json, more type information is preserved than with csv.)
# formats better suited to data analysis / processing, such as snappy.parquet and zlib.orc
df.write.parquet("testdata/iris_parquet", compression="snappy")
!ls testdata/iris_parquet
df.write.orc("testdata/iris_orc", compression="zlib")
!ls testdata/iris_orc
23/04/25 00:13:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
_SUCCESS
part-00000-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00001-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00002-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00003-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00004-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00005-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00006-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00007-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
_SUCCESS
part-00000-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00001-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00002-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00003-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00004-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00005-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00006-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00007-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
# read the data back
# parquet
print("parquet:")
df_parquet = spark.read.parquet("testdata/iris_parquet")
display(df_parquet)
df_parquet.show(3)
print("\n------------------\n")
# orc
print("orc:")
df_orc = spark.read.orc("testdata/iris_orc")
display(df_orc)
df_orc.show(3)
parquet:
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+----------+
| 5.0| 2.3| 3.3| 1.0|versicolor|
| 5.6| 2.7| 4.2| 1.3|versicolor|
| 5.7| 3.0| 4.2| 1.2|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows
------------------
orc:
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+----------+
| 5.0| 2.3| 3.3| 1.0|versicolor|
| 5.6| 2.7| 4.2| 1.3|versicolor|
| 5.7| 3.0| 4.2| 1.2|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows
# data can also be saved partitioned
df.write.save(
"testdata/iris_with_partition",
format="parquet",
compression="snappy",
partitionBy="species"
)
!cd testdata/iris_with_partition && tree
.
├── _SUCCESS
├── species=setosa
│   ├── part-00000-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   ├── part-00001-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   └── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
├── species=versicolor
│   ├── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   ├── part-00003-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   ├── part-00004-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   └── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
└── species=virginica
    ├── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
    ├── part-00006-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
    └── part-00007-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
4 directories, 11 files
A separate directory is created for each value of the column "species", and the data is stored underneath it.
As with indexes in an RDB, choosing partitions that match how the data is accessed improves performance (and a poor choice makes it worse).
In the example above, analyses that target a specific species only touch the data for that species, which limits the amount of data accessed. If, on the other hand, most queries span multiple species, no improvement can be expected and performance may even degrade.
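For example, a read that filters on the partition column only has to scan the matching species=... directories; a sketch:
from pyspark.sql import functions as F

setosa_only = (
    spark.read.parquet("testdata/iris_with_partition")
    .filter(F.col("species") == "setosa")  # pruned down to the species=setosa/ partition
)
setosa_only.show(3)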
# reading works the same way even when the data is partitioned
df_partition = spark.read.load("testdata/iris_with_partition")
display(df_partition)
df_partition.show(3)
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+---------+
| 7.9| 3.8| 6.4| 2.0|virginica|
| 6.4| 2.8| 5.6| 2.2|virginica|
| 6.3| 2.8| 5.1| 1.5|virginica|
+------------+-----------+------------+-----------+---------+
only showing top 3 rows
Besides these, you can also write data as a persisted table with saveAsTable.
As listed under Data Sources, many other storage formats and systems are supported as well, such as relational databases (via JDBC) and Hive tables.
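A minimal saveAsTable sketch (with the default local catalog this writes under a spark-warehouse directory; the table name is made up for this example):
df.write.mode("overwrite").saveAsTable("iris_persisted")  # hypothetical table name
spark.sql("SELECT COUNT(*) FROM iris_persisted").show()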
import pyspark.pandas as ps
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.
warnings.warn(
Reference: https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/index.html
Recently, a way of working with Spark through an API quite close to pandas has been taking shape.
Koalas, a tool for using Spark in a pandas-like way, also exists; judging from the linked discussion, it appears to have been officially merged into pyspark.
import pyspark.pandas as ps
# create a pandas-on-Spark DataFrame
psdf = ps.read_csv("testdata/iris.csv")
display(psdf.__class__) # pyspark.pandas.frame.DataFrame
psdf.head(5)  # head shows the first rows, just like a pandas DataFrame
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: If `index_col` is not specified for `read_csv`, the default index is attached which can cause additional overhead.
warnings.warn(message, PandasAPIOnSparkAdviceWarning)
pyspark.pandas.frame.DataFrame
|   | sepal_length | sepal_width | petal_length | petal_width | species |
|---|--------------|-------------|--------------|-------------|---------|
| 0 | 5.1          | 3.5         | 1.4          | 0.2         | setosa  |
| 1 | 4.9          | 3.0         | 1.4          | 0.2         | setosa  |
| 2 | 4.7          | 3.2         | 1.3          | 0.2         | setosa  |
| 3 | 4.6          | 3.1         | 1.5          | 0.2         | setosa  |
| 4 | 5.0          | 3.6         | 1.4          | 0.2         | setosa  |
# filtering and display also work like regular pandas
psdf[psdf['sepal_length'] > 7.5]
|     | sepal_length | sepal_width | petal_length | petal_width | species   |
|-----|--------------|-------------|--------------|-------------|-----------|
| 105 | 7.6          | 3.0         | 6.6          | 2.1         | virginica |
| 117 | 7.7          | 3.8         | 6.7          | 2.2         | virginica |
| 118 | 7.7          | 2.6         | 6.9          | 2.3         | virginica |
| 122 | 7.7          | 2.8         | 6.7          | 2.0         | virginica |
| 131 | 7.9          | 3.8         | 6.4          | 2.0         | virginica |
| 135 | 7.7          | 3.0         | 6.1          | 2.3         | virginica |
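Aggregation also follows the pandas style; a small sketch:
# mean sepal_length per species, evaluated lazily on Spark
psdf.groupby("species")["sepal_length"].mean()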
# convert to a pyspark DataFrame
sdf = psdf.to_spark(index_col=["index"])
sdf.show(5)
+-----+------------+-----------+------------+-----------+-------+
|index|sepal_length|sepal_width|petal_length|petal_width|species|
+-----+------------+-----------+------------+-----------+-------+
| 0| 5.1| 3.5| 1.4| 0.2| setosa|
| 1| 4.9| 3.0| 1.4| 0.2| setosa|
| 2| 4.7| 3.2| 1.3| 0.2| setosa|
| 3| 4.6| 3.1| 1.5| 0.2| setosa|
| 4| 5.0| 3.6| 1.4| 0.2| setosa|
+-----+------------+-----------+------------+-----------+-------+
only showing top 5 rows
# convert to plain pandas
pdf = psdf.to_pandas()
display(pdf.__class__) # pandas.core.frame.DataFrame
pdf.head()
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: `to_pandas` loads all data into the driver's memory. It should only be used if the resulting pandas DataFrame is expected to be small.
warnings.warn(message, PandasAPIOnSparkAdviceWarning)
pandas.core.frame.DataFrame
|   | sepal_length | sepal_width | petal_length | petal_width | species |
|---|--------------|-------------|--------------|-------------|---------|
| 0 | 5.1          | 3.5         | 1.4          | 0.2         | setosa  |
| 1 | 4.9          | 3.0         | 1.4          | 0.2         | setosa  |
| 2 | 4.7          | 3.2         | 1.3          | 0.2         | setosa  |
| 3 | 4.6          | 3.1         | 1.5          | 0.2         | setosa  |
| 4 | 5.0          | 3.6         | 1.4          | 0.2         | setosa  |
(For reference) dask (https://www.dask.org/): a library whose API is highly compatible with, and integrates well with, pandas and numpy, and which supports lazy evaluation and parallel/distributed processing.
For example, large tables can be processed lazily and in parallel as dask DataFrames and converted to pandas DataFrames once the data has become small enough, which is essentially the same workflow attempted here with pyspark (a short dask sketch follows after the list below).
Compared to dask, pyspark:
- can handle more seriously large data because it can run on a cluster
  - (it can also be used lightly, in the way introduced here)
- runs on a Java backend, and serious use requires building a cluster, so at that level the bar for setting up a working environment is higher
- pandas/numpy affinity used to be higher with dask
  - dask often lets you use the same methods as-is, just with lazy evaluation
  - Spark is instead closer to SQL
  - however, as noted above, the pandas API is being fleshed out, so pyspark can now also be used largely in the pandas style
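For comparison, a minimal dask sketch (dask is not part of the environment above, so this assumes it is installed separately):
import dask.dataframe as dd

ddf = dd.read_csv("testdata/iris.csv")                 # lazy, partitioned DataFrame; nothing is read yet
small_pdf = ddf[ddf["sepal_length"] > 7.5].compute()   # filter lazily, then materialize as pandas
print(small_pdf)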
name: pyspark-intro | |
channels: | |
- conda-forge | |
dependencies: | |
- python=3.11 | |
- pyspark>=3.4 | |
- openjdk=17 | |
- pandas>=2.0.1 | |
- pyarrow | |
# for testdata | |
- numpy | |
- seaborn | |
# jupyter | |
- jupyterlab |
name: pyspark-intro | |
channels: | |
- conda-forge | |
dependencies: | |
- _libgcc_mutex=0.1=conda_forge | |
- _openmp_mutex=4.5=2_gnu | |
- aiofiles=22.1.0=pyhd8ed1ab_0 | |
- aiosqlite=0.18.0=pyhd8ed1ab_0 | |
- alsa-lib=1.2.8=h166bdaf_0 | |
- anyio=3.6.2=pyhd8ed1ab_0 | |
- argon2-cffi=21.3.0=pyhd8ed1ab_0 | |
- argon2-cffi-bindings=21.2.0=py311hd4cff14_3 | |
- arrow-cpp=11.0.0=ha770c72_14_cpu | |
- asttokens=2.2.1=pyhd8ed1ab_0 | |
- attrs=22.2.0=pyh71513ae_0 | |
- aws-c-auth=0.6.26=hf365957_1 | |
- aws-c-cal=0.5.21=h48707d8_2 | |
- aws-c-common=0.8.14=h0b41bf4_0 | |
- aws-c-compression=0.2.16=h03acc5a_5 | |
- aws-c-event-stream=0.2.20=h00877a2_4 | |
- aws-c-http=0.7.6=hf342b9f_0 | |
- aws-c-io=0.13.19=h5b20300_3 | |
- aws-c-mqtt=0.8.6=hc4349f7_12 | |
- aws-c-s3=0.2.7=h909e904_1 | |
- aws-c-sdkutils=0.1.8=h03acc5a_0 | |
- aws-checksums=0.1.14=h03acc5a_5 | |
- aws-crt-cpp=0.19.8=hf7fbfca_12 | |
- aws-sdk-cpp=1.10.57=h17c43bd_8 | |
- babel=2.12.1=pyhd8ed1ab_1 | |
- backcall=0.2.0=pyh9f0ad1d_0 | |
- backports=1.0=pyhd8ed1ab_3 | |
- backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 | |
- beautifulsoup4=4.12.2=pyha770c72_0 | |
- bleach=6.0.0=pyhd8ed1ab_0 | |
- brotli=1.0.9=h166bdaf_8 | |
- brotli-bin=1.0.9=h166bdaf_8 | |
- brotlipy=0.7.0=py311hd4cff14_1005 | |
- bzip2=1.0.8=h7f98852_4 | |
- c-ares=1.18.1=h7f98852_0 | |
- ca-certificates=2022.12.7=ha878542_0 | |
- cairo=1.16.0=h35add3b_1015 | |
- certifi=2022.12.7=pyhd8ed1ab_0 | |
- cffi=1.15.1=py311h409f033_3 | |
- charset-normalizer=3.1.0=pyhd8ed1ab_0 | |
- comm=0.1.3=pyhd8ed1ab_0 | |
- contourpy=1.0.7=py311ha3edf6b_0 | |
- cryptography=40.0.2=py311h9b4c7bb_0 | |
- cycler=0.11.0=pyhd8ed1ab_0 | |
- debugpy=1.6.7=py311hcafe171_0 | |
- decorator=5.1.1=pyhd8ed1ab_0 | |
- defusedxml=0.7.1=pyhd8ed1ab_0 | |
- entrypoints=0.4=pyhd8ed1ab_0 | |
- executing=1.2.0=pyhd8ed1ab_0 | |
- expat=2.5.0=hcb278e6_1 | |
- flit-core=3.8.0=pyhd8ed1ab_0 | |
- font-ttf-dejavu-sans-mono=2.37=hab24e00_0 | |
- font-ttf-inconsolata=3.000=h77eed37_0 | |
- font-ttf-source-code-pro=2.038=h77eed37_0 | |
- font-ttf-ubuntu=0.83=hab24e00_0 | |
- fontconfig=2.14.2=h14ed4e7_0 | |
- fonts-conda-ecosystem=1=0 | |
- fonts-conda-forge=1=0 | |
- fonttools=4.39.3=py311h2582759_0 | |
- freetype=2.12.1=hca18f0e_1 | |
- gettext=0.21.1=h27087fc_0 | |
- gflags=2.2.2=he1b5a44_1004 | |
- giflib=5.2.1=h0b41bf4_3 | |
- glog=0.6.0=h6f12383_0 | |
- graphite2=1.3.13=h58526e2_1001 | |
- harfbuzz=6.0.0=h3ff4399_1 | |
- icu=72.1=hcb278e6_0 | |
- idna=3.4=pyhd8ed1ab_0 | |
- importlib-metadata=6.6.0=pyha770c72_0 | |
- importlib_metadata=6.6.0=hd8ed1ab_0 | |
- importlib_resources=5.12.0=pyhd8ed1ab_0 | |
- ipykernel=6.22.0=pyh210e3f2_0 | |
- ipython=8.12.0=pyh41d4057_0 | |
- ipython_genutils=0.2.0=py_1 | |
- jedi=0.18.2=pyhd8ed1ab_0 | |
- jinja2=3.1.2=pyhd8ed1ab_1 | |
- json5=0.9.5=pyh9f0ad1d_0 | |
- jsonschema=4.17.3=pyhd8ed1ab_0 | |
- jupyter_client=8.2.0=pyhd8ed1ab_0 | |
- jupyter_core=5.3.0=py311h38be061_0 | |
- jupyter_events=0.6.3=pyhd8ed1ab_0 | |
- jupyter_server=2.5.0=pyhd8ed1ab_0 | |
- jupyter_server_fileid=0.9.0=pyhd8ed1ab_0 | |
- jupyter_server_terminals=0.4.4=pyhd8ed1ab_1 | |
- jupyter_server_ydoc=0.8.0=pyhd8ed1ab_0 | |
- jupyter_ydoc=0.2.3=pyhd8ed1ab_0 | |
- jupyterlab=3.6.3=pyhd8ed1ab_0 | |
- jupyterlab_pygments=0.2.2=pyhd8ed1ab_0 | |
- jupyterlab_server=2.22.1=pyhd8ed1ab_0 | |
- keyutils=1.6.1=h166bdaf_0 | |
- kiwisolver=1.4.4=py311h4dd048b_1 | |
- krb5=1.20.1=h81ceb04_0 | |
- lcms2=2.15=haa2dc70_1 | |
- ld_impl_linux-64=2.40=h41732ed_0 | |
- lerc=4.0.0=h27087fc_0 | |
- libabseil=20230125.0=cxx17_hcb278e6_1 | |
- libarrow=11.0.0=h93537a5_14_cpu | |
- libblas=3.9.0=16_linux64_openblas | |
- libbrotlicommon=1.0.9=h166bdaf_8 | |
- libbrotlidec=1.0.9=h166bdaf_8 | |
- libbrotlienc=1.0.9=h166bdaf_8 | |
- libcblas=3.9.0=16_linux64_openblas | |
- libcrc32c=1.1.2=h9c3ff4c_0 | |
- libcups=2.3.3=h36d4200_3 | |
- libcurl=8.0.1=h588be90_0 | |
- libdeflate=1.18=h0b41bf4_0 | |
- libedit=3.1.20191231=he28a2e2_2 | |
- libev=4.33=h516909a_1 | |
- libevent=2.1.10=h28343ad_4 | |
- libexpat=2.5.0=hcb278e6_1 | |
- libffi=3.4.2=h7f98852_5 | |
- libgcc-ng=12.2.0=h65d4601_19 | |
- libgfortran-ng=12.2.0=h69a702a_19 | |
- libgfortran5=12.2.0=h337968e_19 | |
- libglib=2.76.2=hebfc3b9_0 | |
- libgomp=12.2.0=h65d4601_19 | |
- libgoogle-cloud=2.8.0=h0bc5f78_1 | |
- libgrpc=1.52.1=hcf146ea_1 | |
- libiconv=1.17=h166bdaf_0 | |
- libjpeg-turbo=2.1.5.1=h0b41bf4_0 | |
- liblapack=3.9.0=16_linux64_openblas | |
- libnghttp2=1.52.0=h61bc06f_0 | |
- libnsl=2.0.0=h7f98852_0 | |
- libnuma=2.0.16=h0b41bf4_1 | |
- libopenblas=0.3.21=pthreads_h78a6416_3 | |
- libpng=1.6.39=h753d276_0 | |
- libprotobuf=3.21.12=h3eb15da_0 | |
- libsodium=1.0.18=h36c2ea0_1 | |
- libsqlite=3.40.0=h753d276_1 | |
- libssh2=1.10.0=hf14f497_3 | |
- libstdcxx-ng=12.2.0=h46fd767_19 | |
- libthrift=0.18.1=h5e4af38_0 | |
- libtiff=4.5.0=ha587672_6 | |
- libutf8proc=2.8.0=h166bdaf_0 | |
- libuuid=2.38.1=h0b41bf4_0 | |
- libwebp-base=1.3.0=h0b41bf4_0 | |
- libxcb=1.13=h7f98852_1004 | |
- libzlib=1.2.13=h166bdaf_4 | |
- lz4-c=1.9.4=hcb278e6_0 | |
- markupsafe=2.1.2=py311h2582759_0 | |
- matplotlib-base=3.7.1=py311h8597a09_0 | |
- matplotlib-inline=0.1.6=pyhd8ed1ab_0 | |
- mistune=2.0.5=pyhd8ed1ab_0 | |
- munkres=1.1.4=pyh9f0ad1d_0 | |
- nbclassic=0.5.5=pyhb4ecaf3_1 | |
- nbclient=0.7.3=pyhd8ed1ab_0 | |
- nbconvert=7.3.1=pyhd8ed1ab_0 | |
- nbconvert-core=7.3.1=pyhd8ed1ab_0 | |
- nbconvert-pandoc=7.3.1=pyhd8ed1ab_0 | |
- nbformat=5.8.0=pyhd8ed1ab_0 | |
- ncurses=6.3=h27087fc_1 | |
- nest-asyncio=1.5.6=pyhd8ed1ab_0 | |
- notebook=6.5.4=pyha770c72_0 | |
- notebook-shim=0.2.2=pyhd8ed1ab_0 | |
- numpy=1.24.3=py311h64a7726_0 | |
- openjdk=17.0.3=h4335b31_6 | |
- openjpeg=2.5.0=hfec8fc6_2 | |
- openssl=3.1.0=hd590300_1 | |
- orc=1.8.3=hfdbbad2_0 | |
- packaging=23.1=pyhd8ed1ab_0 | |
- pandas=2.0.1=py311h320fe9a_0 | |
- pandoc=2.19.2=h32600fe_2 | |
- pandocfilters=1.5.0=pyhd8ed1ab_0 | |
- parquet-cpp=1.5.1=2 | |
- parso=0.8.3=pyhd8ed1ab_0 | |
- patsy=0.5.3=pyhd8ed1ab_0 | |
- pcre2=10.40=hc3806b6_0 | |
- pexpect=4.8.0=pyh1a96a4e_2 | |
- pickleshare=0.7.5=py_1003 | |
- pillow=9.5.0=py311h573f0d3_0 | |
- pip=23.1.1=pyhd8ed1ab_0 | |
- pixman=0.40.0=h36c2ea0_0 | |
- pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0 | |
- platformdirs=3.2.0=pyhd8ed1ab_0 | |
- pooch=1.7.0=pyha770c72_3 | |
- prometheus_client=0.16.0=pyhd8ed1ab_0 | |
- prompt-toolkit=3.0.38=pyha770c72_0 | |
- prompt_toolkit=3.0.38=hd8ed1ab_0 | |
- psutil=5.9.5=py311h2582759_0 | |
- pthread-stubs=0.4=h36c2ea0_1001 | |
- ptyprocess=0.7.0=pyhd3deb0d_0 | |
- pure_eval=0.2.2=pyhd8ed1ab_0 | |
- py4j=0.10.9.7=pyhd8ed1ab_0 | |
- pyarrow=11.0.0=py311hbdf6286_14_cpu | |
- pycparser=2.21=pyhd8ed1ab_0 | |
- pygments=2.15.1=pyhd8ed1ab_0 | |
- pyopenssl=23.1.1=pyhd8ed1ab_0 | |
- pyparsing=3.0.9=pyhd8ed1ab_0 | |
- pyrsistent=0.19.3=py311h2582759_0 | |
- pysocks=1.7.1=pyha2e5f31_6 | |
- pyspark=3.4.0=pyhd8ed1ab_0 | |
- python=3.11.3=h2755cc3_0_cpython | |
- python-dateutil=2.8.2=pyhd8ed1ab_0 | |
- python-fastjsonschema=2.16.3=pyhd8ed1ab_0 | |
- python-json-logger=2.0.7=pyhd8ed1ab_0 | |
- python-tzdata=2023.3=pyhd8ed1ab_0 | |
- python_abi=3.11=3_cp311 | |
- pytz=2023.3=pyhd8ed1ab_0 | |
- pyyaml=6.0=py311hd4cff14_5 | |
- pyzmq=25.0.2=py311hd6ccaeb_0 | |
- re2=2023.02.02=hcb278e6_0 | |
- readline=8.2=h8228510_1 | |
- requests=2.28.2=pyhd8ed1ab_1 | |
- rfc3339-validator=0.1.4=pyhd8ed1ab_0 | |
- rfc3986-validator=0.1.1=pyh9f0ad1d_0 | |
- s2n=1.3.41=h3358134_0 | |
- scipy=1.10.1=py311h8e6699e_0 | |
- seaborn=0.12.2=hd8ed1ab_0 | |
- seaborn-base=0.12.2=pyhd8ed1ab_0 | |
- send2trash=1.8.0=pyhd8ed1ab_0 | |
- setuptools=67.7.1=pyhd8ed1ab_0 | |
- six=1.16.0=pyh6c4a22f_0 | |
- snappy=1.1.10=h9fff704_0 | |
- sniffio=1.3.0=pyhd8ed1ab_0 | |
- soupsieve=2.3.2.post1=pyhd8ed1ab_0 | |
- stack_data=0.6.2=pyhd8ed1ab_0 | |
- statsmodels=0.13.5=py311h4c7f6c3_2 | |
- terminado=0.17.1=pyh41d4057_0 | |
- tinycss2=1.2.1=pyhd8ed1ab_0 | |
- tk=8.6.12=h27826a3_0 | |
- tomli=2.0.1=pyhd8ed1ab_0 | |
- tornado=6.3=py311h2582759_0 | |
- traitlets=5.9.0=pyhd8ed1ab_0 | |
- typing-extensions=4.5.0=hd8ed1ab_0 | |
- typing_extensions=4.5.0=pyha770c72_0 | |
- tzdata=2023c=h71feb2d_0 | |
- ucx=1.14.0=h8c404fb_1 | |
- urllib3=1.26.15=pyhd8ed1ab_0 | |
- wcwidth=0.2.6=pyhd8ed1ab_0 | |
- webencodings=0.5.1=py_1 | |
- websocket-client=1.5.1=pyhd8ed1ab_0 | |
- wheel=0.40.0=pyhd8ed1ab_0 | |
- xorg-fixesproto=5.0=h7f98852_1002 | |
- xorg-inputproto=2.3.2=h7f98852_1002 | |
- xorg-kbproto=1.0.7=h7f98852_1002 | |
- xorg-libice=1.0.10=h7f98852_0 | |
- xorg-libsm=1.2.3=hd9c2040_1000 | |
- xorg-libx11=1.8.4=h0b41bf4_0 | |
- xorg-libxau=1.0.9=h7f98852_0 | |
- xorg-libxdmcp=1.1.3=h7f98852_0 | |
- xorg-libxext=1.3.4=h0b41bf4_2 | |
- xorg-libxfixes=5.0.3=h7f98852_1004 | |
- xorg-libxi=1.7.10=h7f98852_0 | |
- xorg-libxrender=0.9.10=h7f98852_1003 | |
- xorg-libxtst=1.2.3=h7f98852_1002 | |
- xorg-recordproto=1.14.2=h7f98852_1002 | |
- xorg-renderproto=0.11.1=h7f98852_1002 | |
- xorg-xextproto=7.3.0=h0b41bf4_1003 | |
- xorg-xproto=7.0.31=h7f98852_1007 | |
- xz=5.2.6=h166bdaf_0 | |
- y-py=0.5.9=py311hfe55011_0 | |
- yaml=0.2.5=h7f98852_2 | |
- ypy-websocket=0.8.2=pyhd8ed1ab_0 | |
- zeromq=4.3.4=h9c3ff4c_1 | |
- zipp=3.15.0=pyhd8ed1ab_0 | |
- zlib=1.2.13=h166bdaf_4 | |
- zstd=1.5.2=h3eb15da_6 | |
# prefix: /path/to/pyspark-intro |