- Build the environment on Linux using Mambaforge
- Set cluster management aside and quickly put together a minimal environment for learning
- python 3.11
- pyspark 3.4.0
- openjdk 17
- pandas 2.0.1
    - to check interoperability with pandas
- other tools such as Jupyter
```sh
# use mambaforge
mamba env create -f pyspark_env.yaml
```
| # Created by https://www.toptal.com/developers/gitignore/api/jupyternotebooks,visualstudiocode,python,pycharm+all | |
| # Edit at https://www.toptal.com/developers/gitignore?templates=jupyternotebooks,visualstudiocode,python,pycharm+all | |
| ### JupyterNotebooks ### | |
| # gitignore template for Jupyter Notebooks | |
| # website: http://jupyter.org/ | |
| .ipynb_checkpoints | |
| */.ipynb_checkpoints/* | |
| # IPython | |
| profile_default/ | |
| ipython_config.py | |
| # Remove previous ipynb_checkpoints | |
| # git rm -r .ipynb_checkpoints/ | |
| ### PyCharm+all ### | |
| # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider | |
| # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 | |
| # User-specific stuff | |
| .idea/**/workspace.xml | |
| .idea/**/tasks.xml | |
| .idea/**/usage.statistics.xml | |
| .idea/**/dictionaries | |
| .idea/**/shelf | |
| # AWS User-specific | |
| .idea/**/aws.xml | |
| # Generated files | |
| .idea/**/contentModel.xml | |
| # Sensitive or high-churn files | |
| .idea/**/dataSources/ | |
| .idea/**/dataSources.ids | |
| .idea/**/dataSources.local.xml | |
| .idea/**/sqlDataSources.xml | |
| .idea/**/dynamic.xml | |
| .idea/**/uiDesigner.xml | |
| .idea/**/dbnavigator.xml | |
| # Gradle | |
| .idea/**/gradle.xml | |
| .idea/**/libraries | |
| # Gradle and Maven with auto-import | |
| # When using Gradle or Maven with auto-import, you should exclude module files, | |
| # since they will be recreated, and may cause churn. Uncomment if using | |
| # auto-import. | |
| # .idea/artifacts | |
| # .idea/compiler.xml | |
| # .idea/jarRepositories.xml | |
| # .idea/modules.xml | |
| # .idea/*.iml | |
| # .idea/modules | |
| # *.iml | |
| # *.ipr | |
| # CMake | |
| cmake-build-*/ | |
| # Mongo Explorer plugin | |
| .idea/**/mongoSettings.xml | |
| # File-based project format | |
| *.iws | |
| # IntelliJ | |
| out/ | |
| # mpeltonen/sbt-idea plugin | |
| .idea_modules/ | |
| # JIRA plugin | |
| atlassian-ide-plugin.xml | |
| # Cursive Clojure plugin | |
| .idea/replstate.xml | |
| # SonarLint plugin | |
| .idea/sonarlint/ | |
| # Crashlytics plugin (for Android Studio and IntelliJ) | |
| com_crashlytics_export_strings.xml | |
| crashlytics.properties | |
| crashlytics-build.properties | |
| fabric.properties | |
| # Editor-based Rest Client | |
| .idea/httpRequests | |
| # Android studio 3.1+ serialized cache file | |
| .idea/caches/build_file_checksums.ser | |
| ### PyCharm+all Patch ### | |
| # Ignore everything but code style settings and run configurations | |
| # that are supposed to be shared within teams. | |
| .idea/* | |
| !.idea/codeStyles | |
| !.idea/runConfigurations | |
| ### Python ### | |
| # Byte-compiled / optimized / DLL files | |
| __pycache__/ | |
| *.py[cod] | |
| *$py.class | |
| # C extensions | |
| *.so | |
| # Distribution / packaging | |
| .Python | |
| build/ | |
| develop-eggs/ | |
| dist/ | |
| downloads/ | |
| eggs/ | |
| .eggs/ | |
| lib/ | |
| lib64/ | |
| parts/ | |
| sdist/ | |
| var/ | |
| wheels/ | |
| share/python-wheels/ | |
| *.egg-info/ | |
| .installed.cfg | |
| *.egg | |
| MANIFEST | |
| # PyInstaller | |
| # Usually these files are written by a python script from a template | |
| # before PyInstaller builds the exe, so as to inject date/other infos into it. | |
| *.manifest | |
| *.spec | |
| # Installer logs | |
| pip-log.txt | |
| pip-delete-this-directory.txt | |
| # Unit test / coverage reports | |
| htmlcov/ | |
| .tox/ | |
| .nox/ | |
| .coverage | |
| .coverage.* | |
| .cache | |
| nosetests.xml | |
| coverage.xml | |
| *.cover | |
| *.py,cover | |
| .hypothesis/ | |
| .pytest_cache/ | |
| cover/ | |
| # Translations | |
| *.mo | |
| *.pot | |
| # Django stuff: | |
| *.log | |
| local_settings.py | |
| db.sqlite3 | |
| db.sqlite3-journal | |
| # Flask stuff: | |
| instance/ | |
| .webassets-cache | |
| # Scrapy stuff: | |
| .scrapy | |
| # Sphinx documentation | |
| docs/_build/ | |
| # PyBuilder | |
| .pybuilder/ | |
| target/ | |
| # Jupyter Notebook | |
| # IPython | |
| # pyenv | |
| # For a library or package, you might want to ignore these files since the code is | |
| # intended to run in multiple environments; otherwise, check them in: | |
| # .python-version | |
| # pipenv | |
| # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. | |
| # However, in case of collaboration, if having platform-specific dependencies or dependencies | |
| # having no cross-platform support, pipenv may install dependencies that don't work, or not | |
| # install all needed dependencies. | |
| #Pipfile.lock | |
| # poetry | |
| # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. | |
| # This is especially recommended for binary packages to ensure reproducibility, and is more | |
| # commonly ignored for libraries. | |
| # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control | |
| #poetry.lock | |
| # pdm | |
| # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. | |
| #pdm.lock | |
| # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it | |
| # in version control. | |
| # https://pdm.fming.dev/#use-with-ide | |
| .pdm.toml | |
| # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm | |
| __pypackages__/ | |
| # Celery stuff | |
| celerybeat-schedule | |
| celerybeat.pid | |
| # SageMath parsed files | |
| *.sage.py | |
| # Environments | |
| .env | |
| .venv | |
| env/ | |
| venv/ | |
| ENV/ | |
| env.bak/ | |
| venv.bak/ | |
| # Spyder project settings | |
| .spyderproject | |
| .spyproject | |
| # Rope project settings | |
| .ropeproject | |
| # mkdocs documentation | |
| /site | |
| # mypy | |
| .mypy_cache/ | |
| .dmypy.json | |
| dmypy.json | |
| # Pyre type checker | |
| .pyre/ | |
| # pytype static type analyzer | |
| .pytype/ | |
| # Cython debug symbols | |
| cython_debug/ | |
| # PyCharm | |
| # JetBrains specific template is maintained in a separate JetBrains.gitignore that can | |
| # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore | |
| # and can be added to the global gitignore or merged into this file. For a more nuclear | |
| # option (not recommended) you can uncomment the following to ignore the entire idea folder. | |
| #.idea/ | |
| ### Python Patch ### | |
| # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration | |
| poetry.toml | |
| # ruff | |
| .ruff_cache/ | |
| # LSP config files | |
| pyrightconfig.json | |
| ### VisualStudioCode ### | |
| .vscode/* | |
| !.vscode/settings.json | |
| !.vscode/tasks.json | |
| !.vscode/launch.json | |
| !.vscode/extensions.json | |
| !.vscode/*.code-snippets | |
| # Local History for Visual Studio Code | |
| .history/ | |
| # Built Visual Studio Code Extensions | |
| *.vsix | |
| ### VisualStudioCode Patch ### | |
| # Ignore all local history of files | |
| .history | |
| .ionide | |
| # End of https://www.toptal.com/developers/gitignore/api/jupyternotebooks,visualstudiocode,python,pycharm+all |
| { | |
| "cells": [ | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "e40569e7", | |
| "metadata": {}, | |
| "source": [ | |
| "# Pyspark入門\n", | |
| "\n", | |
| "pandasなどは小規模~中規模くらいのデータセットの処理や分析には有効だが、\n", | |
| "メモリに乗り切らないレベルのデータ量は処理できない。\n", | |
| "また、GILによって基本的に1スレッドでの動作になるため、マシンスペックを最大限活かし切るのは難しい。\n", | |
| "\n", | |
| "遅延評価、並列分散処理によって所謂ビッグデータといわれるレベルのテーブルデータの処理・分析に使うことができ、\n", | |
| "更にpandasとの連携・併用ができるツールとして[pyspark](https://www.databricks.com/jp/glossary/pyspark)が存在するため紹介する。\n", | |
| "\n", | |
| "なお、詳細は[PySpark Documentation](https://spark.apache.org/docs/latest/api/python/)なども参照のこと\n" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "c612301e", | |
| "metadata": {}, | |
| "source": [ | |
| "## 環境の作成\n", | |
| "\n", | |
| "Mambaforgeをインストール済みのLinux環境を前提とする。\n", | |
| "\n", | |
| "- conda 23.1.0\n", | |
| "- mamba 1.4.1\n", | |
| "\n", | |
| "以下のようなconda仮想環境作成用のyamlを用意する:\n", | |
| "\n", | |
| "```yaml:pyspark_env.yaml\n", | |
| "# pyspark_env.yaml\n", | |
| "name: pyspark-intro\n", | |
| "channels:\n", | |
| " - conda-forge\n", | |
| "dependencies:\n", | |
| " - python=3.11\n", | |
| " - pyspark>=3.4\n", | |
| " - openjdk=17\n", | |
| " - pandas>=2.0.1\n", | |
| " - pyarrow\n", | |
| " # for testdata\n", | |
| " - numpy\n", | |
| " - seaborn\n", | |
| " # jupyter\n", | |
| " - jupyterlab\n", | |
| "```\n", | |
| "\n", | |
| "※~~先日[pandasの2.0がリリースされた](https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html)が、\n", | |
| "deprecatedになっていた[`iteritems`](https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#removal-of-prior-version-deprecations-changes)が削除された影響で、\n", | |
| "pandasからpysparkのDataFrameに変換する部分が一部動作しなくなる現象が起こるため、当面は1.5系を使う。~~\n", | |
| "`pyspark>=3.4.0`で修正済みなので、pandasも2系を使う\n", | |
| "\n", | |
| "\n", | |
| "mambaコマンドで仮想環境を作成(condaだと依存関係の解決が遅いのでmambaの使用を推奨)\n", | |
| "\n", | |
| "```sh\n", | |
| "mamba env create -f pyspark_env.yaml\n", | |
| "\n", | |
| "# 出来上がった環境(pyspark-intro)をactivate\n", | |
| "conda activate pyspark-intro\n", | |
| "# or `mamba activate pyspark-intro`\n", | |
| "```" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "4290e154", | |
| "metadata": {}, | |
| "source": [ | |
| "## 実行\n", | |
| "\n", | |
| "- spark sessionの起動\n", | |
| "- データの読み込みと保存\n", | |
| "- pandasとの連携\n", | |
| "\n", | |
| "あたりを簡単に確認する。\n", | |
| "\n", | |
| "データ処理周りの詳細は省略するが、Sparkのデータフレーム機能が[Spark SQL](https://www.databricks.com/jp/glossary/what-is-spark-sql)と対応していることから、\n", | |
| "概ねSQLで出来ることようなことを実行出来ると考えればOK。(というかSQLでも書ける)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "17452c65", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# テストデータ格納ディレクトリのセットアップ(Linuxコマンドを呼び出して実行している)\n", | |
| "\n", | |
| "!rm -rf testdata && mkdir testdata" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "04ef6a3d", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "your 131072x1 screen size is bogus. expect trouble\n", | |
| "23/04/25 00:12:59 WARN Utils: Your hostname, XXXXXXX-XXXXXXX resolves to a loopback address: 127.0.1.1; using 172.24.210.1 instead (on interface eth0)\n", | |
| "23/04/25 00:12:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", | |
| "Setting default log level to \"WARN\".\n", | |
| "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", | |
| "23/04/25 00:12:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "\n", | |
| " <div>\n", | |
| " <p><b>SparkSession - in-memory</b></p>\n", | |
| " \n", | |
| " <div>\n", | |
| " <p><b>SparkContext</b></p>\n", | |
| "\n", | |
| " <p><a href=\"http://172.24.210.1:4040\">Spark UI</a></p>\n", | |
| "\n", | |
| " <dl>\n", | |
| " <dt>Version</dt>\n", | |
| " <dd><code>v3.4.0</code></dd>\n", | |
| " <dt>Master</dt>\n", | |
| " <dd><code>local[*]</code></dd>\n", | |
| " <dt>AppName</dt>\n", | |
| " <dd><code>LocalTest</code></dd>\n", | |
| " </dl>\n", | |
| " </div>\n", | |
| " \n", | |
| " </div>\n", | |
| " " | |
| ], | |
| "text/plain": [ | |
| "<pyspark.sql.session.SparkSession at 0x7ff1d5736cd0>" | |
| ] | |
| }, | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "from pyspark.sql.session import SparkSession\n", | |
| "\n", | |
| "# spark sessionの立ち上げ\n", | |
| "spark = (SparkSession.builder\n", | |
| " .master(\"local[*]\")\n", | |
| " .appName(\"LocalTest\")\n", | |
| " .config(\"spark.sql.execution.arrow.pyspark.enabled\", \"true\")\n", | |
| " .config(\"spark.executor.memory\", \"4g\") # TMP (実際はハードコートはあまり良くない)\n", | |
| " .getOrCreate()\n", | |
| ")\n", | |
| "spark" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "71bf72b6", | |
| "metadata": {}, | |
| "source": [ | |
| "↑`config`部分について、pandasとの連携のため、configを調整しておく(pyarrowをインストールしておく必要あり)\n", | |
| "\n", | |
| "参考: https://learn.microsoft.com/ja-jp/azure/databricks/pandas/pyspark-pandas-conversion\n", | |
| "\n", | |
| "なお、詳細かつ環境依存性のあるconfigは別途configファイル(`$SPARK_HOME/conf/spark-defaults.conf`)を作成するなどして設定すると良い\n", | |
| "\n", | |
| "コンフィグ設定の詳細: https://spark.apache.org/docs/latest/configuration.html\n", | |
| "\n", | |
| "また、コンフィグファイルの置き場所の設定などに必要な環境変数`SPARK_HOME`について、condaでインストールした場合の設定方法などは[ここ](https://qiita.com/junkor-1011/items/7ec9bfaaf76568ce4a05#spark_home%E3%81%AE%E8%A8%AD%E5%AE%9A)などを参照" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "d6ac5ce7", | |
| "metadata": {}, | |
| "source": [ | |
| "Spark Sessionを起動すると、デフォルトでは[http://localhost:4040](http://localhost:4040)に[Spark UI](http://localhost:4040)が立ち上がり、\n", | |
| "実行状態などをモニタリングすることが出来る(デバッグなどにも便利)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "733b9529", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>sepal_length</th>\n", | |
| " <th>sepal_width</th>\n", | |
| " <th>petal_length</th>\n", | |
| " <th>petal_width</th>\n", | |
| " <th>species</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>5.1</td>\n", | |
| " <td>3.5</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>4.9</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>4.7</td>\n", | |
| " <td>3.2</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>4.6</td>\n", | |
| " <td>3.1</td>\n", | |
| " <td>1.5</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.6</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>145</th>\n", | |
| " <td>6.7</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>5.2</td>\n", | |
| " <td>2.3</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>146</th>\n", | |
| " <td>6.3</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>5.0</td>\n", | |
| " <td>1.9</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>147</th>\n", | |
| " <td>6.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>5.2</td>\n", | |
| " <td>2.0</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>148</th>\n", | |
| " <td>6.2</td>\n", | |
| " <td>3.4</td>\n", | |
| " <td>5.4</td>\n", | |
| " <td>2.3</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>149</th>\n", | |
| " <td>5.9</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>5.1</td>\n", | |
| " <td>1.8</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>150 rows × 5 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " sepal_length sepal_width petal_length petal_width species\n", | |
| "0 5.1 3.5 1.4 0.2 setosa\n", | |
| "1 4.9 3.0 1.4 0.2 setosa\n", | |
| "2 4.7 3.2 1.3 0.2 setosa\n", | |
| "3 4.6 3.1 1.5 0.2 setosa\n", | |
| "4 5.0 3.6 1.4 0.2 setosa\n", | |
| ".. ... ... ... ... ...\n", | |
| "145 6.7 3.0 5.2 2.3 virginica\n", | |
| "146 6.3 2.5 5.0 1.9 virginica\n", | |
| "147 6.5 3.0 5.2 2.0 virginica\n", | |
| "148 6.2 3.4 5.4 2.3 virginica\n", | |
| "149 5.9 3.0 5.1 1.8 virginica\n", | |
| "\n", | |
| "[150 rows x 5 columns]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "# テストデータの作成\n", | |
| "import seaborn as sns\n", | |
| "\n", | |
| "iris = sns.load_dataset(\"iris\")\n", | |
| "display(iris)\n", | |
| "\n", | |
| "iris.to_csv(\"testdata/iris.csv\", index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "60b743f0", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "# pysparkでcsvを読み込み\n", | |
| "\n", | |
| "df_iris = spark.read.csv(\n", | |
| " \"testdata/iris.csv\",\n", | |
| " header=True,\n", | |
| " inferSchema=True,\n", | |
| ")\n", | |
| "\n", | |
| "# 遅延評価するので、カラムのスキーマのみ表示される\n", | |
| "display(df_iris)" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "ebf1496e", | |
| "metadata": {}, | |
| "source": [ | |
| "(↑csvから読み込んでいるため、ヘッダーの有無の指定やスキーマの自動推定をオプションで与えている)\n", | |
| "\n", | |
| "\n", | |
| "※(補足):\n", | |
| "\n", | |
| "- 遅延評価を行う関係で、オンメモリに展開するpandasなどと異なりSparkの処理のパフォーマンスはデータソースの形式の影響をもろに受ける\n", | |
| " - データ形式(csv/json/parquet/orc/RDBテーブル/...etc)や、圧縮形式(gzip/xz/snappy/zlib/...etc)、カラムに対するパーティションなど\n", | |
| "- データ分析・データ処理用途であれば行指向形式であるcsvを読み込んで処理を実行するのは一般に遅く、列指向形式のparquetやorcなどの方が有利" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "e89db683", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "|sepal_length|sepal_width|petal_length|petal_width|species|\n", | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "| 5.1| 3.5| 1.4| 0.2| setosa|\n", | |
| "| 4.9| 3.0| 1.4| 0.2| setosa|\n", | |
| "| 4.7| 3.2| 1.3| 0.2| setosa|\n", | |
| "| 4.6| 3.1| 1.5| 0.2| setosa|\n", | |
| "| 5.0| 3.6| 1.4| 0.2| setosa|\n", | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "only showing top 5 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# showなどによって初めて評価が行われる\n", | |
| "df_iris.show(5) # 5件を表示" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "3d4c8b6e", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+\n", | |
| "|sepal_length|sepal_width|\n", | |
| "+------------+-----------+\n", | |
| "| 5.1| 3.5|\n", | |
| "| 4.9| 3.0|\n", | |
| "| 4.7| 3.2|\n", | |
| "+------------+-----------+\n", | |
| "only showing top 3 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# selectによるカラムの選択\n", | |
| "(df_iris\n", | |
| " .select([\"sepal_length\", \"sepal_width\"])\n", | |
| " .show(3)\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "0f5df129", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+------------+-----------+---------+\n", | |
| "|sepal_length|sepal_width|petal_length|petal_width| species|\n", | |
| "+------------+-----------+------------+-----------+---------+\n", | |
| "| 6.3| 3.3| 6.0| 2.5|virginica|\n", | |
| "| 5.8| 2.7| 5.1| 1.9|virginica|\n", | |
| "| 7.1| 3.0| 5.9| 2.1|virginica|\n", | |
| "+------------+-----------+------------+-----------+---------+\n", | |
| "only showing top 3 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# filterによる簡単なクエリ\n", | |
| "(df_iris\n", | |
| " .filter(df_iris.species == \"virginica\")\n", | |
| " .filter(df_iris.sepal_length > 5.0)\n", | |
| " .show(3)\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "77abd749", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+----------+-----+\n", | |
| "| species|count|\n", | |
| "+----------+-----+\n", | |
| "| virginica| 50|\n", | |
| "|versicolor| 50|\n", | |
| "| setosa| 50|\n", | |
| "+----------+-----+\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# groupByによる集約(1)\n", | |
| "(df_iris\n", | |
| " .groupBy('species')\n", | |
| " .count()\n", | |
| " .show() \n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "a998da4d", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+----------+------------------+------------------+-----------------+------------------+\n", | |
| "| species| avg(sepal_width)| avg(petal_width)|avg(sepal_length)| avg(petal_length)|\n", | |
| "+----------+------------------+------------------+-----------------+------------------+\n", | |
| "| virginica|2.9739999999999998| 2.026|6.587999999999998| 5.552|\n", | |
| "|versicolor|2.7700000000000005|1.3259999999999998| 5.936| 4.26|\n", | |
| "| setosa| 3.428000000000001|0.2459999999999999|5.005999999999999|1.4620000000000002|\n", | |
| "+----------+------------------+------------------+-----------------+------------------+\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# groupByによる集約(2)\n", | |
| "\n", | |
| "(df_iris\n", | |
| " .groupBy('species')\n", | |
| " .agg({\n", | |
| " 'sepal_length': 'mean',\n", | |
| " 'sepal_width': 'mean',\n", | |
| " 'petal_length': 'mean',\n", | |
| " 'petal_width': 'mean',\n", | |
| " })\n", | |
| " .show()\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "30e590f4", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "23/04/25 00:13:09 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+-------+------------------+-------------------+------------------+------------------+---------+\n", | |
| "|summary| sepal_length| sepal_width| petal_length| petal_width| species|\n", | |
| "+-------+------------------+-------------------+------------------+------------------+---------+\n", | |
| "| count| 150| 150| 150| 150| 150|\n", | |
| "| mean| 5.843333333333335| 3.057333333333334|3.7580000000000027| 1.199333333333334| null|\n", | |
| "| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467| null|\n", | |
| "| min| 4.3| 2.0| 1.0| 0.1| setosa|\n", | |
| "| max| 7.9| 4.4| 6.9| 2.5|virginica|\n", | |
| "+-------+------------------+-------------------+------------------+------------------+---------+\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# DataFrameのサマリー\n", | |
| "df_iris.describe().show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "b086ea18", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+---------+---------+-----------+\n", | |
| "|namespace|tableName|isTemporary|\n", | |
| "+---------+---------+-----------+\n", | |
| "| | iris| true|\n", | |
| "+---------+---------+-----------+\n", | |
| "\n", | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "|sepal_length|sepal_width|petal_length|petal_width|species|\n", | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "| 5.1| 3.5| 1.4| 0.2| setosa|\n", | |
| "| 4.9| 3.0| 1.4| 0.2| setosa|\n", | |
| "| 4.7| 3.2| 1.3| 0.2| setosa|\n", | |
| "| 4.6| 3.1| 1.5| 0.2| setosa|\n", | |
| "| 5.0| 3.6| 1.4| 0.2| setosa|\n", | |
| "| 5.4| 3.9| 1.7| 0.4| setosa|\n", | |
| "| 4.6| 3.4| 1.4| 0.3| setosa|\n", | |
| "| 5.0| 3.4| 1.5| 0.2| setosa|\n", | |
| "| 4.4| 2.9| 1.4| 0.2| setosa|\n", | |
| "| 4.9| 3.1| 1.5| 0.1| setosa|\n", | |
| "| 5.4| 3.7| 1.5| 0.2| setosa|\n", | |
| "| 4.8| 3.4| 1.6| 0.2| setosa|\n", | |
| "| 4.8| 3.0| 1.4| 0.1| setosa|\n", | |
| "| 4.3| 3.0| 1.1| 0.1| setosa|\n", | |
| "| 5.8| 4.0| 1.2| 0.2| setosa|\n", | |
| "| 5.7| 4.4| 1.5| 0.4| setosa|\n", | |
| "| 5.4| 3.9| 1.3| 0.4| setosa|\n", | |
| "| 5.1| 3.5| 1.4| 0.3| setosa|\n", | |
| "| 5.7| 3.8| 1.7| 0.3| setosa|\n", | |
| "| 5.1| 3.8| 1.5| 0.3| setosa|\n", | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "only showing top 20 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# SQLでクエリを書くことも可能\n", | |
| "df_iris.createOrReplaceTempView('iris')\n", | |
| "\n", | |
| "spark.sql(\"show tables\").show()\n", | |
| "\n", | |
| "tmp = spark.sql(\"SELECT * FROM iris WHERE species = 'setosa'\")\n", | |
| "tmp.show()" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "37f513cd", | |
| "metadata": {}, | |
| "source": [ | |
| "詳細は[User Guide](https://spark.apache.org/docs/3.3.2/api/python/user_guide/index.html), [API Reference](https://spark.apache.org/docs/3.3.2/api/python/reference/index.html),\n", | |
| "および[Spark SQLのリファレンス](https://spark.apache.org/docs/3.3.2/api/sql/index.html)を参照" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "2ffc3fa2", | |
| "metadata": {}, | |
| "source": [ | |
| "次に、データの[Read/Write](https://spark.apache.org/docs/latest/sql-data-sources.html)を確認する" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "e2ee6092", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "pandas.core.frame.DataFrame" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "|sepal_length|sepal_width|petal_length|petal_width|species|\n", | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "| 5.1| 3.5| 1.4| 0.2| setosa|\n", | |
| "| 4.9| 3.0| 1.4| 0.2| setosa|\n", | |
| "| 4.7| 3.2| 1.3| 0.2| setosa|\n", | |
| "| 4.6| 3.1| 1.5| 0.2| setosa|\n", | |
| "| 5.0| 3.6| 1.4| 0.2| setosa|\n", | |
| "| 5.4| 3.9| 1.7| 0.4| setosa|\n", | |
| "| 4.6| 3.4| 1.4| 0.3| setosa|\n", | |
| "| 5.0| 3.4| 1.5| 0.2| setosa|\n", | |
| "| 4.4| 2.9| 1.4| 0.2| setosa|\n", | |
| "| 4.9| 3.1| 1.5| 0.1| setosa|\n", | |
| "| 5.4| 3.7| 1.5| 0.2| setosa|\n", | |
| "| 4.8| 3.4| 1.6| 0.2| setosa|\n", | |
| "| 4.8| 3.0| 1.4| 0.1| setosa|\n", | |
| "| 4.3| 3.0| 1.1| 0.1| setosa|\n", | |
| "| 5.8| 4.0| 1.2| 0.2| setosa|\n", | |
| "| 5.7| 4.4| 1.5| 0.4| setosa|\n", | |
| "| 5.4| 3.9| 1.3| 0.4| setosa|\n", | |
| "| 5.1| 3.5| 1.4| 0.3| setosa|\n", | |
| "| 5.7| 3.8| 1.7| 0.3| setosa|\n", | |
| "| 5.1| 3.8| 1.5| 0.3| setosa|\n", | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "only showing top 20 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# さっきは便宜上csvから読み込んだが、型情報が失われる + pandasとの連携を見るため、pandasのデータフレームから直接pysparkに変換する\n", | |
| "\n", | |
| "display(iris.__class__) # irisはpandas dataframe\n", | |
| "df = spark.createDataFrame(iris)\n", | |
| "\n", | |
| "display(df)\n", | |
| "\n", | |
| "df.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "id": "0dc198ac", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "pyspark.sql.dataframe.DataFrame" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>sepal_length</th>\n", | |
| " <th>sepal_width</th>\n", | |
| " <th>petal_length</th>\n", | |
| " <th>petal_width</th>\n", | |
| " <th>species</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>5.1</td>\n", | |
| " <td>3.5</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>4.9</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>4.7</td>\n", | |
| " <td>3.2</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>4.6</td>\n", | |
| " <td>3.1</td>\n", | |
| " <td>1.5</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.6</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>145</th>\n", | |
| " <td>6.7</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>5.2</td>\n", | |
| " <td>2.3</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>146</th>\n", | |
| " <td>6.3</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>5.0</td>\n", | |
| " <td>1.9</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>147</th>\n", | |
| " <td>6.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>5.2</td>\n", | |
| " <td>2.0</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>148</th>\n", | |
| " <td>6.2</td>\n", | |
| " <td>3.4</td>\n", | |
| " <td>5.4</td>\n", | |
| " <td>2.3</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>149</th>\n", | |
| " <td>5.9</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>5.1</td>\n", | |
| " <td>1.8</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>150 rows × 5 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " sepal_length sepal_width petal_length petal_width species\n", | |
| "0 5.1 3.5 1.4 0.2 setosa\n", | |
| "1 4.9 3.0 1.4 0.2 setosa\n", | |
| "2 4.7 3.2 1.3 0.2 setosa\n", | |
| "3 4.6 3.1 1.5 0.2 setosa\n", | |
| "4 5.0 3.6 1.4 0.2 setosa\n", | |
| ".. ... ... ... ... ...\n", | |
| "145 6.7 3.0 5.2 2.3 virginica\n", | |
| "146 6.3 2.5 5.0 1.9 virginica\n", | |
| "147 6.5 3.0 5.2 2.0 virginica\n", | |
| "148 6.2 3.4 5.4 2.3 virginica\n", | |
| "149 5.9 3.0 5.1 1.8 virginica\n", | |
| "\n", | |
| "[150 rows x 5 columns]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "# spark dataframeからpandas dataframeへの変換\n", | |
| "\n", | |
| "display(df.__class__) # pyspark.sql.dataframe.DataFrame\n", | |
| "\n", | |
| "pdf = df.toPandas()\n", | |
| "display(pdf)" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "ee45272f", | |
| "metadata": {}, | |
| "source": [ | |
| "↑pysparkで大規模~中規模くらいのデータを処理してサイズダウンし、オンメモリに乗るくらいになったら`toPandas`をして扱いやすいpandasに変換する、という使い方が出来る。" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "9d02c4e4", | |
| "metadata": {}, | |
| "source": [ | |
| "最後に色々な形式でデータの書き込みと読み込みを行う\n", | |
| "\n", | |
| "先述の通り、sparkはデータ保存形式が処理パフォーマンスにダイレクトに効いてくるため、この辺りの取り扱いは肝になってくる。" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "id": "661fc7b0", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "_SUCCESS\n", | |
| "part-00000-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
| "part-00001-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
| "part-00002-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
| "part-00003-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
| "part-00004-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
| "part-00005-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
| "part-00006-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
| "part-00007-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# dfはspark dataframe\n", | |
| "\n", | |
| "# csv.gz\n", | |
| "df.write.save(\"testdata/iris_csv\", format=\"csv\")\n", | |
| "\n", | |
| "# formatオプションを使う代わりに、↓のように書いてもOK\n", | |
| "# df.write.csv(\"testdata/iris_csv\", compression=\"gzip\")\n", | |
| "\n", | |
| "!ls testdata/iris_csv" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "id": "a0508de1", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "5.1,3.5,1.4,0.2,setosa\n", | |
| "4.9,3.0,1.4,0.2,setosa\n", | |
| "4.7,3.2,1.3,0.2,setosa\n", | |
| "4.6,3.1,1.5,0.2,setosa\n", | |
| "5.0,3.6,1.4,0.2,setosa\n", | |
| "5.4,3.9,1.7,0.4,setosa\n", | |
| "4.6,3.4,1.4,0.3,setosa\n", | |
| "5.0,3.4,1.5,0.2,setosa\n", | |
| "4.4,2.9,1.4,0.2,setosa\n", | |
| "4.9,3.1,1.5,0.1,setosa\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!head testdata/iris_csv/part-00000-*.csv" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "5a2cbc62", | |
| "metadata": {}, | |
| "source": [ | |
| "↑pandasのように単一のファイルを指定する感じではなく、保存するディレクトリを指定する。\n", | |
| "(ファイルが分散して生成される)\n", | |
| "\n", | |
| "読み込む際は個々のファイルを指定することもできる(←Spark以外で作ったファイルの読み込みなど)が、\n", | |
| "**基本は保存したディレクトリを指定して読み込む**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "id": "5953ef8e", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+---+---+---+---+----------+\n", | |
| "|_c0|_c1|_c2|_c3| _c4|\n", | |
| "+---+---+---+---+----------+\n", | |
| "|6.3|3.3|4.7|1.6|versicolor|\n", | |
| "|4.9|2.4|3.3|1.0|versicolor|\n", | |
| "|6.6|2.9|4.6|1.3|versicolor|\n", | |
| "+---+---+---+---+----------+\n", | |
| "only showing top 3 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "df_csv = spark.read.load(\"testdata/iris_csv\", format=\"csv\")\n", | |
| "\n", | |
| "# formatを指定せず、↓のようにすることも可能\n", | |
| "# df_csv = spark.read.csv(\"testdata/iris_csv\")\n", | |
| "\n", | |
| "df_csv.show(3)" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "f7c66c47", | |
| "metadata": {}, | |
| "source": [ | |
| "(↑csvだとカラム名などの情報が欠落しがちで使いづらい)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "id": "65039664", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "_SUCCESS\n", | |
| "part-00000-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
| "part-00001-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
| "part-00002-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
| "part-00003-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
| "part-00004-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
| "part-00005-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
| "part-00006-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
| "part-00007-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# テキスト形式だとcsvの他にjsonなども指定可能\n", | |
| "# compressionで圧縮形式を指定することも出来る\n", | |
| "\n", | |
| "df.write.json(\"testdata/iris_json\", compression='gzip')\n", | |
| "\n", | |
| "!ls testdata/iris_json" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "id": "6245160d", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "DataFrame[petal_length: double, petal_width: double, sepal_length: double, sepal_width: double, species: string]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "|petal_length|petal_width|sepal_length|sepal_width| species|\n", | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "| 3.3| 1.0| 5.0| 2.3|versicolor|\n", | |
| "| 4.2| 1.3| 5.6| 2.7|versicolor|\n", | |
| "| 4.2| 1.2| 5.7| 3.0|versicolor|\n", | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "only showing top 3 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# 読み込みも同様\n", | |
| "\n", | |
| "df_json = spark.read.json(\"testdata/iris_json\")\n", | |
| "# or df_json = spark.read.load(\"testdata/iris_json\", format=\"json\")\n", | |
| "\n", | |
| "display(df_json)\n", | |
| "df_json.show(3)" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "04a92179", | |
| "metadata": {}, | |
| "source": [ | |
| "(↑jsonだとcsvよりは型情報が保持される)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "id": "3151765f", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "23/04/25 00:13:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory\n", | |
| "Scaling row group sizes to 95.00% for 8 writers\n", | |
| " \r" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "_SUCCESS\n", | |
| "part-00000-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "part-00001-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "part-00002-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "part-00003-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "part-00004-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "part-00005-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "part-00006-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "part-00007-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "_SUCCESS\n", | |
| "part-00000-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
| "part-00001-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
| "part-00002-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
| "part-00003-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
| "part-00004-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
| "part-00005-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
| "part-00006-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
| "part-00007-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# データ分析・処理用途として適しているsnappy.parquetやzlib.orc形式など\n", | |
| "\n", | |
| "df.write.parquet(\"testdata/iris_parquet\", compression=\"snappy\")\n", | |
| "!ls testdata/iris_parquet\n", | |
| "\n", | |
| "df.write.orc(\"testdata/iris_orc\", compression=\"zlib\")\n", | |
| "!ls testdata/iris_orc" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "id": "4cb05940", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "parquet:\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "|sepal_length|sepal_width|petal_length|petal_width| species|\n", | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "| 5.0| 2.3| 3.3| 1.0|versicolor|\n", | |
| "| 5.6| 2.7| 4.2| 1.3|versicolor|\n", | |
| "| 5.7| 3.0| 4.2| 1.2|versicolor|\n", | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "only showing top 3 rows\n", | |
| "\n", | |
| "\n", | |
| "------------------\n", | |
| "\n", | |
| "orc:\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "|sepal_length|sepal_width|petal_length|petal_width| species|\n", | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "| 5.0| 2.3| 3.3| 1.0|versicolor|\n", | |
| "| 5.6| 2.7| 4.2| 1.3|versicolor|\n", | |
| "| 5.7| 3.0| 4.2| 1.2|versicolor|\n", | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "only showing top 3 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# 読み込み\n", | |
| "\n", | |
| "# parquet\n", | |
| "print(\"parquet:\")\n", | |
| "df_parquet = spark.read.parquet(\"testdata/iris_parquet\")\n", | |
| "display(df_parquet)\n", | |
| "df_parquet.show(3)\n", | |
| "\n", | |
| "print(\"\\n------------------\\n\")\n", | |
| "\n", | |
| "# orc\n", | |
| "print(\"orc:\")\n", | |
| "df_orc = spark.read.orc(\"testdata/iris_orc\")\n", | |
| "display(df_orc)\n", | |
| "df_orc.show(3)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "id": "e2c0ff8b", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "\u001b[00;38;5;33m.\u001b[0m\n", | |
| "├── _SUCCESS\n", | |
| "├── \u001b[00;38;5;33mspecies=setosa\u001b[0m\n", | |
| "│ ├── part-00000-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "│ ├── part-00001-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "│ └── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "├── \u001b[00;38;5;33mspecies=versicolor\u001b[0m\n", | |
| "│ ├── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "│ ├── part-00003-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "│ ├── part-00004-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "│ └── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "└── \u001b[00;38;5;33mspecies=virginica\u001b[0m\n", | |
| " ├── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| " ├── part-00006-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| " └── part-00007-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "\n", | |
| "4 directories, 11 files\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# partitionを切って保存することも可能\n", | |
| "\n", | |
| "df.write.save(\n", | |
| " \"testdata/iris_with_partition\",\n", | |
| " format=\"parquet\",\n", | |
| " compression=\"snappy\",\n", | |
| " partitionBy=\"species\"\n", | |
| " )\n", | |
| "\n", | |
| "!cd testdata/iris_with_partition && tree" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "51fcefd8", | |
| "metadata": {}, | |
| "source": [ | |
| "カラム\"species\"の値ごとに別ディレクトリが切られてデータが保存される\n", | |
| "\n", | |
| "RDBのインデックスと同様、データアクセスの仕方に応じて適切に設定することでパフォーマンスを良くすることが出来る(し、不適切だと悪化する)\n", | |
| "\n", | |
| "↑の例だと、speciesを指定して分析を行うような場合、興味のあるspeciesしか見ないためアクセスするデータ量を限定することができる。\n", | |
| "一方、かけるクエリの多くが複数のspeciesにまたがるような場合はパフォーマンスの向上は見込めず、悪化する可能性もある。" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "id": "4a458cb4", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+------------+-----------+---------+\n", | |
| "|sepal_length|sepal_width|petal_length|petal_width| species|\n", | |
| "+------------+-----------+------------+-----------+---------+\n", | |
| "| 7.9| 3.8| 6.4| 2.0|virginica|\n", | |
| "| 6.4| 2.8| 5.6| 2.2|virginica|\n", | |
| "| 6.3| 2.8| 5.1| 1.5|virginica|\n", | |
| "+------------+-----------+------------+-----------+---------+\n", | |
| "only showing top 3 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# パーティションがある場合でも読み込み方法は同様\n", | |
| "\n", | |
| "df_partition = spark.read.load(\"testdata/iris_with_partition\")\n", | |
| "\n", | |
| "display(df_partition)\n", | |
| "df_partition.show(3)" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "376cf4e6", | |
| "metadata": {}, | |
| "source": [ | |
| "他にも[saveAsTable](https://spark.apache.org/docs/3.2.0/api/python/reference/api/pyspark.sql.DataFrameWriter.saveAsTable.html)で永続化したテーブルとして書き込みを行ったりも出来る。\n", | |
| "\n", | |
| "扱えるデータ保存形式も[Data Sources](https://spark.apache.org/docs/latest/sql-data-sources.html)のように、\n", | |
| "他にもRDB(JDBCを使う)や、Hive Tableなど色々なものに対応している。" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "id": "77eb272b", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", | |
| " warnings.warn(\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import pyspark.pandas as ps" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "7990e97a", | |
| "metadata": {}, | |
| "source": [ | |
| "## (補足)pysparkにおけるpandas API\n", | |
| "\n", | |
| "参考: https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/index.html\n", | |
| "\n", | |
| "最近はかなりpandasに近いAPIを使ってsparkを扱う方法が整備されつつある様子\n", | |
| "\n", | |
| "なお、[Koalas](https://github.com/databricks/koalas)という、sparkをpandasっぽく使えるツールが存在するが、\n", | |
| "[ここ](https://learn.microsoft.com/ja-jp/azure/databricks/archive/legacy/koalas)などを見るとこれが正式にpysparkに取り込まれた?様子。" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "id": "27553982", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: If `index_col` is not specified for `read_csv`, the default index is attached which can cause additional overhead.\n", | |
| " warnings.warn(message, PandasAPIOnSparkAdviceWarning)\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "pyspark.pandas.frame.DataFrame" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>sepal_length</th>\n", | |
| " <th>sepal_width</th>\n", | |
| " <th>petal_length</th>\n", | |
| " <th>petal_width</th>\n", | |
| " <th>species</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>5.1</td>\n", | |
| " <td>3.5</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>4.9</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>4.7</td>\n", | |
| " <td>3.2</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>4.6</td>\n", | |
| " <td>3.1</td>\n", | |
| " <td>1.5</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.6</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " sepal_length sepal_width petal_length petal_width species\n", | |
| "0 5.1 3.5 1.4 0.2 setosa\n", | |
| "1 4.9 3.0 1.4 0.2 setosa\n", | |
| "2 4.7 3.2 1.3 0.2 setosa\n", | |
| "3 4.6 3.1 1.5 0.2 setosa\n", | |
| "4 5.0 3.6 1.4 0.2 setosa" | |
| ] | |
| }, | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "import pyspark.pandas as ps\n", | |
| "\n", | |
| "# pandas on spark DataFrameを作成する\n", | |
| "psdf = ps.read_csv(\"testdata/iris.csv\")\n", | |
| "\n", | |
| "display(psdf.__class__) # pyspark.pandas.frame.DataFrame\n", | |
| "\n", | |
| "psdf.head(5) # pandas DataFrameのようにheadで先頭を表示できる" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "id": "5dda2325", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>sepal_length</th>\n", | |
| " <th>sepal_width</th>\n", | |
| " <th>petal_length</th>\n", | |
| " <th>petal_width</th>\n", | |
| " <th>species</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>105</th>\n", | |
| " <td>7.6</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>6.6</td>\n", | |
| " <td>2.1</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>117</th>\n", | |
| " <td>7.7</td>\n", | |
| " <td>3.8</td>\n", | |
| " <td>6.7</td>\n", | |
| " <td>2.2</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>118</th>\n", | |
| " <td>7.7</td>\n", | |
| " <td>2.6</td>\n", | |
| " <td>6.9</td>\n", | |
| " <td>2.3</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>122</th>\n", | |
| " <td>7.7</td>\n", | |
| " <td>2.8</td>\n", | |
| " <td>6.7</td>\n", | |
| " <td>2.0</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>131</th>\n", | |
| " <td>7.9</td>\n", | |
| " <td>3.8</td>\n", | |
| " <td>6.4</td>\n", | |
| " <td>2.0</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>135</th>\n", | |
| " <td>7.7</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>6.1</td>\n", | |
| " <td>2.3</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " sepal_length sepal_width petal_length petal_width species\n", | |
| "105 7.6 3.0 6.6 2.1 virginica\n", | |
| "117 7.7 3.8 6.7 2.2 virginica\n", | |
| "118 7.7 2.6 6.9 2.3 virginica\n", | |
| "122 7.7 2.8 6.7 2.0 virginica\n", | |
| "131 7.9 3.8 6.4 2.0 virginica\n", | |
| "135 7.7 3.0 6.1 2.3 virginica" | |
| ] | |
| }, | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# filterや表示も通常のpandasのようにできている\n", | |
| "psdf[psdf['sepal_length'] > 7.5]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "id": "a33ed92b", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+-----+------------+-----------+------------+-----------+-------+\n", | |
| "|index|sepal_length|sepal_width|petal_length|petal_width|species|\n", | |
| "+-----+------------+-----------+------------+-----------+-------+\n", | |
| "| 0| 5.1| 3.5| 1.4| 0.2| setosa|\n", | |
| "| 1| 4.9| 3.0| 1.4| 0.2| setosa|\n", | |
| "| 2| 4.7| 3.2| 1.3| 0.2| setosa|\n", | |
| "| 3| 4.6| 3.1| 1.5| 0.2| setosa|\n", | |
| "| 4| 5.0| 3.6| 1.4| 0.2| setosa|\n", | |
| "+-----+------------+-----------+------------+-----------+-------+\n", | |
| "only showing top 5 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# pysparkのDataFrameに変換\n", | |
| "\n", | |
| "sdf = psdf.to_spark(index_col=[\"index\"])\n", | |
| "\n", | |
| "sdf.show(5)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "id": "69a33023", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: `to_pandas` loads all data into the driver's memory. It should only be used if the resulting pandas DataFrame is expected to be small.\n", | |
| " warnings.warn(message, PandasAPIOnSparkAdviceWarning)\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "pandas.core.frame.DataFrame" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>sepal_length</th>\n", | |
| " <th>sepal_width</th>\n", | |
| " <th>petal_length</th>\n", | |
| " <th>petal_width</th>\n", | |
| " <th>species</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>5.1</td>\n", | |
| " <td>3.5</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>4.9</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>4.7</td>\n", | |
| " <td>3.2</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>4.6</td>\n", | |
| " <td>3.1</td>\n", | |
| " <td>1.5</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.6</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " sepal_length sepal_width petal_length petal_width species\n", | |
| "0 5.1 3.5 1.4 0.2 setosa\n", | |
| "1 4.9 3.0 1.4 0.2 setosa\n", | |
| "2 4.7 3.2 1.3 0.2 setosa\n", | |
| "3 4.6 3.1 1.5 0.2 setosa\n", | |
| "4 5.0 3.6 1.4 0.2 setosa" | |
| ] | |
| }, | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# 普通のpandasに変換\n", | |
| "pdf = psdf.to_pandas()\n", | |
| "\n", | |
| "display(pdf.__class__) # pandas.core.frame.DataFrame\n", | |
| "\n", | |
| "pdf.head()" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "cfb09aa9", | |
| "metadata": {}, | |
| "source": [ | |
| "## (参考) [dask](https://www.dask.org/)\n", | |
| "\n", | |
| "pandasおよびnumpyと互換性および連携性の高いAPIを持ち、遅延評価・並列分散処理を行うことが出来るライブラリ。\n", | |
| "\n", | |
| "例えばデータサイズが大きいテーブルデータに対してはdaskのデータフレームで遅延評価・並列分散処理を行っておき、\n", | |
| "データサイズが小さくなるタイミングでpandasのデータフレームに変換する、といった今回pysparkでやろうとしていることとほぼ同じことが出来る。\n", | |
| "\n", | |
| "pysparkはdaskに比べると、\n", | |
| "\n", | |
| "- クラスターなどを組める分、より本格的なビッグデータにも対応出来る\n", | |
| " - (今回紹介するようなやり方で、ライトに使うことも可能)\n", | |
| "- Javaをバックエンドに動くことに加え、本格的に動かす場合はクラスター構築などが必要になるため、そのレベルでやると動作環境の構築のハードルは高い\n", | |
| "- pandas、numpyとの親和性はdaskの方が~~高い~~高かった\n", | |
| " - daskは遅延評価される以外は同じメソッドをそのまま使えるケースが多い\n", | |
| " - sparkはむしろSQLと互換性がある\n", | |
| " - ただ、先述の通りpandas APIが整備されつつあるので、pysparkもある程度pandasの使い方そのままで使えるようになっている模様" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.11.3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
pandas and similar tools work well for processing and analyzing small-to-medium datasets, but they cannot handle data that does not fit in memory. In addition, because of the GIL they basically run on a single thread, so it is hard to make full use of the machine's hardware.
PySpark uses lazy evaluation and parallel, distributed processing, so it can be used to process and analyze tabular data at so-called big-data scale, and it also interoperates with pandas; this article introduces it for that reason.
For details, also see the PySpark Documentation.
A Linux environment with Mambaforge already installed is assumed.
Prepare a YAML file like the following for creating the conda virtual environment:
# pyspark_env.yaml
name: pyspark-intro
channels:
- conda-forge
dependencies:
- python=3.11
- pyspark>=3.4
- openjdk=17
- pandas>=2.0.1
- pyarrow
# for testdata
- numpy
- seaborn
# jupyter
- jupyterlab

※ pandas 2.0 was released recently, but because the deprecated iteritems was removed, part of the conversion from pandas to a PySpark DataFrame stopped working (which temporarily meant staying on the 1.5 series). This has been fixed in pyspark>=3.4.0, so pandas 2.x is used here.
Create the virtual environment with the mamba command (conda's dependency resolution is slow, so mamba is recommended):
mamba env create -f pyspark_env.yaml
# activate the created environment (pyspark-intro)
conda activate pyspark-intro
# or `mamba activate pyspark-intro`

First, let's briefly go over basic usage.
Details of the data-processing side are omitted here, but Spark's DataFrame functionality corresponds to Spark SQL, so you can roughly assume that anything you can do in SQL can be done here (in fact, you can also just write SQL).
# set up the directory for the test data (calling Linux commands from the notebook)
!rm -rf testdata && mkdir testdata
from pyspark.sql.session import SparkSession
# start the Spark session
spark = (SparkSession.builder
.master("local[*]")
.appName("LocalTest")
.config("spark.sql.execution.arrow.pyspark.enabled", "true")
.config("spark.executor.memory", "4g") # TMP (実際はハードコートはあまり良くない)
.getOrCreate()
)
spark
your 131072x1 screen size is bogus. expect trouble
23/04/25 00:12:59 WARN Utils: Your hostname, XXXXXXX-XXXXXXX resolves to a loopback address: 127.0.1.1; using 172.24.210.1 instead (on interface eth0)
23/04/25 00:12:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/25 00:12:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
SparkSession - in-memory
SparkContext
Spark UI: http://172.24.210.1:4040
Version: v3.4.0
Master: local[*]
AppName: LocalTest
↑ Regarding the config part: the settings are tuned for interoperability with pandas (pyarrow needs to be installed).
Reference: https://learn.microsoft.com/ja-jp/azure/databricks/pandas/pyspark-pandas-conversion
Detailed, environment-dependent settings are better kept in a separate config file ($SPARK_HOME/conf/spark-defaults.conf) rather than in code.
Configuration details: https://spark.apache.org/docs/latest/configuration.html
As for the SPARK_HOME environment variable, which is needed for things like locating the config file, see here for how to set it when Spark was installed via conda.
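As a minimal sketch (an addition for illustration, not part of the original article), assuming that a conda/pip install keeps the Spark distribution inside the pyspark package directory, the path to point SPARK_HOME at can be located like this:

# Sketch: locate the Spark distribution bundled with the conda-installed pyspark.
# Assumption: the distribution lives inside the pyspark package directory.
import os
import pyspark

spark_home = os.path.dirname(pyspark.__file__)
print(spark_home)
# then e.g. export SPARK_HOME=<printed path> so that
# $SPARK_HOME/conf/spark-defaults.conf is picked up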
Once the Spark session is up, the Spark UI is available at http://localhost:4040 by default, where you can monitor execution status (also handy for debugging).
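As a small aside (not in the original notebook), the actual UI address can also be read off the running session, which helps when the default port 4040 is already in use:

# Print the Spark UI URL of the running session.
# If port 4040 is busy, Spark falls back to 4041, 4042, ...
print(spark.sparkContext.uiWebUrl)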
# create the test data
import seaborn as sns
iris = sns.load_dataset("iris")
display(iris)
iris.to_csv("testdata/iris.csv", index=False).dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
# read the csv with pyspark
df_iris = spark.read.csv(
"testdata/iris.csv",
header=True,
inferSchema=True,
)
# because of lazy evaluation, only the column schema is shown
display(df_iris)
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
(↑ Since we are reading from csv, options are passed to indicate whether a header exists and to infer the schema automatically.)
※ (Supplementary note):
# evaluation only actually happens with an action such as show
df_iris.show(5)  # show 5 rows
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
| 5.1| 3.5| 1.4| 0.2| setosa|
| 4.9| 3.0| 1.4| 0.2| setosa|
| 4.7| 3.2| 1.3| 0.2| setosa|
| 4.6| 3.1| 1.5| 0.2| setosa|
| 5.0| 3.6| 1.4| 0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows
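To make the lazy-evaluation point concrete, here is a small sketch (added for illustration, not an original cell) that builds a transformation without executing it and inspects the plan Spark would run:

# No data is read or filtered yet: this only builds a query plan.
lazy_query = df_iris.filter(df_iris.sepal_length > 7.5)

# explain() prints the physical plan; execution happens only on an
# action such as show(), count(), or collect().
lazy_query.explain()
lazy_query.count()  # this triggers the actual computation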
# choose columns with select
(df_iris
.select(["sepal_length", "sepal_width"])
.show(3)
)
+------------+-----------+
|sepal_length|sepal_width|
+------------+-----------+
| 5.1| 3.5|
| 4.9| 3.0|
| 4.7| 3.2|
+------------+-----------+
only showing top 3 rows
# simple queries with filter
(df_iris
.filter(df_iris.species == "virginica")
.filter(df_iris.sepal_length > 5.0)
.show(3)
)
+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+---------+
| 6.3| 3.3| 6.0| 2.5|virginica|
| 5.8| 2.7| 5.1| 1.9|virginica|
| 7.1| 3.0| 5.9| 2.1|virginica|
+------------+-----------+------------+-----------+---------+
only showing top 3 rows
# aggregation with groupBy (1)
(df_iris
.groupBy('species')
.count()
.show()
)
+----------+-----+
| species|count|
+----------+-----+
| virginica| 50|
|versicolor| 50|
| setosa| 50|
+----------+-----+
# aggregation with groupBy (2)
(df_iris
.groupBy('species')
.agg({
'sepal_length': 'mean',
'sepal_width': 'mean',
'petal_length': 'mean',
'petal_width': 'mean',
})
.show()
)
+----------+------------------+------------------+-----------------+------------------+
| species| avg(sepal_width)| avg(petal_width)|avg(sepal_length)| avg(petal_length)|
+----------+------------------+------------------+-----------------+------------------+
| virginica|2.9739999999999998| 2.026|6.587999999999998| 5.552|
|versicolor|2.7700000000000005|1.3259999999999998| 5.936| 4.26|
| setosa| 3.428000000000001|0.2459999999999999|5.005999999999999|1.4620000000000002|
+----------+------------------+------------------+-----------------+------------------+
# DataFrame summary
df_iris.describe().show()
23/04/25 00:13:09 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+-------+------------------+-------------------+------------------+------------------+---------+
|summary| sepal_length| sepal_width| petal_length| petal_width| species|
+-------+------------------+-------------------+------------------+------------------+---------+
| count| 150| 150| 150| 150| 150|
| mean| 5.843333333333335| 3.057333333333334|3.7580000000000027| 1.199333333333334| null|
| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467| null|
| min| 4.3| 2.0| 1.0| 0.1| setosa|
| max| 7.9| 4.4| 6.9| 2.5|virginica|
+-------+------------------+-------------------+------------------+------------------+---------+
# queries can also be written in SQL
df_iris.createOrReplaceTempView('iris')
spark.sql("show tables").show()
tmp = spark.sql("SELECT * FROM iris WHERE species = 'setosa'")
tmp.show()
+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
| | iris| true|
+---------+---------+-----------+
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
| 5.1| 3.5| 1.4| 0.2| setosa|
| 4.9| 3.0| 1.4| 0.2| setosa|
| 4.7| 3.2| 1.3| 0.2| setosa|
| 4.6| 3.1| 1.5| 0.2| setosa|
| 5.0| 3.6| 1.4| 0.2| setosa|
| 5.4| 3.9| 1.7| 0.4| setosa|
| 4.6| 3.4| 1.4| 0.3| setosa|
| 5.0| 3.4| 1.5| 0.2| setosa|
| 4.4| 2.9| 1.4| 0.2| setosa|
| 4.9| 3.1| 1.5| 0.1| setosa|
| 5.4| 3.7| 1.5| 0.2| setosa|
| 4.8| 3.4| 1.6| 0.2| setosa|
| 4.8| 3.0| 1.4| 0.1| setosa|
| 4.3| 3.0| 1.1| 0.1| setosa|
| 5.8| 4.0| 1.2| 0.2| setosa|
| 5.7| 4.4| 1.5| 0.4| setosa|
| 5.4| 3.9| 1.3| 0.4| setosa|
| 5.1| 3.5| 1.4| 0.3| setosa|
| 5.7| 3.8| 1.7| 0.3| setosa|
| 5.1| 3.8| 1.5| 0.3| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 20 rows
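Since the temp view is registered, aggregations like the earlier groupBy examples can equally be expressed in SQL; a small illustrative sketch (not an original cell):

# Equivalent of the groupBy/avg example, written against the temp view.
spark.sql("""
    SELECT species,
           AVG(sepal_length) AS avg_sepal_length,
           AVG(petal_length) AS avg_petal_length
    FROM iris
    GROUP BY species
""").show()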
For details, see the User Guide, API Reference, and the Spark SQL reference.
Next, let's look at reading and writing data.
# earlier we read from csv for convenience, but type information gets lost + to see the pandas interop, convert a pandas DataFrame directly to pyspark
display(iris.__class__)  # iris is a pandas dataframe
df = spark.createDataFrame(iris)
display(df)
df.show()
pandas.core.frame.DataFrame
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
| 5.1| 3.5| 1.4| 0.2| setosa|
| 4.9| 3.0| 1.4| 0.2| setosa|
| 4.7| 3.2| 1.3| 0.2| setosa|
| 4.6| 3.1| 1.5| 0.2| setosa|
| 5.0| 3.6| 1.4| 0.2| setosa|
| 5.4| 3.9| 1.7| 0.4| setosa|
| 4.6| 3.4| 1.4| 0.3| setosa|
| 5.0| 3.4| 1.5| 0.2| setosa|
| 4.4| 2.9| 1.4| 0.2| setosa|
| 4.9| 3.1| 1.5| 0.1| setosa|
| 5.4| 3.7| 1.5| 0.2| setosa|
| 4.8| 3.4| 1.6| 0.2| setosa|
| 4.8| 3.0| 1.4| 0.1| setosa|
| 4.3| 3.0| 1.1| 0.1| setosa|
| 5.8| 4.0| 1.2| 0.2| setosa|
| 5.7| 4.4| 1.5| 0.4| setosa|
| 5.4| 3.9| 1.3| 0.4| setosa|
| 5.1| 3.5| 1.4| 0.3| setosa|
| 5.7| 3.8| 1.7| 0.3| setosa|
| 5.1| 3.8| 1.5| 0.3| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 20 rows
# convert from a spark dataframe to a pandas dataframe
display(df.__class__) # pyspark.sql.dataframe.DataFrame
pdf = df.toPandas()
display(pdf)
pyspark.sql.dataframe.DataFrame
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
↑ This enables the workflow of processing large-to-medium data with pyspark to shrink it, then calling toPandas once it fits in memory to convert it into an easier-to-handle pandas DataFrame.
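A concrete sketch of that workflow (illustrative only; the aggregation chosen here is arbitrary): reduce the data on the Spark side first, then pull only the small result into pandas.

from pyspark.sql import functions as F

# The heavy lifting stays in Spark; only the 3-row summary crosses into
# driver memory as a pandas DataFrame.
summary_pdf = (
    df.groupBy("species")
      .agg(F.avg("petal_width").alias("avg_petal_width"))
      .toPandas()
)
print(type(summary_pdf))  # pandas.core.frame.DataFrame
print(summary_pdf)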
Finally, let's write and read data in various formats.
As mentioned earlier, the storage format directly affects Spark's processing performance, so handling it well is key.
# dfはspark dataframe
# csv (csv.gz if compression is specified)
df.write.save("testdata/iris_csv", format="csv")
# instead of the format option, you can also write it like below
# df.write.csv("testdata/iris_csv", compression="gzip")
!ls testdata/iris_csv
_SUCCESS
part-00000-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00001-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00002-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00003-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00004-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00005-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00006-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00007-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
!head testdata/iris_csv/part-00000-*.csv
5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5.0,3.6,1.4,0.2,setosa
5.4,3.9,1.7,0.4,setosa
4.6,3.4,1.4,0.3,setosa
5.0,3.4,1.5,0.2,setosa
4.4,2.9,1.4,0.2,setosa
4.9,3.1,1.5,0.1,setosa
↑ Unlike pandas, you do not point at a single file; you specify a directory to save into (the files are generated in a distributed fashion).
When reading, you can also point at individual files (e.g. to read files produced outside Spark), but normally you read by specifying the directory you saved to.
df_csv = spark.read.load("testdata/iris_csv", format="csv")
# instead of specifying format, you can also write it like below
# df_csv = spark.read.csv("testdata/iris_csv")
df_csv.show(3)
+---+---+---+---+----------+
|_c0|_c1|_c2|_c3| _c4|
+---+---+---+---+----------+
|6.3|3.3|4.7|1.6|versicolor|
|4.9|2.4|3.3|1.0|versicolor|
|6.6|2.9|4.6|1.3|versicolor|
+---+---+---+---+----------+
only showing top 3 rows
(↑ With csv, information such as column names tends to be lost, which makes it awkward to use.)
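One way around that, sketched here as an aside (the schema below is written out by hand; it is not defined anywhere in the notebook), is to pass an explicit schema when re-reading the headerless CSV directory:

from pyspark.sql.types import DoubleType, StringType, StructField, StructType

# Hand-written schema matching the iris columns; keeps names and types
# even though the written CSV parts carry no header row.
iris_schema = StructType([
    StructField("sepal_length", DoubleType()),
    StructField("sepal_width", DoubleType()),
    StructField("petal_length", DoubleType()),
    StructField("petal_width", DoubleType()),
    StructField("species", StringType()),
])

df_csv_typed = spark.read.csv("testdata/iris_csv", schema=iris_schema)
df_csv_typed.show(3)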
# for text formats, json etc. can be specified in addition to csv
# the compression format can be specified with compression
df.write.json("testdata/iris_json", compression='gzip')
!ls testdata/iris_json
_SUCCESS
part-00000-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00001-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00002-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00003-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00004-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00005-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00006-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00007-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
# reading works the same way
df_json = spark.read.json("testdata/iris_json")
# or df_json = spark.read.load("testdata/iris_json", format="json")
display(df_json)
df_json.show(3)
DataFrame[petal_length: double, petal_width: double, sepal_length: double, sepal_width: double, species: string]
+------------+-----------+------------+-----------+----------+
|petal_length|petal_width|sepal_length|sepal_width| species|
+------------+-----------+------------+-----------+----------+
| 3.3| 1.0| 5.0| 2.3|versicolor|
| 4.2| 1.3| 5.6| 2.7|versicolor|
| 4.2| 1.2| 5.7| 3.0|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows
(↑ With json, more type information is retained than with csv.)
# formats such as snappy.parquet and zlib.orc are well suited to data analysis/processing
df.write.parquet("testdata/iris_parquet", compression="snappy")
!ls testdata/iris_parquet
df.write.orc("testdata/iris_orc", compression="zlib")
!ls testdata/iris_orc
23/04/25 00:13:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
_SUCCESS
part-00000-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00001-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00002-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00003-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00004-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00005-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00006-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00007-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
_SUCCESS
part-00000-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00001-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00002-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00003-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00004-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00005-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00006-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00007-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
# reading
# parquet
print("parquet:")
df_parquet = spark.read.parquet("testdata/iris_parquet")
display(df_parquet)
df_parquet.show(3)
print("\n------------------\n")
# orc
print("orc:")
df_orc = spark.read.orc("testdata/iris_orc")
display(df_orc)
df_orc.show(3)
parquet:
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+----------+
| 5.0| 2.3| 3.3| 1.0|versicolor|
| 5.6| 2.7| 4.2| 1.3|versicolor|
| 5.7| 3.0| 4.2| 1.2|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows
------------------
orc:
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+----------+
| 5.0| 2.3| 3.3| 1.0|versicolor|
| 5.6| 2.7| 4.2| 1.3|versicolor|
| 5.7| 3.0| 4.2| 1.2|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows
# data can also be saved with partitions
df.write.save(
"testdata/iris_with_partition",
format="parquet",
compression="snappy",
partitionBy="species"
)
!cd testdata/iris_with_partition && tree
.
├── _SUCCESS
├── species=setosa
│   ├── part-00000-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   ├── part-00001-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   └── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
├── species=versicolor
│   ├── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   ├── part-00003-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   ├── part-00004-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   └── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
└── species=virginica
    ├── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
    ├── part-00006-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
    └── part-00007-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
4 directories, 11 files
カラム"species"の値ごとに別ディレクトリが切られてデータが保存される
As with indexes in an RDB, setting partitions appropriately for how the data is accessed can improve performance (and inappropriate partitioning can make it worse).
In the example above, if your analysis targets specific species, only the species of interest are read, so the amount of data accessed is limited. On the other hand, if most of your queries span multiple species, no performance gain can be expected and things may even get worse.
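To see that benefit in action, a short sketch (added for illustration): filtering on the partition column lets Spark prune to the matching species= directory instead of scanning everything.

from pyspark.sql import functions as F

# Only the species=setosa sub-directory should need to be read.
setosa_only = (
    spark.read.parquet("testdata/iris_with_partition")
         .filter(F.col("species") == "setosa")
)
setosa_only.explain()  # the scan node should list species under PartitionFilters
setosa_only.show(3)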
# reading works the same way even when partitions are present
df_partition = spark.read.load("testdata/iris_with_partition")
display(df_partition)
df_partition.show(3)
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+---------+
| 7.9| 3.8| 6.4| 2.0|virginica|
| 6.4| 2.8| 5.6| 2.2|virginica|
| 6.3| 2.8| 5.1| 1.5|virginica|
+------------+-----------+------------+-----------+---------+
only showing top 3 rows
You can also write data as a persisted table with saveAsTable.
As listed under Data Sources, many other storage backends are supported as well, such as RDBs (via JDBC) and Hive tables.
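A hedged sketch of the JDBC path (none of this is set up in this environment: the URL, table name and credentials below are placeholders, and the matching JDBC driver jar has to be on Spark's classpath, e.g. via spark.jars):

# Hypothetical connection details -- replace with a real database.
jdbc_url = "jdbc:postgresql://localhost:5432/testdb"
jdbc_props = {"user": "user", "password": "password"}

# Write the Spark DataFrame to a table...
df.write.jdbc(url=jdbc_url, table="iris", mode="overwrite", properties=jdbc_props)

# ...and read it back.
df_from_db = spark.read.jdbc(url=jdbc_url, table="iris", properties=jdbc_props)
df_from_db.show(3)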
import pyspark.pandas as ps
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.
warnings.warn(
Reference: https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/index.html
Recently, a way of working with Spark through an API quite close to pandas has been taking shape.
Note that there used to be a separate tool called Koalas for using Spark in a pandas-like way; judging from here and elsewhere, it appears to have been officially merged into pyspark.
import pyspark.pandas as ps
# create a pandas-on-Spark DataFrame
psdf = ps.read_csv("testdata/iris.csv")
display(psdf.__class__) # pyspark.pandas.frame.DataFrame
psdf.head(5)  # like a pandas DataFrame, head shows the first rows
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: If `index_col` is not specified for `read_csv`, the default index is attached which can cause additional overhead.
warnings.warn(message, PandasAPIOnSparkAdviceWarning)
pyspark.pandas.frame.DataFrame
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# filtering and display also work like regular pandas
psdf[psdf['sepal_length'] > 7.5]
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 105 | 7.6 | 3.0 | 6.6 | 2.1 | virginica |
| 117 | 7.7 | 3.8 | 6.7 | 2.2 | virginica |
| 118 | 7.7 | 2.6 | 6.9 | 2.3 | virginica |
| 122 | 7.7 | 2.8 | 6.7 | 2.0 | virginica |
| 131 | 7.9 | 3.8 | 6.4 | 2.0 | virginica |
| 135 | 7.7 | 3.0 | 6.1 | 2.3 | virginica |
# convert to a pyspark DataFrame
sdf = psdf.to_spark(index_col=["index"])
sdf.show(5)
+-----+------------+-----------+------------+-----------+-------+
|index|sepal_length|sepal_width|petal_length|petal_width|species|
+-----+------------+-----------+------------+-----------+-------+
| 0| 5.1| 3.5| 1.4| 0.2| setosa|
| 1| 4.9| 3.0| 1.4| 0.2| setosa|
| 2| 4.7| 3.2| 1.3| 0.2| setosa|
| 3| 4.6| 3.1| 1.5| 0.2| setosa|
| 4| 5.0| 3.6| 1.4| 0.2| setosa|
+-----+------------+-----------+------------+-----------+-------+
only showing top 5 rows
# convert to regular pandas
pdf = psdf.to_pandas()
display(pdf.__class__) # pandas.core.frame.DataFrame
pdf.head()
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: `to_pandas` loads all data into the driver's memory. It should only be used if the resulting pandas DataFrame is expected to be small.
warnings.warn(message, PandasAPIOnSparkAdviceWarning)
pandas.core.frame.DataFrame
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
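Other pandas idioms carry over largely unchanged as well; a small sketch (not an original cell) of a groupby on the pandas-on-Spark DataFrame:

# groupby/mean in the familiar pandas style; the computation itself
# runs on Spark under the hood.
psdf.groupby("species")[["sepal_length", "petal_length"]].mean().head()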
(Reference) dask (https://www.dask.org/) is a library whose API is highly compatible with, and integrates well with, pandas and numpy, and which supports lazy evaluation and parallel/distributed processing.
For example, for large tables you can do lazy, parallel/distributed processing with a dask dataframe and convert to a pandas dataframe once the data is small enough, which is almost exactly what we are doing here with pyspark.
Compared to dask, pyspark:

- can scale to more serious big data, since clusters can be built (though it can also be used lightly, as shown here)
- runs on a Java backend, and full-scale use requires building a cluster, so at that level the bar for setting up a working environment is higher
- used to be less pandas/numpy-friendly than dask: dask often lets you use the same methods as-is apart from lazy evaluation, while Spark is instead aligned with SQL; however, as noted above, the pandas API on Spark is being fleshed out, so pyspark can now also be used largely in the pandas style
| name: pyspark-intro | |
| channels: | |
| - conda-forge | |
| dependencies: | |
| - python=3.11 | |
| - pyspark>=3.4 | |
| - openjdk=17 | |
| - pandas>=2.0.1 | |
| - pyarrow | |
| # for testdata | |
| - numpy | |
| - seaborn | |
| # jupyter | |
| - jupyterlab |
| name: pyspark-intro | |
| channels: | |
| - conda-forge | |
| dependencies: | |
| - _libgcc_mutex=0.1=conda_forge | |
| - _openmp_mutex=4.5=2_gnu | |
| - aiofiles=22.1.0=pyhd8ed1ab_0 | |
| - aiosqlite=0.18.0=pyhd8ed1ab_0 | |
| - alsa-lib=1.2.8=h166bdaf_0 | |
| - anyio=3.6.2=pyhd8ed1ab_0 | |
| - argon2-cffi=21.3.0=pyhd8ed1ab_0 | |
| - argon2-cffi-bindings=21.2.0=py311hd4cff14_3 | |
| - arrow-cpp=11.0.0=ha770c72_14_cpu | |
| - asttokens=2.2.1=pyhd8ed1ab_0 | |
| - attrs=22.2.0=pyh71513ae_0 | |
| - aws-c-auth=0.6.26=hf365957_1 | |
| - aws-c-cal=0.5.21=h48707d8_2 | |
| - aws-c-common=0.8.14=h0b41bf4_0 | |
| - aws-c-compression=0.2.16=h03acc5a_5 | |
| - aws-c-event-stream=0.2.20=h00877a2_4 | |
| - aws-c-http=0.7.6=hf342b9f_0 | |
| - aws-c-io=0.13.19=h5b20300_3 | |
| - aws-c-mqtt=0.8.6=hc4349f7_12 | |
| - aws-c-s3=0.2.7=h909e904_1 | |
| - aws-c-sdkutils=0.1.8=h03acc5a_0 | |
| - aws-checksums=0.1.14=h03acc5a_5 | |
| - aws-crt-cpp=0.19.8=hf7fbfca_12 | |
| - aws-sdk-cpp=1.10.57=h17c43bd_8 | |
| - babel=2.12.1=pyhd8ed1ab_1 | |
| - backcall=0.2.0=pyh9f0ad1d_0 | |
| - backports=1.0=pyhd8ed1ab_3 | |
| - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 | |
| - beautifulsoup4=4.12.2=pyha770c72_0 | |
| - bleach=6.0.0=pyhd8ed1ab_0 | |
| - brotli=1.0.9=h166bdaf_8 | |
| - brotli-bin=1.0.9=h166bdaf_8 | |
| - brotlipy=0.7.0=py311hd4cff14_1005 | |
| - bzip2=1.0.8=h7f98852_4 | |
| - c-ares=1.18.1=h7f98852_0 | |
| - ca-certificates=2022.12.7=ha878542_0 | |
| - cairo=1.16.0=h35add3b_1015 | |
| - certifi=2022.12.7=pyhd8ed1ab_0 | |
| - cffi=1.15.1=py311h409f033_3 | |
| - charset-normalizer=3.1.0=pyhd8ed1ab_0 | |
| - comm=0.1.3=pyhd8ed1ab_0 | |
| - contourpy=1.0.7=py311ha3edf6b_0 | |
| - cryptography=40.0.2=py311h9b4c7bb_0 | |
| - cycler=0.11.0=pyhd8ed1ab_0 | |
| - debugpy=1.6.7=py311hcafe171_0 | |
| - decorator=5.1.1=pyhd8ed1ab_0 | |
| - defusedxml=0.7.1=pyhd8ed1ab_0 | |
| - entrypoints=0.4=pyhd8ed1ab_0 | |
| - executing=1.2.0=pyhd8ed1ab_0 | |
| - expat=2.5.0=hcb278e6_1 | |
| - flit-core=3.8.0=pyhd8ed1ab_0 | |
| - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 | |
| - font-ttf-inconsolata=3.000=h77eed37_0 | |
| - font-ttf-source-code-pro=2.038=h77eed37_0 | |
| - font-ttf-ubuntu=0.83=hab24e00_0 | |
| - fontconfig=2.14.2=h14ed4e7_0 | |
| - fonts-conda-ecosystem=1=0 | |
| - fonts-conda-forge=1=0 | |
| - fonttools=4.39.3=py311h2582759_0 | |
| - freetype=2.12.1=hca18f0e_1 | |
| - gettext=0.21.1=h27087fc_0 | |
| - gflags=2.2.2=he1b5a44_1004 | |
| - giflib=5.2.1=h0b41bf4_3 | |
| - glog=0.6.0=h6f12383_0 | |
| - graphite2=1.3.13=h58526e2_1001 | |
| - harfbuzz=6.0.0=h3ff4399_1 | |
| - icu=72.1=hcb278e6_0 | |
| - idna=3.4=pyhd8ed1ab_0 | |
| - importlib-metadata=6.6.0=pyha770c72_0 | |
| - importlib_metadata=6.6.0=hd8ed1ab_0 | |
| - importlib_resources=5.12.0=pyhd8ed1ab_0 | |
| - ipykernel=6.22.0=pyh210e3f2_0 | |
| - ipython=8.12.0=pyh41d4057_0 | |
| - ipython_genutils=0.2.0=py_1 | |
| - jedi=0.18.2=pyhd8ed1ab_0 | |
| - jinja2=3.1.2=pyhd8ed1ab_1 | |
| - json5=0.9.5=pyh9f0ad1d_0 | |
| - jsonschema=4.17.3=pyhd8ed1ab_0 | |
| - jupyter_client=8.2.0=pyhd8ed1ab_0 | |
| - jupyter_core=5.3.0=py311h38be061_0 | |
| - jupyter_events=0.6.3=pyhd8ed1ab_0 | |
| - jupyter_server=2.5.0=pyhd8ed1ab_0 | |
| - jupyter_server_fileid=0.9.0=pyhd8ed1ab_0 | |
| - jupyter_server_terminals=0.4.4=pyhd8ed1ab_1 | |
| - jupyter_server_ydoc=0.8.0=pyhd8ed1ab_0 | |
| - jupyter_ydoc=0.2.3=pyhd8ed1ab_0 | |
| - jupyterlab=3.6.3=pyhd8ed1ab_0 | |
| - jupyterlab_pygments=0.2.2=pyhd8ed1ab_0 | |
| - jupyterlab_server=2.22.1=pyhd8ed1ab_0 | |
| - keyutils=1.6.1=h166bdaf_0 | |
| - kiwisolver=1.4.4=py311h4dd048b_1 | |
| - krb5=1.20.1=h81ceb04_0 | |
| - lcms2=2.15=haa2dc70_1 | |
| - ld_impl_linux-64=2.40=h41732ed_0 | |
| - lerc=4.0.0=h27087fc_0 | |
| - libabseil=20230125.0=cxx17_hcb278e6_1 | |
| - libarrow=11.0.0=h93537a5_14_cpu | |
| - libblas=3.9.0=16_linux64_openblas | |
| - libbrotlicommon=1.0.9=h166bdaf_8 | |
| - libbrotlidec=1.0.9=h166bdaf_8 | |
| - libbrotlienc=1.0.9=h166bdaf_8 | |
| - libcblas=3.9.0=16_linux64_openblas | |
| - libcrc32c=1.1.2=h9c3ff4c_0 | |
| - libcups=2.3.3=h36d4200_3 | |
| - libcurl=8.0.1=h588be90_0 | |
| - libdeflate=1.18=h0b41bf4_0 | |
| - libedit=3.1.20191231=he28a2e2_2 | |
| - libev=4.33=h516909a_1 | |
| - libevent=2.1.10=h28343ad_4 | |
| - libexpat=2.5.0=hcb278e6_1 | |
| - libffi=3.4.2=h7f98852_5 | |
| - libgcc-ng=12.2.0=h65d4601_19 | |
| - libgfortran-ng=12.2.0=h69a702a_19 | |
| - libgfortran5=12.2.0=h337968e_19 | |
| - libglib=2.76.2=hebfc3b9_0 | |
| - libgomp=12.2.0=h65d4601_19 | |
| - libgoogle-cloud=2.8.0=h0bc5f78_1 | |
| - libgrpc=1.52.1=hcf146ea_1 | |
| - libiconv=1.17=h166bdaf_0 | |
| - libjpeg-turbo=2.1.5.1=h0b41bf4_0 | |
| - liblapack=3.9.0=16_linux64_openblas | |
| - libnghttp2=1.52.0=h61bc06f_0 | |
| - libnsl=2.0.0=h7f98852_0 | |
| - libnuma=2.0.16=h0b41bf4_1 | |
| - libopenblas=0.3.21=pthreads_h78a6416_3 | |
| - libpng=1.6.39=h753d276_0 | |
| - libprotobuf=3.21.12=h3eb15da_0 | |
| - libsodium=1.0.18=h36c2ea0_1 | |
| - libsqlite=3.40.0=h753d276_1 | |
| - libssh2=1.10.0=hf14f497_3 | |
| - libstdcxx-ng=12.2.0=h46fd767_19 | |
| - libthrift=0.18.1=h5e4af38_0 | |
| - libtiff=4.5.0=ha587672_6 | |
| - libutf8proc=2.8.0=h166bdaf_0 | |
| - libuuid=2.38.1=h0b41bf4_0 | |
| - libwebp-base=1.3.0=h0b41bf4_0 | |
| - libxcb=1.13=h7f98852_1004 | |
| - libzlib=1.2.13=h166bdaf_4 | |
| - lz4-c=1.9.4=hcb278e6_0 | |
| - markupsafe=2.1.2=py311h2582759_0 | |
| - matplotlib-base=3.7.1=py311h8597a09_0 | |
| - matplotlib-inline=0.1.6=pyhd8ed1ab_0 | |
| - mistune=2.0.5=pyhd8ed1ab_0 | |
| - munkres=1.1.4=pyh9f0ad1d_0 | |
| - nbclassic=0.5.5=pyhb4ecaf3_1 | |
| - nbclient=0.7.3=pyhd8ed1ab_0 | |
| - nbconvert=7.3.1=pyhd8ed1ab_0 | |
| - nbconvert-core=7.3.1=pyhd8ed1ab_0 | |
| - nbconvert-pandoc=7.3.1=pyhd8ed1ab_0 | |
| - nbformat=5.8.0=pyhd8ed1ab_0 | |
| - ncurses=6.3=h27087fc_1 | |
| - nest-asyncio=1.5.6=pyhd8ed1ab_0 | |
| - notebook=6.5.4=pyha770c72_0 | |
| - notebook-shim=0.2.2=pyhd8ed1ab_0 | |
| - numpy=1.24.3=py311h64a7726_0 | |
| - openjdk=17.0.3=h4335b31_6 | |
| - openjpeg=2.5.0=hfec8fc6_2 | |
| - openssl=3.1.0=hd590300_1 | |
| - orc=1.8.3=hfdbbad2_0 | |
| - packaging=23.1=pyhd8ed1ab_0 | |
| - pandas=2.0.1=py311h320fe9a_0 | |
| - pandoc=2.19.2=h32600fe_2 | |
| - pandocfilters=1.5.0=pyhd8ed1ab_0 | |
| - parquet-cpp=1.5.1=2 | |
| - parso=0.8.3=pyhd8ed1ab_0 | |
| - patsy=0.5.3=pyhd8ed1ab_0 | |
| - pcre2=10.40=hc3806b6_0 | |
| - pexpect=4.8.0=pyh1a96a4e_2 | |
| - pickleshare=0.7.5=py_1003 | |
| - pillow=9.5.0=py311h573f0d3_0 | |
| - pip=23.1.1=pyhd8ed1ab_0 | |
| - pixman=0.40.0=h36c2ea0_0 | |
| - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0 | |
| - platformdirs=3.2.0=pyhd8ed1ab_0 | |
| - pooch=1.7.0=pyha770c72_3 | |
| - prometheus_client=0.16.0=pyhd8ed1ab_0 | |
| - prompt-toolkit=3.0.38=pyha770c72_0 | |
| - prompt_toolkit=3.0.38=hd8ed1ab_0 | |
| - psutil=5.9.5=py311h2582759_0 | |
| - pthread-stubs=0.4=h36c2ea0_1001 | |
| - ptyprocess=0.7.0=pyhd3deb0d_0 | |
| - pure_eval=0.2.2=pyhd8ed1ab_0 | |
| - py4j=0.10.9.7=pyhd8ed1ab_0 | |
| - pyarrow=11.0.0=py311hbdf6286_14_cpu | |
| - pycparser=2.21=pyhd8ed1ab_0 | |
| - pygments=2.15.1=pyhd8ed1ab_0 | |
| - pyopenssl=23.1.1=pyhd8ed1ab_0 | |
| - pyparsing=3.0.9=pyhd8ed1ab_0 | |
| - pyrsistent=0.19.3=py311h2582759_0 | |
| - pysocks=1.7.1=pyha2e5f31_6 | |
| - pyspark=3.4.0=pyhd8ed1ab_0 | |
| - python=3.11.3=h2755cc3_0_cpython | |
| - python-dateutil=2.8.2=pyhd8ed1ab_0 | |
| - python-fastjsonschema=2.16.3=pyhd8ed1ab_0 | |
| - python-json-logger=2.0.7=pyhd8ed1ab_0 | |
| - python-tzdata=2023.3=pyhd8ed1ab_0 | |
| - python_abi=3.11=3_cp311 | |
| - pytz=2023.3=pyhd8ed1ab_0 | |
| - pyyaml=6.0=py311hd4cff14_5 | |
| - pyzmq=25.0.2=py311hd6ccaeb_0 | |
| - re2=2023.02.02=hcb278e6_0 | |
| - readline=8.2=h8228510_1 | |
| - requests=2.28.2=pyhd8ed1ab_1 | |
| - rfc3339-validator=0.1.4=pyhd8ed1ab_0 | |
| - rfc3986-validator=0.1.1=pyh9f0ad1d_0 | |
| - s2n=1.3.41=h3358134_0 | |
| - scipy=1.10.1=py311h8e6699e_0 | |
| - seaborn=0.12.2=hd8ed1ab_0 | |
| - seaborn-base=0.12.2=pyhd8ed1ab_0 | |
| - send2trash=1.8.0=pyhd8ed1ab_0 | |
| - setuptools=67.7.1=pyhd8ed1ab_0 | |
| - six=1.16.0=pyh6c4a22f_0 | |
| - snappy=1.1.10=h9fff704_0 | |
| - sniffio=1.3.0=pyhd8ed1ab_0 | |
| - soupsieve=2.3.2.post1=pyhd8ed1ab_0 | |
| - stack_data=0.6.2=pyhd8ed1ab_0 | |
| - statsmodels=0.13.5=py311h4c7f6c3_2 | |
| - terminado=0.17.1=pyh41d4057_0 | |
| - tinycss2=1.2.1=pyhd8ed1ab_0 | |
| - tk=8.6.12=h27826a3_0 | |
| - tomli=2.0.1=pyhd8ed1ab_0 | |
| - tornado=6.3=py311h2582759_0 | |
| - traitlets=5.9.0=pyhd8ed1ab_0 | |
| - typing-extensions=4.5.0=hd8ed1ab_0 | |
| - typing_extensions=4.5.0=pyha770c72_0 | |
| - tzdata=2023c=h71feb2d_0 | |
| - ucx=1.14.0=h8c404fb_1 | |
| - urllib3=1.26.15=pyhd8ed1ab_0 | |
| - wcwidth=0.2.6=pyhd8ed1ab_0 | |
| - webencodings=0.5.1=py_1 | |
| - websocket-client=1.5.1=pyhd8ed1ab_0 | |
| - wheel=0.40.0=pyhd8ed1ab_0 | |
| - xorg-fixesproto=5.0=h7f98852_1002 | |
| - xorg-inputproto=2.3.2=h7f98852_1002 | |
| - xorg-kbproto=1.0.7=h7f98852_1002 | |
| - xorg-libice=1.0.10=h7f98852_0 | |
| - xorg-libsm=1.2.3=hd9c2040_1000 | |
| - xorg-libx11=1.8.4=h0b41bf4_0 | |
| - xorg-libxau=1.0.9=h7f98852_0 | |
| - xorg-libxdmcp=1.1.3=h7f98852_0 | |
| - xorg-libxext=1.3.4=h0b41bf4_2 | |
| - xorg-libxfixes=5.0.3=h7f98852_1004 | |
| - xorg-libxi=1.7.10=h7f98852_0 | |
| - xorg-libxrender=0.9.10=h7f98852_1003 | |
| - xorg-libxtst=1.2.3=h7f98852_1002 | |
| - xorg-recordproto=1.14.2=h7f98852_1002 | |
| - xorg-renderproto=0.11.1=h7f98852_1002 | |
| - xorg-xextproto=7.3.0=h0b41bf4_1003 | |
| - xorg-xproto=7.0.31=h7f98852_1007 | |
| - xz=5.2.6=h166bdaf_0 | |
| - y-py=0.5.9=py311hfe55011_0 | |
| - yaml=0.2.5=h7f98852_2 | |
| - ypy-websocket=0.8.2=pyhd8ed1ab_0 | |
| - zeromq=4.3.4=h9c3ff4c_1 | |
| - zipp=3.15.0=pyhd8ed1ab_0 | |
| - zlib=1.2.13=h166bdaf_4 | |
| - zstd=1.5.2=h3eb15da_6 | |
| # prefix: /path/to/pyspark-intro |