- Build the environment on Linux using Mambaforge
- Set cluster management aside and quickly put together a minimal environment for learning
- python 3.11
- pyspark 3.4.0
- openjdk 17
- pandas 2.0.1
    - to check interoperability with pandas
- other tools such as Jupyter
```sh
# use mambaforge
mamba env create -f pyspark_env.yaml
```
| # Created by https://www.toptal.com/developers/gitignore/api/jupyternotebooks,visualstudiocode,python,pycharm+all | |
| # Edit at https://www.toptal.com/developers/gitignore?templates=jupyternotebooks,visualstudiocode,python,pycharm+all | |
| ### JupyterNotebooks ### | |
| # gitignore template for Jupyter Notebooks | |
| # website: http://jupyter.org/ | |
| .ipynb_checkpoints | |
| */.ipynb_checkpoints/* | |
| # IPython | |
| profile_default/ | |
| ipython_config.py | |
| # Remove previous ipynb_checkpoints | |
| # git rm -r .ipynb_checkpoints/ | |
| ### PyCharm+all ### | |
| # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider | |
| # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 | |
| # User-specific stuff | |
| .idea/**/workspace.xml | |
| .idea/**/tasks.xml | |
| .idea/**/usage.statistics.xml | |
| .idea/**/dictionaries | |
| .idea/**/shelf | |
| # AWS User-specific | |
| .idea/**/aws.xml | |
| # Generated files | |
| .idea/**/contentModel.xml | |
| # Sensitive or high-churn files | |
| .idea/**/dataSources/ | |
| .idea/**/dataSources.ids | |
| .idea/**/dataSources.local.xml | |
| .idea/**/sqlDataSources.xml | |
| .idea/**/dynamic.xml | |
| .idea/**/uiDesigner.xml | |
| .idea/**/dbnavigator.xml | |
| # Gradle | |
| .idea/**/gradle.xml | |
| .idea/**/libraries | |
| # Gradle and Maven with auto-import | |
| # When using Gradle or Maven with auto-import, you should exclude module files, | |
| # since they will be recreated, and may cause churn. Uncomment if using | |
| # auto-import. | |
| # .idea/artifacts | |
| # .idea/compiler.xml | |
| # .idea/jarRepositories.xml | |
| # .idea/modules.xml | |
| # .idea/*.iml | |
| # .idea/modules | |
| # *.iml | |
| # *.ipr | |
| # CMake | |
| cmake-build-*/ | |
| # Mongo Explorer plugin | |
| .idea/**/mongoSettings.xml | |
| # File-based project format | |
| *.iws | |
| # IntelliJ | |
| out/ | |
| # mpeltonen/sbt-idea plugin | |
| .idea_modules/ | |
| # JIRA plugin | |
| atlassian-ide-plugin.xml | |
| # Cursive Clojure plugin | |
| .idea/replstate.xml | |
| # SonarLint plugin | |
| .idea/sonarlint/ | |
| # Crashlytics plugin (for Android Studio and IntelliJ) | |
| com_crashlytics_export_strings.xml | |
| crashlytics.properties | |
| crashlytics-build.properties | |
| fabric.properties | |
| # Editor-based Rest Client | |
| .idea/httpRequests | |
| # Android studio 3.1+ serialized cache file | |
| .idea/caches/build_file_checksums.ser | |
| ### PyCharm+all Patch ### | |
| # Ignore everything but code style settings and run configurations | |
| # that are supposed to be shared within teams. | |
| .idea/* | |
| !.idea/codeStyles | |
| !.idea/runConfigurations | |
| ### Python ### | |
| # Byte-compiled / optimized / DLL files | |
| __pycache__/ | |
| *.py[cod] | |
| *$py.class | |
| # C extensions | |
| *.so | |
| # Distribution / packaging | |
| .Python | |
| build/ | |
| develop-eggs/ | |
| dist/ | |
| downloads/ | |
| eggs/ | |
| .eggs/ | |
| lib/ | |
| lib64/ | |
| parts/ | |
| sdist/ | |
| var/ | |
| wheels/ | |
| share/python-wheels/ | |
| *.egg-info/ | |
| .installed.cfg | |
| *.egg | |
| MANIFEST | |
| # PyInstaller | |
| # Usually these files are written by a python script from a template | |
| # before PyInstaller builds the exe, so as to inject date/other infos into it. | |
| *.manifest | |
| *.spec | |
| # Installer logs | |
| pip-log.txt | |
| pip-delete-this-directory.txt | |
| # Unit test / coverage reports | |
| htmlcov/ | |
| .tox/ | |
| .nox/ | |
| .coverage | |
| .coverage.* | |
| .cache | |
| nosetests.xml | |
| coverage.xml | |
| *.cover | |
| *.py,cover | |
| .hypothesis/ | |
| .pytest_cache/ | |
| cover/ | |
| # Translations | |
| *.mo | |
| *.pot | |
| # Django stuff: | |
| *.log | |
| local_settings.py | |
| db.sqlite3 | |
| db.sqlite3-journal | |
| # Flask stuff: | |
| instance/ | |
| .webassets-cache | |
| # Scrapy stuff: | |
| .scrapy | |
| # Sphinx documentation | |
| docs/_build/ | |
| # PyBuilder | |
| .pybuilder/ | |
| target/ | |
| # Jupyter Notebook | |
| # IPython | |
| # pyenv | |
| # For a library or package, you might want to ignore these files since the code is | |
| # intended to run in multiple environments; otherwise, check them in: | |
| # .python-version | |
| # pipenv | |
| # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. | |
| # However, in case of collaboration, if having platform-specific dependencies or dependencies | |
| # having no cross-platform support, pipenv may install dependencies that don't work, or not | |
| # install all needed dependencies. | |
| #Pipfile.lock | |
| # poetry | |
| # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. | |
| # This is especially recommended for binary packages to ensure reproducibility, and is more | |
| # commonly ignored for libraries. | |
| # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control | |
| #poetry.lock | |
| # pdm | |
| # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. | |
| #pdm.lock | |
| # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it | |
| # in version control. | |
| # https://pdm.fming.dev/#use-with-ide | |
| .pdm.toml | |
| # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm | |
| __pypackages__/ | |
| # Celery stuff | |
| celerybeat-schedule | |
| celerybeat.pid | |
| # SageMath parsed files | |
| *.sage.py | |
| # Environments | |
| .env | |
| .venv | |
| env/ | |
| venv/ | |
| ENV/ | |
| env.bak/ | |
| venv.bak/ | |
| # Spyder project settings | |
| .spyderproject | |
| .spyproject | |
| # Rope project settings | |
| .ropeproject | |
| # mkdocs documentation | |
| /site | |
| # mypy | |
| .mypy_cache/ | |
| .dmypy.json | |
| dmypy.json | |
| # Pyre type checker | |
| .pyre/ | |
| # pytype static type analyzer | |
| .pytype/ | |
| # Cython debug symbols | |
| cython_debug/ | |
| # PyCharm | |
| # JetBrains specific template is maintained in a separate JetBrains.gitignore that can | |
| # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore | |
| # and can be added to the global gitignore or merged into this file. For a more nuclear | |
| # option (not recommended) you can uncomment the following to ignore the entire idea folder. | |
| #.idea/ | |
| ### Python Patch ### | |
| # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration | |
| poetry.toml | |
| # ruff | |
| .ruff_cache/ | |
| # LSP config files | |
| pyrightconfig.json | |
| ### VisualStudioCode ### | |
| .vscode/* | |
| !.vscode/settings.json | |
| !.vscode/tasks.json | |
| !.vscode/launch.json | |
| !.vscode/extensions.json | |
| !.vscode/*.code-snippets | |
| # Local History for Visual Studio Code | |
| .history/ | |
| # Built Visual Studio Code Extensions | |
| *.vsix | |
| ### VisualStudioCode Patch ### | |
| # Ignore all local history of files | |
| .history | |
| .ionide | |
| # End of https://www.toptal.com/developers/gitignore/api/jupyternotebooks,visualstudiocode,python,pycharm+all |
| { | |
| "cells": [ | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "e40569e7", | |
| "metadata": {}, | |
| "source": [ | |
| "# Pyspark入門\n", | |
| "\n", | |
| "pandasなどは小規模~中規模くらいのデータセットの処理や分析には有効だが、\n", | |
| "メモリに乗り切らないレベルのデータ量は処理できない。\n", | |
| "また、GILによって基本的に1スレッドでの動作になるため、マシンスペックを最大限活かし切るのは難しい。\n", | |
| "\n", | |
| "遅延評価、並列分散処理によって所謂ビッグデータといわれるレベルのテーブルデータの処理・分析に使うことができ、\n", | |
| "更にpandasとの連携・併用ができるツールとして[pyspark](https://www.databricks.com/jp/glossary/pyspark)が存在するため紹介する。\n", | |
| "\n", | |
| "なお、詳細は[PySpark Documentation](https://spark.apache.org/docs/latest/api/python/)なども参照のこと\n" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "c612301e", | |
| "metadata": {}, | |
| "source": [ | |
| "## 環境の作成\n", | |
| "\n", | |
| "Mambaforgeをインストール済みのLinux環境を前提とする。\n", | |
| "\n", | |
| "- conda 23.1.0\n", | |
| "- mamba 1.4.1\n", | |
| "\n", | |
| "以下のようなconda仮想環境作成用のyamlを用意する:\n", | |
| "\n", | |
| "```yaml:pyspark_env.yaml\n", | |
| "# pyspark_env.yaml\n", | |
| "name: pyspark-intro\n", | |
| "channels:\n", | |
| " - conda-forge\n", | |
| "dependencies:\n", | |
| " - python=3.11\n", | |
| " - pyspark>=3.4\n", | |
| " - openjdk=17\n", | |
| " - pandas>=2.0.1\n", | |
| " - pyarrow\n", | |
| " # for testdata\n", | |
| " - numpy\n", | |
| " - seaborn\n", | |
| " # jupyter\n", | |
| " - jupyterlab\n", | |
| "```\n", | |
| "\n", | |
| "※~~先日[pandasの2.0がリリースされた](https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html)が、\n", | |
| "deprecatedになっていた[`iteritems`](https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#removal-of-prior-version-deprecations-changes)が削除された影響で、\n", | |
| "pandasからpysparkのDataFrameに変換する部分が一部動作しなくなる現象が起こるため、当面は1.5系を使う。~~\n", | |
| "`pyspark>=3.4.0`で修正済みなので、pandasも2系を使う\n", | |
| "\n", | |
| "\n", | |
| "mambaコマンドで仮想環境を作成(condaだと依存関係の解決が遅いのでmambaの使用を推奨)\n", | |
| "\n", | |
| "```sh\n", | |
| "mamba env create -f pyspark_env.yaml\n", | |
| "\n", | |
| "# 出来上がった環境(pyspark-intro)をactivate\n", | |
| "conda activate pyspark-intro\n", | |
| "# or `mamba activate pyspark-intro`\n", | |
| "```" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "4290e154", | |
| "metadata": {}, | |
| "source": [ | |
| "## 実行\n", | |
| "\n", | |
| "- spark sessionの起動\n", | |
| "- データの読み込みと保存\n", | |
| "- pandasとの連携\n", | |
| "\n", | |
| "あたりを簡単に確認する。\n", | |
| "\n", | |
| "データ処理周りの詳細は省略するが、Sparkのデータフレーム機能が[Spark SQL](https://www.databricks.com/jp/glossary/what-is-spark-sql)と対応していることから、\n", | |
| "概ねSQLで出来ることようなことを実行出来ると考えればOK。(というかSQLでも書ける)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "17452c65", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# テストデータ格納ディレクトリのセットアップ(Linuxコマンドを呼び出して実行している)\n", | |
| "\n", | |
| "!rm -rf testdata && mkdir testdata" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "04ef6a3d", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "your 131072x1 screen size is bogus. expect trouble\n", | |
| "23/04/25 00:12:59 WARN Utils: Your hostname, XXXXXXX-XXXXXXX resolves to a loopback address: 127.0.1.1; using 172.24.210.1 instead (on interface eth0)\n", | |
| "23/04/25 00:12:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", | |
| "Setting default log level to \"WARN\".\n", | |
| "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", | |
| "23/04/25 00:12:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "\n", | |
| " <div>\n", | |
| " <p><b>SparkSession - in-memory</b></p>\n", | |
| " \n", | |
| " <div>\n", | |
| " <p><b>SparkContext</b></p>\n", | |
| "\n", | |
| " <p><a href=\"http://172.24.210.1:4040\">Spark UI</a></p>\n", | |
| "\n", | |
| " <dl>\n", | |
| " <dt>Version</dt>\n", | |
| " <dd><code>v3.4.0</code></dd>\n", | |
| " <dt>Master</dt>\n", | |
| " <dd><code>local[*]</code></dd>\n", | |
| " <dt>AppName</dt>\n", | |
| " <dd><code>LocalTest</code></dd>\n", | |
| " </dl>\n", | |
| " </div>\n", | |
| " \n", | |
| " </div>\n", | |
| " " | |
| ], | |
| "text/plain": [ | |
| "<pyspark.sql.session.SparkSession at 0x7ff1d5736cd0>" | |
| ] | |
| }, | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "from pyspark.sql.session import SparkSession\n", | |
| "\n", | |
| "# spark sessionの立ち上げ\n", | |
| "spark = (SparkSession.builder\n", | |
| " .master(\"local[*]\")\n", | |
| " .appName(\"LocalTest\")\n", | |
| " .config(\"spark.sql.execution.arrow.pyspark.enabled\", \"true\")\n", | |
| " .config(\"spark.executor.memory\", \"4g\") # TMP (実際はハードコートはあまり良くない)\n", | |
| " .getOrCreate()\n", | |
| ")\n", | |
| "spark" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "71bf72b6", | |
| "metadata": {}, | |
| "source": [ | |
| "↑`config`部分について、pandasとの連携のため、configを調整しておく(pyarrowをインストールしておく必要あり)\n", | |
| "\n", | |
| "参考: https://learn.microsoft.com/ja-jp/azure/databricks/pandas/pyspark-pandas-conversion\n", | |
| "\n", | |
| "なお、詳細かつ環境依存性のあるconfigは別途configファイル(`$SPARK_HOME/conf/spark-defaults.conf`)を作成するなどして設定すると良い\n", | |
| "\n", | |
| "コンフィグ設定の詳細: https://spark.apache.org/docs/latest/configuration.html\n", | |
| "\n", | |
| "また、コンフィグファイルの置き場所の設定などに必要な環境変数`SPARK_HOME`について、condaでインストールした場合の設定方法などは[ここ](https://qiita.com/junkor-1011/items/7ec9bfaaf76568ce4a05#spark_home%E3%81%AE%E8%A8%AD%E5%AE%9A)などを参照" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "d6ac5ce7", | |
| "metadata": {}, | |
| "source": [ | |
| "Spark Sessionを起動すると、デフォルトでは[http://localhost:4040](http://localhost:4040)に[Spark UI](http://localhost:4040)が立ち上がり、\n", | |
| "実行状態などをモニタリングすることが出来る(デバッグなどにも便利)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "733b9529", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>sepal_length</th>\n", | |
| " <th>sepal_width</th>\n", | |
| " <th>petal_length</th>\n", | |
| " <th>petal_width</th>\n", | |
| " <th>species</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>5.1</td>\n", | |
| " <td>3.5</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>4.9</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>4.7</td>\n", | |
| " <td>3.2</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>4.6</td>\n", | |
| " <td>3.1</td>\n", | |
| " <td>1.5</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.6</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>145</th>\n", | |
| " <td>6.7</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>5.2</td>\n", | |
| " <td>2.3</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>146</th>\n", | |
| " <td>6.3</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>5.0</td>\n", | |
| " <td>1.9</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>147</th>\n", | |
| " <td>6.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>5.2</td>\n", | |
| " <td>2.0</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>148</th>\n", | |
| " <td>6.2</td>\n", | |
| " <td>3.4</td>\n", | |
| " <td>5.4</td>\n", | |
| " <td>2.3</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>149</th>\n", | |
| " <td>5.9</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>5.1</td>\n", | |
| " <td>1.8</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>150 rows × 5 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " sepal_length sepal_width petal_length petal_width species\n", | |
| "0 5.1 3.5 1.4 0.2 setosa\n", | |
| "1 4.9 3.0 1.4 0.2 setosa\n", | |
| "2 4.7 3.2 1.3 0.2 setosa\n", | |
| "3 4.6 3.1 1.5 0.2 setosa\n", | |
| "4 5.0 3.6 1.4 0.2 setosa\n", | |
| ".. ... ... ... ... ...\n", | |
| "145 6.7 3.0 5.2 2.3 virginica\n", | |
| "146 6.3 2.5 5.0 1.9 virginica\n", | |
| "147 6.5 3.0 5.2 2.0 virginica\n", | |
| "148 6.2 3.4 5.4 2.3 virginica\n", | |
| "149 5.9 3.0 5.1 1.8 virginica\n", | |
| "\n", | |
| "[150 rows x 5 columns]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "# テストデータの作成\n", | |
| "import seaborn as sns\n", | |
| "\n", | |
| "iris = sns.load_dataset(\"iris\")\n", | |
| "display(iris)\n", | |
| "\n", | |
| "iris.to_csv(\"testdata/iris.csv\", index=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "60b743f0", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "# pysparkでcsvを読み込み\n", | |
| "\n", | |
| "df_iris = spark.read.csv(\n", | |
| " \"testdata/iris.csv\",\n", | |
| " header=True,\n", | |
| " inferSchema=True,\n", | |
| ")\n", | |
| "\n", | |
| "# 遅延評価するので、カラムのスキーマのみ表示される\n", | |
| "display(df_iris)" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "ebf1496e", | |
| "metadata": {}, | |
| "source": [ | |
| "(↑csvから読み込んでいるため、ヘッダーの有無の指定やスキーマの自動推定をオプションで与えている)\n", | |
| "\n", | |
| "\n", | |
| "※(補足):\n", | |
| "\n", | |
| "- 遅延評価を行う関係で、オンメモリに展開するpandasなどと異なりSparkの処理のパフォーマンスはデータソースの形式の影響をもろに受ける\n", | |
| " - データ形式(csv/json/parquet/orc/RDBテーブル/...etc)や、圧縮形式(gzip/xz/snappy/zlib/...etc)、カラムに対するパーティションなど\n", | |
| "- データ分析・データ処理用途であれば行指向形式であるcsvを読み込んで処理を実行するのは一般に遅く、列指向形式のparquetやorcなどの方が有利" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "e89db683", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "|sepal_length|sepal_width|petal_length|petal_width|species|\n", | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "| 5.1| 3.5| 1.4| 0.2| setosa|\n", | |
| "| 4.9| 3.0| 1.4| 0.2| setosa|\n", | |
| "| 4.7| 3.2| 1.3| 0.2| setosa|\n", | |
| "| 4.6| 3.1| 1.5| 0.2| setosa|\n", | |
| "| 5.0| 3.6| 1.4| 0.2| setosa|\n", | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "only showing top 5 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# showなどによって初めて評価が行われる\n", | |
| "df_iris.show(5) # 5件を表示" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "3d4c8b6e", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+\n", | |
| "|sepal_length|sepal_width|\n", | |
| "+------------+-----------+\n", | |
| "| 5.1| 3.5|\n", | |
| "| 4.9| 3.0|\n", | |
| "| 4.7| 3.2|\n", | |
| "+------------+-----------+\n", | |
| "only showing top 3 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# selectによるカラムの選択\n", | |
| "(df_iris\n", | |
| " .select([\"sepal_length\", \"sepal_width\"])\n", | |
| " .show(3)\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "0f5df129", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+------------+-----------+---------+\n", | |
| "|sepal_length|sepal_width|petal_length|petal_width| species|\n", | |
| "+------------+-----------+------------+-----------+---------+\n", | |
| "| 6.3| 3.3| 6.0| 2.5|virginica|\n", | |
| "| 5.8| 2.7| 5.1| 1.9|virginica|\n", | |
| "| 7.1| 3.0| 5.9| 2.1|virginica|\n", | |
| "+------------+-----------+------------+-----------+---------+\n", | |
| "only showing top 3 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# filterによる簡単なクエリ\n", | |
| "(df_iris\n", | |
| " .filter(df_iris.species == \"virginica\")\n", | |
| " .filter(df_iris.sepal_length > 5.0)\n", | |
| " .show(3)\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "77abd749", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+----------+-----+\n", | |
| "| species|count|\n", | |
| "+----------+-----+\n", | |
| "| virginica| 50|\n", | |
| "|versicolor| 50|\n", | |
| "| setosa| 50|\n", | |
| "+----------+-----+\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# groupByによる集約(1)\n", | |
| "(df_iris\n", | |
| " .groupBy('species')\n", | |
| " .count()\n", | |
| " .show() \n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "a998da4d", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+----------+------------------+------------------+-----------------+------------------+\n", | |
| "| species| avg(sepal_width)| avg(petal_width)|avg(sepal_length)| avg(petal_length)|\n", | |
| "+----------+------------------+------------------+-----------------+------------------+\n", | |
| "| virginica|2.9739999999999998| 2.026|6.587999999999998| 5.552|\n", | |
| "|versicolor|2.7700000000000005|1.3259999999999998| 5.936| 4.26|\n", | |
| "| setosa| 3.428000000000001|0.2459999999999999|5.005999999999999|1.4620000000000002|\n", | |
| "+----------+------------------+------------------+-----------------+------------------+\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# groupByによる集約(2)\n", | |
| "\n", | |
| "(df_iris\n", | |
| " .groupBy('species')\n", | |
| " .agg({\n", | |
| " 'sepal_length': 'mean',\n", | |
| " 'sepal_width': 'mean',\n", | |
| " 'petal_length': 'mean',\n", | |
| " 'petal_width': 'mean',\n", | |
| " })\n", | |
| " .show()\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "30e590f4", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "23/04/25 00:13:09 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+-------+------------------+-------------------+------------------+------------------+---------+\n", | |
| "|summary| sepal_length| sepal_width| petal_length| petal_width| species|\n", | |
| "+-------+------------------+-------------------+------------------+------------------+---------+\n", | |
| "| count| 150| 150| 150| 150| 150|\n", | |
| "| mean| 5.843333333333335| 3.057333333333334|3.7580000000000027| 1.199333333333334| null|\n", | |
| "| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467| null|\n", | |
| "| min| 4.3| 2.0| 1.0| 0.1| setosa|\n", | |
| "| max| 7.9| 4.4| 6.9| 2.5|virginica|\n", | |
| "+-------+------------------+-------------------+------------------+------------------+---------+\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# DataFrameのサマリー\n", | |
| "df_iris.describe().show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "b086ea18", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+---------+---------+-----------+\n", | |
| "|namespace|tableName|isTemporary|\n", | |
| "+---------+---------+-----------+\n", | |
| "| | iris| true|\n", | |
| "+---------+---------+-----------+\n", | |
| "\n", | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "|sepal_length|sepal_width|petal_length|petal_width|species|\n", | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "| 5.1| 3.5| 1.4| 0.2| setosa|\n", | |
| "| 4.9| 3.0| 1.4| 0.2| setosa|\n", | |
| "| 4.7| 3.2| 1.3| 0.2| setosa|\n", | |
| "| 4.6| 3.1| 1.5| 0.2| setosa|\n", | |
| "| 5.0| 3.6| 1.4| 0.2| setosa|\n", | |
| "| 5.4| 3.9| 1.7| 0.4| setosa|\n", | |
| "| 4.6| 3.4| 1.4| 0.3| setosa|\n", | |
| "| 5.0| 3.4| 1.5| 0.2| setosa|\n", | |
| "| 4.4| 2.9| 1.4| 0.2| setosa|\n", | |
| "| 4.9| 3.1| 1.5| 0.1| setosa|\n", | |
| "| 5.4| 3.7| 1.5| 0.2| setosa|\n", | |
| "| 4.8| 3.4| 1.6| 0.2| setosa|\n", | |
| "| 4.8| 3.0| 1.4| 0.1| setosa|\n", | |
| "| 4.3| 3.0| 1.1| 0.1| setosa|\n", | |
| "| 5.8| 4.0| 1.2| 0.2| setosa|\n", | |
| "| 5.7| 4.4| 1.5| 0.4| setosa|\n", | |
| "| 5.4| 3.9| 1.3| 0.4| setosa|\n", | |
| "| 5.1| 3.5| 1.4| 0.3| setosa|\n", | |
| "| 5.7| 3.8| 1.7| 0.3| setosa|\n", | |
| "| 5.1| 3.8| 1.5| 0.3| setosa|\n", | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "only showing top 20 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# SQLでクエリを書くことも可能\n", | |
| "df_iris.createOrReplaceTempView('iris')\n", | |
| "\n", | |
| "spark.sql(\"show tables\").show()\n", | |
| "\n", | |
| "tmp = spark.sql(\"SELECT * FROM iris WHERE species = 'setosa'\")\n", | |
| "tmp.show()" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "37f513cd", | |
| "metadata": {}, | |
| "source": [ | |
| "詳細は[User Guide](https://spark.apache.org/docs/3.3.2/api/python/user_guide/index.html), [API Reference](https://spark.apache.org/docs/3.3.2/api/python/reference/index.html),\n", | |
| "および[Spark SQLのリファレンス](https://spark.apache.org/docs/3.3.2/api/sql/index.html)を参照" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "2ffc3fa2", | |
| "metadata": {}, | |
| "source": [ | |
| "次に、データの[Read/Write](https://spark.apache.org/docs/latest/sql-data-sources.html)を確認する" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "e2ee6092", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "pandas.core.frame.DataFrame" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "|sepal_length|sepal_width|petal_length|petal_width|species|\n", | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "| 5.1| 3.5| 1.4| 0.2| setosa|\n", | |
| "| 4.9| 3.0| 1.4| 0.2| setosa|\n", | |
| "| 4.7| 3.2| 1.3| 0.2| setosa|\n", | |
| "| 4.6| 3.1| 1.5| 0.2| setosa|\n", | |
| "| 5.0| 3.6| 1.4| 0.2| setosa|\n", | |
| "| 5.4| 3.9| 1.7| 0.4| setosa|\n", | |
| "| 4.6| 3.4| 1.4| 0.3| setosa|\n", | |
| "| 5.0| 3.4| 1.5| 0.2| setosa|\n", | |
| "| 4.4| 2.9| 1.4| 0.2| setosa|\n", | |
| "| 4.9| 3.1| 1.5| 0.1| setosa|\n", | |
| "| 5.4| 3.7| 1.5| 0.2| setosa|\n", | |
| "| 4.8| 3.4| 1.6| 0.2| setosa|\n", | |
| "| 4.8| 3.0| 1.4| 0.1| setosa|\n", | |
| "| 4.3| 3.0| 1.1| 0.1| setosa|\n", | |
| "| 5.8| 4.0| 1.2| 0.2| setosa|\n", | |
| "| 5.7| 4.4| 1.5| 0.4| setosa|\n", | |
| "| 5.4| 3.9| 1.3| 0.4| setosa|\n", | |
| "| 5.1| 3.5| 1.4| 0.3| setosa|\n", | |
| "| 5.7| 3.8| 1.7| 0.3| setosa|\n", | |
| "| 5.1| 3.8| 1.5| 0.3| setosa|\n", | |
| "+------------+-----------+------------+-----------+-------+\n", | |
| "only showing top 20 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# さっきは便宜上csvから読み込んだが、型情報が失われる + pandasとの連携を見るため、pandasのデータフレームから直接pysparkに変換する\n", | |
| "\n", | |
| "display(iris.__class__) # irisはpandas dataframe\n", | |
| "df = spark.createDataFrame(iris)\n", | |
| "\n", | |
| "display(df)\n", | |
| "\n", | |
| "df.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "id": "0dc198ac", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "pyspark.sql.dataframe.DataFrame" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>sepal_length</th>\n", | |
| " <th>sepal_width</th>\n", | |
| " <th>petal_length</th>\n", | |
| " <th>petal_width</th>\n", | |
| " <th>species</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>5.1</td>\n", | |
| " <td>3.5</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>4.9</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>4.7</td>\n", | |
| " <td>3.2</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>4.6</td>\n", | |
| " <td>3.1</td>\n", | |
| " <td>1.5</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.6</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>145</th>\n", | |
| " <td>6.7</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>5.2</td>\n", | |
| " <td>2.3</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>146</th>\n", | |
| " <td>6.3</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>5.0</td>\n", | |
| " <td>1.9</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>147</th>\n", | |
| " <td>6.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>5.2</td>\n", | |
| " <td>2.0</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>148</th>\n", | |
| " <td>6.2</td>\n", | |
| " <td>3.4</td>\n", | |
| " <td>5.4</td>\n", | |
| " <td>2.3</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>149</th>\n", | |
| " <td>5.9</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>5.1</td>\n", | |
| " <td>1.8</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>150 rows × 5 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " sepal_length sepal_width petal_length petal_width species\n", | |
| "0 5.1 3.5 1.4 0.2 setosa\n", | |
| "1 4.9 3.0 1.4 0.2 setosa\n", | |
| "2 4.7 3.2 1.3 0.2 setosa\n", | |
| "3 4.6 3.1 1.5 0.2 setosa\n", | |
| "4 5.0 3.6 1.4 0.2 setosa\n", | |
| ".. ... ... ... ... ...\n", | |
| "145 6.7 3.0 5.2 2.3 virginica\n", | |
| "146 6.3 2.5 5.0 1.9 virginica\n", | |
| "147 6.5 3.0 5.2 2.0 virginica\n", | |
| "148 6.2 3.4 5.4 2.3 virginica\n", | |
| "149 5.9 3.0 5.1 1.8 virginica\n", | |
| "\n", | |
| "[150 rows x 5 columns]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "# spark dataframeからpandas dataframeへの変換\n", | |
| "\n", | |
| "display(df.__class__) # pyspark.sql.dataframe.DataFrame\n", | |
| "\n", | |
| "pdf = df.toPandas()\n", | |
| "display(pdf)" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "ee45272f", | |
| "metadata": {}, | |
| "source": [ | |
| "↑pysparkで大規模~中規模くらいのデータを処理してサイズダウンし、オンメモリに乗るくらいになったら`toPandas`をして扱いやすいpandasに変換する、という使い方が出来る。" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "9d02c4e4", | |
| "metadata": {}, | |
| "source": [ | |
| "最後に色々な形式でデータの書き込みと読み込みを行う\n", | |
| "\n", | |
| "先述の通り、sparkはデータ保存形式が処理パフォーマンスにダイレクトに効いてくるため、この辺りの取り扱いは肝になってくる。" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "id": "661fc7b0", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "_SUCCESS\n", | |
| "part-00000-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
| "part-00001-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
| "part-00002-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
| "part-00003-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
| "part-00004-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
| "part-00005-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
| "part-00006-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n", | |
| "part-00007-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# dfはspark dataframe\n", | |
| "\n", | |
| "# csv.gz\n", | |
| "df.write.save(\"testdata/iris_csv\", format=\"csv\")\n", | |
| "\n", | |
| "# formatオプションを使う代わりに、↓のように書いてもOK\n", | |
| "# df.write.csv(\"testdata/iris_csv\", compression=\"gzip\")\n", | |
| "\n", | |
| "!ls testdata/iris_csv" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "id": "a0508de1", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "5.1,3.5,1.4,0.2,setosa\n", | |
| "4.9,3.0,1.4,0.2,setosa\n", | |
| "4.7,3.2,1.3,0.2,setosa\n", | |
| "4.6,3.1,1.5,0.2,setosa\n", | |
| "5.0,3.6,1.4,0.2,setosa\n", | |
| "5.4,3.9,1.7,0.4,setosa\n", | |
| "4.6,3.4,1.4,0.3,setosa\n", | |
| "5.0,3.4,1.5,0.2,setosa\n", | |
| "4.4,2.9,1.4,0.2,setosa\n", | |
| "4.9,3.1,1.5,0.1,setosa\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!head testdata/iris_csv/part-00000-*.csv" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "5a2cbc62", | |
| "metadata": {}, | |
| "source": [ | |
| "↑pandasのように単一のファイルを指定する感じではなく、保存するディレクトリを指定する。\n", | |
| "(ファイルが分散して生成される)\n", | |
| "\n", | |
| "読み込む際は個々のファイルを指定することもできる(←Spark以外で作ったファイルの読み込みなど)が、\n", | |
| "**基本は保存したディレクトリを指定して読み込む**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "id": "5953ef8e", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+---+---+---+---+----------+\n", | |
| "|_c0|_c1|_c2|_c3| _c4|\n", | |
| "+---+---+---+---+----------+\n", | |
| "|6.3|3.3|4.7|1.6|versicolor|\n", | |
| "|4.9|2.4|3.3|1.0|versicolor|\n", | |
| "|6.6|2.9|4.6|1.3|versicolor|\n", | |
| "+---+---+---+---+----------+\n", | |
| "only showing top 3 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "df_csv = spark.read.load(\"testdata/iris_csv\", format=\"csv\")\n", | |
| "\n", | |
| "# formatを指定せず、↓のようにすることも可能\n", | |
| "# df_csv = spark.read.csv(\"testdata/iris_csv\")\n", | |
| "\n", | |
| "df_csv.show(3)" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "f7c66c47", | |
| "metadata": {}, | |
| "source": [ | |
| "(↑csvだとカラム名などの情報が欠落しがちで使いづらい)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "id": "65039664", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "_SUCCESS\n", | |
| "part-00000-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
| "part-00001-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
| "part-00002-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
| "part-00003-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
| "part-00004-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
| "part-00005-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
| "part-00006-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n", | |
| "part-00007-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# テキスト形式だとcsvの他にjsonなども指定可能\n", | |
| "# compressionで圧縮形式を指定することも出来る\n", | |
| "\n", | |
| "df.write.json(\"testdata/iris_json\", compression='gzip')\n", | |
| "\n", | |
| "!ls testdata/iris_json" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "id": "6245160d", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "DataFrame[petal_length: double, petal_width: double, sepal_length: double, sepal_width: double, species: string]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "|petal_length|petal_width|sepal_length|sepal_width| species|\n", | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "| 3.3| 1.0| 5.0| 2.3|versicolor|\n", | |
| "| 4.2| 1.3| 5.6| 2.7|versicolor|\n", | |
| "| 4.2| 1.2| 5.7| 3.0|versicolor|\n", | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "only showing top 3 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# 読み込みも同様\n", | |
| "\n", | |
| "df_json = spark.read.json(\"testdata/iris_json\")\n", | |
| "# or df_json = spark.read.load(\"testdata/iris_json\", format=\"json\")\n", | |
| "\n", | |
| "display(df_json)\n", | |
| "df_json.show(3)" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "04a92179", | |
| "metadata": {}, | |
| "source": [ | |
| "(↑jsonだとcsvよりは型情報が保持される)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "id": "3151765f", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "23/04/25 00:13:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory\n", | |
| "Scaling row group sizes to 95.00% for 8 writers\n", | |
| " \r" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "_SUCCESS\n", | |
| "part-00000-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "part-00001-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "part-00002-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "part-00003-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "part-00004-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "part-00005-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "part-00006-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "part-00007-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet\n", | |
| "_SUCCESS\n", | |
| "part-00000-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
| "part-00001-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
| "part-00002-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
| "part-00003-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
| "part-00004-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
| "part-00005-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
| "part-00006-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n", | |
| "part-00007-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# データ分析・処理用途として適しているsnappy.parquetやzlib.orc形式など\n", | |
| "\n", | |
| "df.write.parquet(\"testdata/iris_parquet\", compression=\"snappy\")\n", | |
| "!ls testdata/iris_parquet\n", | |
| "\n", | |
| "df.write.orc(\"testdata/iris_orc\", compression=\"zlib\")\n", | |
| "!ls testdata/iris_orc" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "id": "4cb05940", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "parquet:\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "|sepal_length|sepal_width|petal_length|petal_width| species|\n", | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "| 5.0| 2.3| 3.3| 1.0|versicolor|\n", | |
| "| 5.6| 2.7| 4.2| 1.3|versicolor|\n", | |
| "| 5.7| 3.0| 4.2| 1.2|versicolor|\n", | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "only showing top 3 rows\n", | |
| "\n", | |
| "\n", | |
| "------------------\n", | |
| "\n", | |
| "orc:\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "|sepal_length|sepal_width|petal_length|petal_width| species|\n", | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "| 5.0| 2.3| 3.3| 1.0|versicolor|\n", | |
| "| 5.6| 2.7| 4.2| 1.3|versicolor|\n", | |
| "| 5.7| 3.0| 4.2| 1.2|versicolor|\n", | |
| "+------------+-----------+------------+-----------+----------+\n", | |
| "only showing top 3 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# 読み込み\n", | |
| "\n", | |
| "# parquet\n", | |
| "print(\"parquet:\")\n", | |
| "df_parquet = spark.read.parquet(\"testdata/iris_parquet\")\n", | |
| "display(df_parquet)\n", | |
| "df_parquet.show(3)\n", | |
| "\n", | |
| "print(\"\\n------------------\\n\")\n", | |
| "\n", | |
| "# orc\n", | |
| "print(\"orc:\")\n", | |
| "df_orc = spark.read.orc(\"testdata/iris_orc\")\n", | |
| "display(df_orc)\n", | |
| "df_orc.show(3)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "id": "e2c0ff8b", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "\u001b[00;38;5;33m.\u001b[0m\n", | |
| "├── _SUCCESS\n", | |
| "├── \u001b[00;38;5;33mspecies=setosa\u001b[0m\n", | |
| "│ ├── part-00000-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "│ ├── part-00001-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "│ └── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "├── \u001b[00;38;5;33mspecies=versicolor\u001b[0m\n", | |
| "│ ├── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "│ ├── part-00003-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "│ ├── part-00004-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "│ └── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "└── \u001b[00;38;5;33mspecies=virginica\u001b[0m\n", | |
| " ├── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| " ├── part-00006-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| " └── part-00007-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet\n", | |
| "\n", | |
| "4 directories, 11 files\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# partitionを切って保存することも可能\n", | |
| "\n", | |
| "df.write.save(\n", | |
| " \"testdata/iris_with_partition\",\n", | |
| " format=\"parquet\",\n", | |
| " compression=\"snappy\",\n", | |
| " partitionBy=\"species\"\n", | |
| " )\n", | |
| "\n", | |
| "!cd testdata/iris_with_partition && tree" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "51fcefd8", | |
| "metadata": {}, | |
| "source": [ | |
| "カラム\"species\"の値ごとに別ディレクトリが切られてデータが保存される\n", | |
| "\n", | |
| "RDBのインデックスと同様、データアクセスの仕方に応じて適切に設定することでパフォーマンスを良くすることが出来る(し、不適切だと悪化する)\n", | |
| "\n", | |
| "↑の例だと、speciesを指定して分析を行うような場合、興味のあるspeciesしか見ないためアクセスするデータ量を限定することができる。\n", | |
| "一方、かけるクエリの多くが複数のspeciesにまたがるような場合はパフォーマンスの向上は見込めず、悪化する可能性もある。" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "id": "4a458cb4", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+------------+-----------+------------+-----------+---------+\n", | |
| "|sepal_length|sepal_width|petal_length|petal_width| species|\n", | |
| "+------------+-----------+------------+-----------+---------+\n", | |
| "| 7.9| 3.8| 6.4| 2.0|virginica|\n", | |
| "| 6.4| 2.8| 5.6| 2.2|virginica|\n", | |
| "| 6.3| 2.8| 5.1| 1.5|virginica|\n", | |
| "+------------+-----------+------------+-----------+---------+\n", | |
| "only showing top 3 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# パーティションがある場合でも読み込み方法は同様\n", | |
| "\n", | |
| "df_partition = spark.read.load(\"testdata/iris_with_partition\")\n", | |
| "\n", | |
| "display(df_partition)\n", | |
| "df_partition.show(3)" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "376cf4e6", | |
| "metadata": {}, | |
| "source": [ | |
| "他にも[saveAsTable](https://spark.apache.org/docs/3.2.0/api/python/reference/api/pyspark.sql.DataFrameWriter.saveAsTable.html)で永続化したテーブルとして書き込みを行ったりも出来る。\n", | |
| "\n", | |
| "扱えるデータ保存形式も[Data Sources](https://spark.apache.org/docs/latest/sql-data-sources.html)のように、\n", | |
| "他にもRDB(JDBCを使う)や、Hive Tableなど色々なものに対応している。" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "id": "77eb272b", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", | |
| " warnings.warn(\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import pyspark.pandas as ps" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "7990e97a", | |
| "metadata": {}, | |
| "source": [ | |
| "## (補足)pysparkにおけるpandas API\n", | |
| "\n", | |
| "参考: https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/index.html\n", | |
| "\n", | |
| "最近はかなりpandasに近いAPIを使ってsparkを扱う方法が整備されつつある様子\n", | |
| "\n", | |
| "なお、[Koalas](https://github.com/databricks/koalas)という、sparkをpandasっぽく使えるツールが存在するが、\n", | |
| "[ここ](https://learn.microsoft.com/ja-jp/azure/databricks/archive/legacy/koalas)などを見るとこれが正式にpysparkに取り込まれた?様子。" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "id": "27553982", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: If `index_col` is not specified for `read_csv`, the default index is attached which can cause additional overhead.\n", | |
| " warnings.warn(message, PandasAPIOnSparkAdviceWarning)\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "pyspark.pandas.frame.DataFrame" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>sepal_length</th>\n", | |
| " <th>sepal_width</th>\n", | |
| " <th>petal_length</th>\n", | |
| " <th>petal_width</th>\n", | |
| " <th>species</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>5.1</td>\n", | |
| " <td>3.5</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>4.9</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>4.7</td>\n", | |
| " <td>3.2</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>4.6</td>\n", | |
| " <td>3.1</td>\n", | |
| " <td>1.5</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.6</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " sepal_length sepal_width petal_length petal_width species\n", | |
| "0 5.1 3.5 1.4 0.2 setosa\n", | |
| "1 4.9 3.0 1.4 0.2 setosa\n", | |
| "2 4.7 3.2 1.3 0.2 setosa\n", | |
| "3 4.6 3.1 1.5 0.2 setosa\n", | |
| "4 5.0 3.6 1.4 0.2 setosa" | |
| ] | |
| }, | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "import pyspark.pandas as ps\n", | |
| "\n", | |
| "# pandas on spark DataFrameを作成する\n", | |
| "psdf = ps.read_csv(\"testdata/iris.csv\")\n", | |
| "\n", | |
| "display(psdf.__class__) # pyspark.pandas.frame.DataFrame\n", | |
| "\n", | |
| "psdf.head(5) # pandas DataFrameのようにheadで先頭を表示できる" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "id": "5dda2325", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>sepal_length</th>\n", | |
| " <th>sepal_width</th>\n", | |
| " <th>petal_length</th>\n", | |
| " <th>petal_width</th>\n", | |
| " <th>species</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>105</th>\n", | |
| " <td>7.6</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>6.6</td>\n", | |
| " <td>2.1</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>117</th>\n", | |
| " <td>7.7</td>\n", | |
| " <td>3.8</td>\n", | |
| " <td>6.7</td>\n", | |
| " <td>2.2</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>118</th>\n", | |
| " <td>7.7</td>\n", | |
| " <td>2.6</td>\n", | |
| " <td>6.9</td>\n", | |
| " <td>2.3</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>122</th>\n", | |
| " <td>7.7</td>\n", | |
| " <td>2.8</td>\n", | |
| " <td>6.7</td>\n", | |
| " <td>2.0</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>131</th>\n", | |
| " <td>7.9</td>\n", | |
| " <td>3.8</td>\n", | |
| " <td>6.4</td>\n", | |
| " <td>2.0</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>135</th>\n", | |
| " <td>7.7</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>6.1</td>\n", | |
| " <td>2.3</td>\n", | |
| " <td>virginica</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " sepal_length sepal_width petal_length petal_width species\n", | |
| "105 7.6 3.0 6.6 2.1 virginica\n", | |
| "117 7.7 3.8 6.7 2.2 virginica\n", | |
| "118 7.7 2.6 6.9 2.3 virginica\n", | |
| "122 7.7 2.8 6.7 2.0 virginica\n", | |
| "131 7.9 3.8 6.4 2.0 virginica\n", | |
| "135 7.7 3.0 6.1 2.3 virginica" | |
| ] | |
| }, | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# filterや表示も通常のpandasのようにできている\n", | |
| "psdf[psdf['sepal_length'] > 7.5]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "id": "a33ed92b", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "+-----+------------+-----------+------------+-----------+-------+\n", | |
| "|index|sepal_length|sepal_width|petal_length|petal_width|species|\n", | |
| "+-----+------------+-----------+------------+-----------+-------+\n", | |
| "| 0| 5.1| 3.5| 1.4| 0.2| setosa|\n", | |
| "| 1| 4.9| 3.0| 1.4| 0.2| setosa|\n", | |
| "| 2| 4.7| 3.2| 1.3| 0.2| setosa|\n", | |
| "| 3| 4.6| 3.1| 1.5| 0.2| setosa|\n", | |
| "| 4| 5.0| 3.6| 1.4| 0.2| setosa|\n", | |
| "+-----+------------+-----------+------------+-----------+-------+\n", | |
| "only showing top 5 rows\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# pysparkのDataFrameに変換\n", | |
| "\n", | |
| "sdf = psdf.to_spark(index_col=[\"index\"])\n", | |
| "\n", | |
| "sdf.show(5)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "id": "69a33023", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: `to_pandas` loads all data into the driver's memory. It should only be used if the resulting pandas DataFrame is expected to be small.\n", | |
| " warnings.warn(message, PandasAPIOnSparkAdviceWarning)\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "pandas.core.frame.DataFrame" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>sepal_length</th>\n", | |
| " <th>sepal_width</th>\n", | |
| " <th>petal_length</th>\n", | |
| " <th>petal_width</th>\n", | |
| " <th>species</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>5.1</td>\n", | |
| " <td>3.5</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>4.9</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>4.7</td>\n", | |
| " <td>3.2</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>4.6</td>\n", | |
| " <td>3.1</td>\n", | |
| " <td>1.5</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.6</td>\n", | |
| " <td>1.4</td>\n", | |
| " <td>0.2</td>\n", | |
| " <td>setosa</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " sepal_length sepal_width petal_length petal_width species\n", | |
| "0 5.1 3.5 1.4 0.2 setosa\n", | |
| "1 4.9 3.0 1.4 0.2 setosa\n", | |
| "2 4.7 3.2 1.3 0.2 setosa\n", | |
| "3 4.6 3.1 1.5 0.2 setosa\n", | |
| "4 5.0 3.6 1.4 0.2 setosa" | |
| ] | |
| }, | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# 普通のpandasに変換\n", | |
| "pdf = psdf.to_pandas()\n", | |
| "\n", | |
| "display(pdf.__class__) # pandas.core.frame.DataFrame\n", | |
| "\n", | |
| "pdf.head()" | |
| ] | |
| }, | |
| { | |
| "attachments": {}, | |
| "cell_type": "markdown", | |
| "id": "cfb09aa9", | |
| "metadata": {}, | |
| "source": [ | |
| "## (参考) [dask](https://www.dask.org/)\n", | |
| "\n", | |
| "pandasおよびnumpyと互換性および連携性の高いAPIを持ち、遅延評価・並列分散処理を行うことが出来るライブラリ。\n", | |
| "\n", | |
| "例えばデータサイズが大きいテーブルデータに対してはdaskのデータフレームで遅延評価・並列分散処理を行っておき、\n", | |
| "データサイズが小さくなるタイミングでpandasのデータフレームに変換する、といった今回pysparkでやろうとしていることとほぼ同じことが出来る。\n", | |
| "\n", | |
| "pysparkはdaskに比べると、\n", | |
| "\n", | |
| "- クラスターなどを組める分、より本格的なビッグデータにも対応出来る\n", | |
| " - (今回紹介するようなやり方で、ライトに使うことも可能)\n", | |
| "- Javaをバックエンドに動くことに加え、本格的に動かす場合はクラスター構築などが必要になるため、そのレベルでやると動作環境の構築のハードルは高い\n", | |
| "- pandas、numpyとの親和性はdaskの方が~~高い~~高かった\n", | |
| " - daskは遅延評価される以外は同じメソッドをそのまま使えるケースが多い\n", | |
| " - sparkはむしろSQLと互換性がある\n", | |
| " - ただ、先述の通りpandas APIが整備されつつあるので、pysparkもある程度pandasの使い方そのままで使えるようになっている模様" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.11.3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
pandas and similar tools work well for processing and analyzing small-to-medium datasets, but they cannot handle data that does not fit in memory. In addition, because of the GIL they basically run on a single thread, so it is hard to make full use of the machine's hardware.
PySpark uses lazy evaluation and parallel, distributed processing, so it can be used to process and analyze tabular data at so-called big-data scale, and it also interoperates with pandas; this article introduces it for that reason.
For details, also see the PySpark Documentation.
A Linux environment with Mambaforge already installed is assumed.
Prepare a YAML file like the following for creating the conda virtual environment:
# pyspark_env.yaml
name: pyspark-intro
channels:
- conda-forge
dependencies:
- python=3.11
- pyspark>=3.4
- openjdk=17
- pandas>=2.0.1
- pyarrow
# for testdata
- numpy
- seaborn
# jupyter
- jupyterlab

※ pandas 2.0 was released recently, but because the deprecated iteritems was removed, part of the conversion from pandas to a PySpark DataFrame stopped working (which temporarily meant staying on the 1.5 series). This has been fixed in pyspark>=3.4.0, so pandas 2.x is used here.
Create the virtual environment with the mamba command (conda's dependency resolution is slow, so mamba is recommended):
mamba env create -f pyspark_env.yaml
# activate the created environment (pyspark-intro)
conda activate pyspark-intro
# or `mamba activate pyspark-intro`

First, let's briefly go over basic usage.
Details of the data-processing side are omitted here, but Spark's DataFrame functionality corresponds to Spark SQL, so you can roughly assume that anything you can do in SQL can be done here (in fact, you can also just write SQL).
# set up the directory for the test data (calling Linux commands from the notebook)
!rm -rf testdata && mkdir testdata
from pyspark.sql.session import SparkSession
# start the Spark session
spark = (SparkSession.builder
.master("local[*]")
.appName("LocalTest")
.config("spark.sql.execution.arrow.pyspark.enabled", "true")
.config("spark.executor.memory", "4g") # TMP (実際はハードコートはあまり良くない)
.getOrCreate()
)
spark
your 131072x1 screen size is bogus. expect trouble
23/04/25 00:12:59 WARN Utils: Your hostname, XXXXXXX-XXXXXXX resolves to a loopback address: 127.0.1.1; using 172.24.210.1 instead (on interface eth0)
23/04/25 00:12:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/25 00:12:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
SparkSession - in-memory
SparkContext
Spark UI: http://172.24.210.1:4040
Version: v3.4.0
Master: local[*]
AppName: LocalTest
↑ Regarding the config part: the settings are tuned for interoperability with pandas (pyarrow needs to be installed).
Reference: https://learn.microsoft.com/ja-jp/azure/databricks/pandas/pyspark-pandas-conversion
Detailed, environment-dependent settings are better kept in a separate config file ($SPARK_HOME/conf/spark-defaults.conf) rather than in code.
Configuration details: https://spark.apache.org/docs/latest/configuration.html
As for the SPARK_HOME environment variable, which is needed for things like locating the config file, see here for how to set it when Spark was installed via conda.
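As a minimal sketch (an addition for illustration, not part of the original article), assuming that a conda/pip install keeps the Spark distribution inside the pyspark package directory, the path to point SPARK_HOME at can be located like this:

# Sketch: locate the Spark distribution bundled with the conda-installed pyspark.
# Assumption: the distribution lives inside the pyspark package directory.
import os
import pyspark

spark_home = os.path.dirname(pyspark.__file__)
print(spark_home)
# then e.g. export SPARK_HOME=<printed path> so that
# $SPARK_HOME/conf/spark-defaults.conf is picked up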
Once the Spark session is up, the Spark UI is available at http://localhost:4040 by default, where you can monitor execution status (also handy for debugging).
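As a small aside (not in the original notebook), the actual UI address can also be read off the running session, which helps when the default port 4040 is already in use:

# Print the Spark UI URL of the running session.
# If port 4040 is busy, Spark falls back to 4041, 4042, ...
print(spark.sparkContext.uiWebUrl)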
# create the test data
import seaborn as sns
iris = sns.load_dataset("iris")
display(iris)
iris.to_csv("testdata/iris.csv", index=False).dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
# read the csv with pyspark
df_iris = spark.read.csv(
"testdata/iris.csv",
header=True,
inferSchema=True,
)
# because of lazy evaluation, only the column schema is shown
display(df_iris)
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
(↑ Since we are reading from csv, options are passed to indicate whether a header exists and to infer the schema automatically.)
※ (Supplementary note):
# evaluation only actually happens with an action such as show
df_iris.show(5)  # show 5 rows
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
| 5.1| 3.5| 1.4| 0.2| setosa|
| 4.9| 3.0| 1.4| 0.2| setosa|
| 4.7| 3.2| 1.3| 0.2| setosa|
| 4.6| 3.1| 1.5| 0.2| setosa|
| 5.0| 3.6| 1.4| 0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows
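To make the lazy-evaluation point concrete, here is a small sketch (added for illustration, not an original cell) that builds a transformation without executing it and inspects the plan Spark would run:

# No data is read or filtered yet: this only builds a query plan.
lazy_query = df_iris.filter(df_iris.sepal_length > 7.5)

# explain() prints the physical plan; execution happens only on an
# action such as show(), count(), or collect().
lazy_query.explain()
lazy_query.count()  # this triggers the actual computation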
# choose columns with select
(df_iris
.select(["sepal_length", "sepal_width"])
.show(3)
)
+------------+-----------+
|sepal_length|sepal_width|
+------------+-----------+
| 5.1| 3.5|
| 4.9| 3.0|
| 4.7| 3.2|
+------------+-----------+
only showing top 3 rows
# simple queries with filter
(df_iris
.filter(df_iris.species == "virginica")
.filter(df_iris.sepal_length > 5.0)
.show(3)
)
+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+---------+
| 6.3| 3.3| 6.0| 2.5|virginica|
| 5.8| 2.7| 5.1| 1.9|virginica|
| 7.1| 3.0| 5.9| 2.1|virginica|
+------------+-----------+------------+-----------+---------+
only showing top 3 rows
# aggregation with groupBy (1)
(df_iris
.groupBy('species')
.count()
.show()
)
+----------+-----+
| species|count|
+----------+-----+
| virginica| 50|
|versicolor| 50|
| setosa| 50|
+----------+-----+
# aggregation with groupBy (2)
(df_iris
.groupBy('species')
.agg({
'sepal_length': 'mean',
'sepal_width': 'mean',
'petal_length': 'mean',
'petal_width': 'mean',
})
.show()
)
+----------+------------------+------------------+-----------------+------------------+
| species| avg(sepal_width)| avg(petal_width)|avg(sepal_length)| avg(petal_length)|
+----------+------------------+------------------+-----------------+------------------+
| virginica|2.9739999999999998| 2.026|6.587999999999998| 5.552|
|versicolor|2.7700000000000005|1.3259999999999998| 5.936| 4.26|
| setosa| 3.428000000000001|0.2459999999999999|5.005999999999999|1.4620000000000002|
+----------+------------------+------------------+-----------------+------------------+
# DataFrame summary
df_iris.describe().show()
23/04/25 00:13:09 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+-------+------------------+-------------------+------------------+------------------+---------+
|summary| sepal_length| sepal_width| petal_length| petal_width| species|
+-------+------------------+-------------------+------------------+------------------+---------+
| count| 150| 150| 150| 150| 150|
| mean| 5.843333333333335| 3.057333333333334|3.7580000000000027| 1.199333333333334| null|
| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467| null|
| min| 4.3| 2.0| 1.0| 0.1| setosa|
| max| 7.9| 4.4| 6.9| 2.5|virginica|
+-------+------------------+-------------------+------------------+------------------+---------+
# queries can also be written in SQL
df_iris.createOrReplaceTempView('iris')
spark.sql("show tables").show()
tmp = spark.sql("SELECT * FROM iris WHERE species = 'setosa'")
tmp.show()
+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
| | iris| true|
+---------+---------+-----------+
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
| 5.1| 3.5| 1.4| 0.2| setosa|
| 4.9| 3.0| 1.4| 0.2| setosa|
| 4.7| 3.2| 1.3| 0.2| setosa|
| 4.6| 3.1| 1.5| 0.2| setosa|
| 5.0| 3.6| 1.4| 0.2| setosa|
| 5.4| 3.9| 1.7| 0.4| setosa|
| 4.6| 3.4| 1.4| 0.3| setosa|
| 5.0| 3.4| 1.5| 0.2| setosa|
| 4.4| 2.9| 1.4| 0.2| setosa|
| 4.9| 3.1| 1.5| 0.1| setosa|
| 5.4| 3.7| 1.5| 0.2| setosa|
| 4.8| 3.4| 1.6| 0.2| setosa|
| 4.8| 3.0| 1.4| 0.1| setosa|
| 4.3| 3.0| 1.1| 0.1| setosa|
| 5.8| 4.0| 1.2| 0.2| setosa|
| 5.7| 4.4| 1.5| 0.4| setosa|
| 5.4| 3.9| 1.3| 0.4| setosa|
| 5.1| 3.5| 1.4| 0.3| setosa|
| 5.7| 3.8| 1.7| 0.3| setosa|
| 5.1| 3.8| 1.5| 0.3| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 20 rows
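Since the temp view is registered, aggregations like the earlier groupBy examples can equally be expressed in SQL; a small illustrative sketch (not an original cell):

# Equivalent of the groupBy/avg example, written against the temp view.
spark.sql("""
    SELECT species,
           AVG(sepal_length) AS avg_sepal_length,
           AVG(petal_length) AS avg_petal_length
    FROM iris
    GROUP BY species
""").show()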
For details, see the User Guide, API Reference, and the Spark SQL reference.
Next, let's look at reading and writing data.
# earlier we read from csv for convenience, but type information gets lost + to see the pandas interop, convert a pandas DataFrame directly to pyspark
display(iris.__class__)  # iris is a pandas dataframe
df = spark.createDataFrame(iris)
display(df)
df.show()
pandas.core.frame.DataFrame
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
| 5.1| 3.5| 1.4| 0.2| setosa|
| 4.9| 3.0| 1.4| 0.2| setosa|
| 4.7| 3.2| 1.3| 0.2| setosa|
| 4.6| 3.1| 1.5| 0.2| setosa|
| 5.0| 3.6| 1.4| 0.2| setosa|
| 5.4| 3.9| 1.7| 0.4| setosa|
| 4.6| 3.4| 1.4| 0.3| setosa|
| 5.0| 3.4| 1.5| 0.2| setosa|
| 4.4| 2.9| 1.4| 0.2| setosa|
| 4.9| 3.1| 1.5| 0.1| setosa|
| 5.4| 3.7| 1.5| 0.2| setosa|
| 4.8| 3.4| 1.6| 0.2| setosa|
| 4.8| 3.0| 1.4| 0.1| setosa|
| 4.3| 3.0| 1.1| 0.1| setosa|
| 5.8| 4.0| 1.2| 0.2| setosa|
| 5.7| 4.4| 1.5| 0.4| setosa|
| 5.4| 3.9| 1.3| 0.4| setosa|
| 5.1| 3.5| 1.4| 0.3| setosa|
| 5.7| 3.8| 1.7| 0.3| setosa|
| 5.1| 3.8| 1.5| 0.3| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 20 rows
# convert from a spark dataframe to a pandas dataframe
display(df.__class__) # pyspark.sql.dataframe.DataFrame
pdf = df.toPandas()
display(pdf)
pyspark.sql.dataframe.DataFrame
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
↑ This enables the workflow of processing large-to-medium data with pyspark to shrink it, then calling toPandas once it fits in memory to convert it into an easier-to-handle pandas DataFrame.
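A concrete sketch of that workflow (illustrative only; the aggregation chosen here is arbitrary): reduce the data on the Spark side first, then pull only the small result into pandas.

from pyspark.sql import functions as F

# The heavy lifting stays in Spark; only the 3-row summary crosses into
# driver memory as a pandas DataFrame.
summary_pdf = (
    df.groupBy("species")
      .agg(F.avg("petal_width").alias("avg_petal_width"))
      .toPandas()
)
print(type(summary_pdf))  # pandas.core.frame.DataFrame
print(summary_pdf)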
Finally, let's write and read data in various formats.
As mentioned earlier, the storage format directly affects Spark's processing performance, so handling it well is key.
# dfはspark dataframe
# csv (csv.gz if compression is specified)
df.write.save("testdata/iris_csv", format="csv")
# instead of the format option, you can also write it like below
# df.write.csv("testdata/iris_csv", compression="gzip")
!ls testdata/iris_csv
_SUCCESS
part-00000-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00001-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00002-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00003-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00004-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00005-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00006-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00007-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
!head testdata/iris_csv/part-00000-*.csv
5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5.0,3.6,1.4,0.2,setosa
5.4,3.9,1.7,0.4,setosa
4.6,3.4,1.4,0.3,setosa
5.0,3.4,1.5,0.2,setosa
4.4,2.9,1.4,0.2,setosa
4.9,3.1,1.5,0.1,setosa
↑ Unlike pandas, you do not point at a single file; you specify a directory to save into (the files are generated in a distributed fashion).
When reading, you can also point at individual files (e.g. to read files produced outside Spark), but normally you read by specifying the directory you saved to.
df_csv = spark.read.load("testdata/iris_csv", format="csv")
# instead of specifying format, you can also write it like below
# df_csv = spark.read.csv("testdata/iris_csv")
df_csv.show(3)
+---+---+---+---+----------+
|_c0|_c1|_c2|_c3| _c4|
+---+---+---+---+----------+
|6.3|3.3|4.7|1.6|versicolor|
|4.9|2.4|3.3|1.0|versicolor|
|6.6|2.9|4.6|1.3|versicolor|
+---+---+---+---+----------+
only showing top 3 rows
(↑ With csv, information such as column names tends to be lost, which makes it awkward to use.)
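One way around that, sketched here as an aside (the schema below is written out by hand; it is not defined anywhere in the notebook), is to pass an explicit schema when re-reading the headerless CSV directory:

from pyspark.sql.types import DoubleType, StringType, StructField, StructType

# Hand-written schema matching the iris columns; keeps names and types
# even though the written CSV parts carry no header row.
iris_schema = StructType([
    StructField("sepal_length", DoubleType()),
    StructField("sepal_width", DoubleType()),
    StructField("petal_length", DoubleType()),
    StructField("petal_width", DoubleType()),
    StructField("species", StringType()),
])

df_csv_typed = spark.read.csv("testdata/iris_csv", schema=iris_schema)
df_csv_typed.show(3)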
# for text formats, json etc. can be specified in addition to csv
# the compression format can be specified with compression
df.write.json("testdata/iris_json", compression='gzip')
!ls testdata/iris_json
_SUCCESS
part-00000-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00001-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00002-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00003-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00004-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00005-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00006-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00007-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
# reading works the same way
df_json = spark.read.json("testdata/iris_json")
# or df_json = spark.read.load("testdata/iris_json", format="json")
display(df_json)
df_json.show(3)
DataFrame[petal_length: double, petal_width: double, sepal_length: double, sepal_width: double, species: string]
+------------+-----------+------------+-----------+----------+
|petal_length|petal_width|sepal_length|sepal_width| species|
+------------+-----------+------------+-----------+----------+
| 3.3| 1.0| 5.0| 2.3|versicolor|
| 4.2| 1.3| 5.6| 2.7|versicolor|
| 4.2| 1.2| 5.7| 3.0|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows
(↑ With json, more type information is retained than with csv.)
# formats such as snappy.parquet and zlib.orc are well suited to data analysis/processing
df.write.parquet("testdata/iris_parquet", compression="snappy")
!ls testdata/iris_parquet
df.write.orc("testdata/iris_orc", compression="zlib")
!ls testdata/iris_orc
23/04/25 00:13:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
_SUCCESS
part-00000-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00001-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00002-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00003-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00004-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00005-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00006-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00007-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
_SUCCESS
part-00000-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00001-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00002-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00003-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00004-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00005-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00006-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00007-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
# reading
# parquet
print("parquet:")
df_parquet = spark.read.parquet("testdata/iris_parquet")
display(df_parquet)
df_parquet.show(3)
print("\n------------------\n")
# orc
print("orc:")
df_orc = spark.read.orc("testdata/iris_orc")
display(df_orc)
df_orc.show(3)
parquet:
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+----------+
| 5.0| 2.3| 3.3| 1.0|versicolor|
| 5.6| 2.7| 4.2| 1.3|versicolor|
| 5.7| 3.0| 4.2| 1.2|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows
------------------
orc:
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+----------+
| 5.0| 2.3| 3.3| 1.0|versicolor|
| 5.6| 2.7| 4.2| 1.3|versicolor|
| 5.7| 3.0| 4.2| 1.2|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows
# data can also be saved with partitions
df.write.save(
"testdata/iris_with_partition",
format="parquet",
compression="snappy",
partitionBy="species"
)
!cd testdata/iris_with_partition && tree
.
├── _SUCCESS
├── species=setosa
│   ├── part-00000-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   ├── part-00001-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   └── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
├── species=versicolor
│   ├── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   ├── part-00003-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   ├── part-00004-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   └── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
└── species=virginica
    ├── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
    ├── part-00006-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
    └── part-00007-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
4 directories, 11 files
カラム"species"の値ごとに別ディレクトリが切られてデータが保存される
As with indexes in an RDB, setting partitions appropriately for how the data is accessed can improve performance (and inappropriate partitioning can make it worse).
In the example above, if your analysis targets specific species, only the species of interest are read, so the amount of data accessed is limited. On the other hand, if most of your queries span multiple species, no performance gain can be expected and things may even get worse.
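To see that benefit in action, a short sketch (added for illustration): filtering on the partition column lets Spark prune to the matching species= directory instead of scanning everything.

from pyspark.sql import functions as F

# Only the species=setosa sub-directory should need to be read.
setosa_only = (
    spark.read.parquet("testdata/iris_with_partition")
         .filter(F.col("species") == "setosa")
)
setosa_only.explain()  # the scan node should list species under PartitionFilters
setosa_only.show(3)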
# reading works the same way even when partitions are present
df_partition = spark.read.load("testdata/iris_with_partition")
display(df_partition)
df_partition.show(3)
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+---------+
| 7.9| 3.8| 6.4| 2.0|virginica|
| 6.4| 2.8| 5.6| 2.2|virginica|
| 6.3| 2.8| 5.1| 1.5|virginica|
+------------+-----------+------------+-----------+---------+
only showing top 3 rows
You can also write data as a persisted table with saveAsTable.
As listed under Data Sources, many other storage backends are supported as well, such as RDBs (via JDBC) and Hive tables.
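A hedged sketch of the JDBC path (none of this is set up in this environment: the URL, table name and credentials below are placeholders, and the matching JDBC driver jar has to be on Spark's classpath, e.g. via spark.jars):

# Hypothetical connection details -- replace with a real database.
jdbc_url = "jdbc:postgresql://localhost:5432/testdb"
jdbc_props = {"user": "user", "password": "password"}

# Write the Spark DataFrame to a table...
df.write.jdbc(url=jdbc_url, table="iris", mode="overwrite", properties=jdbc_props)

# ...and read it back.
df_from_db = spark.read.jdbc(url=jdbc_url, table="iris", properties=jdbc_props)
df_from_db.show(3)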
import pyspark.pandas as ps
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.
warnings.warn(
Reference: https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/index.html
Recently, a way of working with Spark through an API quite close to pandas has been taking shape.
Note that there used to be a separate tool called Koalas for using Spark in a pandas-like way; judging from here and elsewhere, it appears to have been officially merged into pyspark.
import pyspark.pandas as ps
# create a pandas-on-Spark DataFrame
psdf = ps.read_csv("testdata/iris.csv")
display(psdf.__class__) # pyspark.pandas.frame.DataFrame
psdf.head(5)  # like a pandas DataFrame, head shows the first rows
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: If `index_col` is not specified for `read_csv`, the default index is attached which can cause additional overhead.
warnings.warn(message, PandasAPIOnSparkAdviceWarning)
pyspark.pandas.frame.DataFrame
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# filtering and display also work like regular pandas
psdf[psdf['sepal_length'] > 7.5]
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 105 | 7.6 | 3.0 | 6.6 | 2.1 | virginica |
| 117 | 7.7 | 3.8 | 6.7 | 2.2 | virginica |
| 118 | 7.7 | 2.6 | 6.9 | 2.3 | virginica |
| 122 | 7.7 | 2.8 | 6.7 | 2.0 | virginica |
| 131 | 7.9 | 3.8 | 6.4 | 2.0 | virginica |
| 135 | 7.7 | 3.0 | 6.1 | 2.3 | virginica |
# convert to a pyspark DataFrame
sdf = psdf.to_spark(index_col=["index"])
sdf.show(5)
+-----+------------+-----------+------------+-----------+-------+
|index|sepal_length|sepal_width|petal_length|petal_width|species|
+-----+------------+-----------+------------+-----------+-------+
| 0| 5.1| 3.5| 1.4| 0.2| setosa|
| 1| 4.9| 3.0| 1.4| 0.2| setosa|
| 2| 4.7| 3.2| 1.3| 0.2| setosa|
| 3| 4.6| 3.1| 1.5| 0.2| setosa|
| 4| 5.0| 3.6| 1.4| 0.2| setosa|
+-----+------------+-----------+------------+-----------+-------+
only showing top 5 rows
# convert to regular pandas
pdf = psdf.to_pandas()
display(pdf.__class__) # pandas.core.frame.DataFrame
pdf.head()
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: `to_pandas` loads all data into the driver's memory. It should only be used if the resulting pandas DataFrame is expected to be small.
warnings.warn(message, PandasAPIOnSparkAdviceWarning)
pandas.core.frame.DataFrame
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
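Other pandas idioms carry over largely unchanged as well; a small sketch (not an original cell) of a groupby on the pandas-on-Spark DataFrame:

# groupby/mean in the familiar pandas style; the computation itself
# runs on Spark under the hood.
psdf.groupby("species")[["sepal_length", "petal_length"]].mean().head()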
(Reference) dask (https://www.dask.org/) is a library whose API is highly compatible with, and integrates well with, pandas and numpy, and which supports lazy evaluation and parallel/distributed processing.
For example, for large tables you can do lazy, parallel/distributed processing with a dask dataframe and convert to a pandas dataframe once the data is small enough, which is almost exactly what we are doing here with pyspark.
Compared to dask, pyspark:

- can scale to more serious big data, since clusters can be built (though it can also be used lightly, as shown here)
- runs on a Java backend, and full-scale use requires building a cluster, so at that level the bar for setting up a working environment is higher
- used to be less pandas/numpy-friendly than dask: dask often lets you use the same methods as-is apart from lazy evaluation, while Spark is instead aligned with SQL; however, as noted above, the pandas API on Spark is being fleshed out, so pyspark can now also be used largely in the pandas style
| name: pyspark-intro | |
| channels: | |
| - conda-forge | |
| dependencies: | |
| - python=3.11 | |
| - pyspark>=3.4 | |
| - openjdk=17 | |
| - pandas>=2.0.1 | |
| - pyarrow | |
| # for testdata | |
| - numpy | |
| - seaborn | |
| # jupyter | |
| - jupyterlab |
| name: pyspark-intro | |
| channels: | |
| - conda-forge | |
| dependencies: | |
| - _libgcc_mutex=0.1=conda_forge | |
| - _openmp_mutex=4.5=2_gnu | |
| - aiofiles=22.1.0=pyhd8ed1ab_0 | |
| - aiosqlite=0.18.0=pyhd8ed1ab_0 | |
| - alsa-lib=1.2.8=h166bdaf_0 | |
| - anyio=3.6.2=pyhd8ed1ab_0 | |
| - argon2-cffi=21.3.0=pyhd8ed1ab_0 | |
| - argon2-cffi-bindings=21.2.0=py311hd4cff14_3 | |
| - arrow-cpp=11.0.0=ha770c72_14_cpu | |
| - asttokens=2.2.1=pyhd8ed1ab_0 | |
| - attrs=22.2.0=pyh71513ae_0 | |
| - aws-c-auth=0.6.26=hf365957_1 | |
| - aws-c-cal=0.5.21=h48707d8_2 | |
| - aws-c-common=0.8.14=h0b41bf4_0 | |
| - aws-c-compression=0.2.16=h03acc5a_5 | |
| - aws-c-event-stream=0.2.20=h00877a2_4 | |
| - aws-c-http=0.7.6=hf342b9f_0 | |
| - aws-c-io=0.13.19=h5b20300_3 | |
| - aws-c-mqtt=0.8.6=hc4349f7_12 | |
| - aws-c-s3=0.2.7=h909e904_1 | |
| - aws-c-sdkutils=0.1.8=h03acc5a_0 | |
| - aws-checksums=0.1.14=h03acc5a_5 | |
| - aws-crt-cpp=0.19.8=hf7fbfca_12 | |
| - aws-sdk-cpp=1.10.57=h17c43bd_8 | |
| - babel=2.12.1=pyhd8ed1ab_1 | |
| - backcall=0.2.0=pyh9f0ad1d_0 | |
| - backports=1.0=pyhd8ed1ab_3 | |
| - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 | |
| - beautifulsoup4=4.12.2=pyha770c72_0 | |
| - bleach=6.0.0=pyhd8ed1ab_0 | |
| - brotli=1.0.9=h166bdaf_8 | |
| - brotli-bin=1.0.9=h166bdaf_8 | |
| - brotlipy=0.7.0=py311hd4cff14_1005 | |
| - bzip2=1.0.8=h7f98852_4 | |
| - c-ares=1.18.1=h7f98852_0 | |
| - ca-certificates=2022.12.7=ha878542_0 | |
| - cairo=1.16.0=h35add3b_1015 | |
| - certifi=2022.12.7=pyhd8ed1ab_0 | |
| - cffi=1.15.1=py311h409f033_3 | |
| - charset-normalizer=3.1.0=pyhd8ed1ab_0 | |
| - comm=0.1.3=pyhd8ed1ab_0 | |
| - contourpy=1.0.7=py311ha3edf6b_0 | |
| - cryptography=40.0.2=py311h9b4c7bb_0 | |
| - cycler=0.11.0=pyhd8ed1ab_0 | |
| - debugpy=1.6.7=py311hcafe171_0 | |
| - decorator=5.1.1=pyhd8ed1ab_0 | |
| - defusedxml=0.7.1=pyhd8ed1ab_0 | |
| - entrypoints=0.4=pyhd8ed1ab_0 | |
| - executing=1.2.0=pyhd8ed1ab_0 | |
| - expat=2.5.0=hcb278e6_1 | |
| - flit-core=3.8.0=pyhd8ed1ab_0 | |
| - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 | |
| - font-ttf-inconsolata=3.000=h77eed37_0 | |
| - font-ttf-source-code-pro=2.038=h77eed37_0 | |
| - font-ttf-ubuntu=0.83=hab24e00_0 | |
| - fontconfig=2.14.2=h14ed4e7_0 | |
| - fonts-conda-ecosystem=1=0 | |
| - fonts-conda-forge=1=0 | |
| - fonttools=4.39.3=py311h2582759_0 | |
| - freetype=2.12.1=hca18f0e_1 | |
| - gettext=0.21.1=h27087fc_0 | |
| - gflags=2.2.2=he1b5a44_1004 | |
| - giflib=5.2.1=h0b41bf4_3 | |
| - glog=0.6.0=h6f12383_0 | |
| - graphite2=1.3.13=h58526e2_1001 | |
| - harfbuzz=6.0.0=h3ff4399_1 | |
| - icu=72.1=hcb278e6_0 | |
| - idna=3.4=pyhd8ed1ab_0 | |
| - importlib-metadata=6.6.0=pyha770c72_0 | |
| - importlib_metadata=6.6.0=hd8ed1ab_0 | |
| - importlib_resources=5.12.0=pyhd8ed1ab_0 | |
| - ipykernel=6.22.0=pyh210e3f2_0 | |
| - ipython=8.12.0=pyh41d4057_0 | |
| - ipython_genutils=0.2.0=py_1 | |
| - jedi=0.18.2=pyhd8ed1ab_0 | |
| - jinja2=3.1.2=pyhd8ed1ab_1 | |
| - json5=0.9.5=pyh9f0ad1d_0 | |
| - jsonschema=4.17.3=pyhd8ed1ab_0 | |
| - jupyter_client=8.2.0=pyhd8ed1ab_0 | |
| - jupyter_core=5.3.0=py311h38be061_0 | |
| - jupyter_events=0.6.3=pyhd8ed1ab_0 | |
| - jupyter_server=2.5.0=pyhd8ed1ab_0 | |
| - jupyter_server_fileid=0.9.0=pyhd8ed1ab_0 | |
| - jupyter_server_terminals=0.4.4=pyhd8ed1ab_1 | |
| - jupyter_server_ydoc=0.8.0=pyhd8ed1ab_0 | |
| - jupyter_ydoc=0.2.3=pyhd8ed1ab_0 | |
| - jupyterlab=3.6.3=pyhd8ed1ab_0 | |
| - jupyterlab_pygments=0.2.2=pyhd8ed1ab_0 | |
| - jupyterlab_server=2.22.1=pyhd8ed1ab_0 | |
| - keyutils=1.6.1=h166bdaf_0 | |
| - kiwisolver=1.4.4=py311h4dd048b_1 | |
| - krb5=1.20.1=h81ceb04_0 | |
| - lcms2=2.15=haa2dc70_1 | |
| - ld_impl_linux-64=2.40=h41732ed_0 | |
| - lerc=4.0.0=h27087fc_0 | |
| - libabseil=20230125.0=cxx17_hcb278e6_1 | |
| - libarrow=11.0.0=h93537a5_14_cpu | |
| - libblas=3.9.0=16_linux64_openblas | |
| - libbrotlicommon=1.0.9=h166bdaf_8 | |
| - libbrotlidec=1.0.9=h166bdaf_8 | |
| - libbrotlienc=1.0.9=h166bdaf_8 | |
| - libcblas=3.9.0=16_linux64_openblas | |
| - libcrc32c=1.1.2=h9c3ff4c_0 | |
| - libcups=2.3.3=h36d4200_3 | |
| - libcurl=8.0.1=h588be90_0 | |
| - libdeflate=1.18=h0b41bf4_0 | |
| - libedit=3.1.20191231=he28a2e2_2 | |
| - libev=4.33=h516909a_1 | |
| - libevent=2.1.10=h28343ad_4 | |
| - libexpat=2.5.0=hcb278e6_1 | |
| - libffi=3.4.2=h7f98852_5 | |
| - libgcc-ng=12.2.0=h65d4601_19 | |
| - libgfortran-ng=12.2.0=h69a702a_19 | |
| - libgfortran5=12.2.0=h337968e_19 | |
| - libglib=2.76.2=hebfc3b9_0 | |
| - libgomp=12.2.0=h65d4601_19 | |
| - libgoogle-cloud=2.8.0=h0bc5f78_1 | |
| - libgrpc=1.52.1=hcf146ea_1 | |
| - libiconv=1.17=h166bdaf_0 | |
| - libjpeg-turbo=2.1.5.1=h0b41bf4_0 | |
| - liblapack=3.9.0=16_linux64_openblas | |
| - libnghttp2=1.52.0=h61bc06f_0 | |
| - libnsl=2.0.0=h7f98852_0 | |
| - libnuma=2.0.16=h0b41bf4_1 | |
| - libopenblas=0.3.21=pthreads_h78a6416_3 | |
| - libpng=1.6.39=h753d276_0 | |
| - libprotobuf=3.21.12=h3eb15da_0 | |
| - libsodium=1.0.18=h36c2ea0_1 | |
| - libsqlite=3.40.0=h753d276_1 | |
| - libssh2=1.10.0=hf14f497_3 | |
| - libstdcxx-ng=12.2.0=h46fd767_19 | |
| - libthrift=0.18.1=h5e4af38_0 | |
| - libtiff=4.5.0=ha587672_6 | |
| - libutf8proc=2.8.0=h166bdaf_0 | |
| - libuuid=2.38.1=h0b41bf4_0 | |
| - libwebp-base=1.3.0=h0b41bf4_0 | |
| - libxcb=1.13=h7f98852_1004 | |
| - libzlib=1.2.13=h166bdaf_4 | |
| - lz4-c=1.9.4=hcb278e6_0 | |
| - markupsafe=2.1.2=py311h2582759_0 | |
| - matplotlib-base=3.7.1=py311h8597a09_0 | |
| - matplotlib-inline=0.1.6=pyhd8ed1ab_0 | |
| - mistune=2.0.5=pyhd8ed1ab_0 | |
| - munkres=1.1.4=pyh9f0ad1d_0 | |
| - nbclassic=0.5.5=pyhb4ecaf3_1 | |
| - nbclient=0.7.3=pyhd8ed1ab_0 | |
| - nbconvert=7.3.1=pyhd8ed1ab_0 | |
| - nbconvert-core=7.3.1=pyhd8ed1ab_0 | |
| - nbconvert-pandoc=7.3.1=pyhd8ed1ab_0 | |
| - nbformat=5.8.0=pyhd8ed1ab_0 | |
| - ncurses=6.3=h27087fc_1 | |
| - nest-asyncio=1.5.6=pyhd8ed1ab_0 | |
| - notebook=6.5.4=pyha770c72_0 | |
| - notebook-shim=0.2.2=pyhd8ed1ab_0 | |
| - numpy=1.24.3=py311h64a7726_0 | |
| - openjdk=17.0.3=h4335b31_6 | |
| - openjpeg=2.5.0=hfec8fc6_2 | |
| - openssl=3.1.0=hd590300_1 | |
| - orc=1.8.3=hfdbbad2_0 | |
| - packaging=23.1=pyhd8ed1ab_0 | |
| - pandas=2.0.1=py311h320fe9a_0 | |
| - pandoc=2.19.2=h32600fe_2 | |
| - pandocfilters=1.5.0=pyhd8ed1ab_0 | |
| - parquet-cpp=1.5.1=2 | |
| - parso=0.8.3=pyhd8ed1ab_0 | |
| - patsy=0.5.3=pyhd8ed1ab_0 | |
| - pcre2=10.40=hc3806b6_0 | |
| - pexpect=4.8.0=pyh1a96a4e_2 | |
| - pickleshare=0.7.5=py_1003 | |
| - pillow=9.5.0=py311h573f0d3_0 | |
| - pip=23.1.1=pyhd8ed1ab_0 | |
| - pixman=0.40.0=h36c2ea0_0 | |
| - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0 | |
| - platformdirs=3.2.0=pyhd8ed1ab_0 | |
| - pooch=1.7.0=pyha770c72_3 | |
| - prometheus_client=0.16.0=pyhd8ed1ab_0 | |
| - prompt-toolkit=3.0.38=pyha770c72_0 | |
| - prompt_toolkit=3.0.38=hd8ed1ab_0 | |
| - psutil=5.9.5=py311h2582759_0 | |
| - pthread-stubs=0.4=h36c2ea0_1001 | |
| - ptyprocess=0.7.0=pyhd3deb0d_0 | |
| - pure_eval=0.2.2=pyhd8ed1ab_0 | |
| - py4j=0.10.9.7=pyhd8ed1ab_0 | |
| - pyarrow=11.0.0=py311hbdf6286_14_cpu | |
| - pycparser=2.21=pyhd8ed1ab_0 | |
| - pygments=2.15.1=pyhd8ed1ab_0 | |
| - pyopenssl=23.1.1=pyhd8ed1ab_0 | |
| - pyparsing=3.0.9=pyhd8ed1ab_0 | |
| - pyrsistent=0.19.3=py311h2582759_0 | |
| - pysocks=1.7.1=pyha2e5f31_6 | |
| - pyspark=3.4.0=pyhd8ed1ab_0 | |
| - python=3.11.3=h2755cc3_0_cpython | |
| - python-dateutil=2.8.2=pyhd8ed1ab_0 | |
| - python-fastjsonschema=2.16.3=pyhd8ed1ab_0 | |
| - python-json-logger=2.0.7=pyhd8ed1ab_0 | |
| - python-tzdata=2023.3=pyhd8ed1ab_0 | |
| - python_abi=3.11=3_cp311 | |
| - pytz=2023.3=pyhd8ed1ab_0 | |
| - pyyaml=6.0=py311hd4cff14_5 | |
| - pyzmq=25.0.2=py311hd6ccaeb_0 | |
| - re2=2023.02.02=hcb278e6_0 | |
| - readline=8.2=h8228510_1 | |
| - requests=2.28.2=pyhd8ed1ab_1 | |
| - rfc3339-validator=0.1.4=pyhd8ed1ab_0 | |
| - rfc3986-validator=0.1.1=pyh9f0ad1d_0 | |
| - s2n=1.3.41=h3358134_0 | |
| - scipy=1.10.1=py311h8e6699e_0 | |
| - seaborn=0.12.2=hd8ed1ab_0 | |
| - seaborn-base=0.12.2=pyhd8ed1ab_0 | |
| - send2trash=1.8.0=pyhd8ed1ab_0 | |
| - setuptools=67.7.1=pyhd8ed1ab_0 | |
| - six=1.16.0=pyh6c4a22f_0 | |
| - snappy=1.1.10=h9fff704_0 | |
| - sniffio=1.3.0=pyhd8ed1ab_0 | |
| - soupsieve=2.3.2.post1=pyhd8ed1ab_0 | |
| - stack_data=0.6.2=pyhd8ed1ab_0 | |
| - statsmodels=0.13.5=py311h4c7f6c3_2 | |
| - terminado=0.17.1=pyh41d4057_0 | |
| - tinycss2=1.2.1=pyhd8ed1ab_0 | |
| - tk=8.6.12=h27826a3_0 | |
| - tomli=2.0.1=pyhd8ed1ab_0 | |
| - tornado=6.3=py311h2582759_0 | |
| - traitlets=5.9.0=pyhd8ed1ab_0 | |
| - typing-extensions=4.5.0=hd8ed1ab_0 | |
| - typing_extensions=4.5.0=pyha770c72_0 | |
| - tzdata=2023c=h71feb2d_0 | |
| - ucx=1.14.0=h8c404fb_1 | |
| - urllib3=1.26.15=pyhd8ed1ab_0 | |
| - wcwidth=0.2.6=pyhd8ed1ab_0 | |
| - webencodings=0.5.1=py_1 | |
| - websocket-client=1.5.1=pyhd8ed1ab_0 | |
| - wheel=0.40.0=pyhd8ed1ab_0 | |
| - xorg-fixesproto=5.0=h7f98852_1002 | |
| - xorg-inputproto=2.3.2=h7f98852_1002 | |
| - xorg-kbproto=1.0.7=h7f98852_1002 | |
| - xorg-libice=1.0.10=h7f98852_0 | |
| - xorg-libsm=1.2.3=hd9c2040_1000 | |
| - xorg-libx11=1.8.4=h0b41bf4_0 | |
| - xorg-libxau=1.0.9=h7f98852_0 | |
| - xorg-libxdmcp=1.1.3=h7f98852_0 | |
| - xorg-libxext=1.3.4=h0b41bf4_2 | |
| - xorg-libxfixes=5.0.3=h7f98852_1004 | |
| - xorg-libxi=1.7.10=h7f98852_0 | |
| - xorg-libxrender=0.9.10=h7f98852_1003 | |
| - xorg-libxtst=1.2.3=h7f98852_1002 | |
| - xorg-recordproto=1.14.2=h7f98852_1002 | |
| - xorg-renderproto=0.11.1=h7f98852_1002 | |
| - xorg-xextproto=7.3.0=h0b41bf4_1003 | |
| - xorg-xproto=7.0.31=h7f98852_1007 | |
| - xz=5.2.6=h166bdaf_0 | |
| - y-py=0.5.9=py311hfe55011_0 | |
| - yaml=0.2.5=h7f98852_2 | |
| - ypy-websocket=0.8.2=pyhd8ed1ab_0 | |
| - zeromq=4.3.4=h9c3ff4c_1 | |
| - zipp=3.15.0=pyhd8ed1ab_0 | |
| - zlib=1.2.13=h166bdaf_4 | |
| - zstd=1.5.2=h3eb15da_6 | |
| # prefix: /path/to/pyspark-intro |