- Build the environment on Linux with Mambaforge
- Set cluster management and the like aside; just put together a minimal environment for learning
- python3.11
- pyspark3.4.0
- openjdk17
- pandas2.0.1
  - to check interoperability with pandas
- plus extras such as Jupyter
# use mambaforge
mamba env create -f pyspark_env.yaml
pandas and similar tools work well for processing and analyzing small to medium-sized datasets, but they cannot handle data that does not fit in memory. In addition, because of the GIL they basically run on a single thread, which makes it hard to fully exploit a machine's resources.

PySpark offers lazy evaluation and parallel, distributed processing, so it can be used to process and analyze so-called big-data-scale tabular data, and it also interoperates and combines well with pandas, which is why it is introduced here.

For details, see the PySpark Documentation and related material.

A Linux environment with Mambaforge already installed is assumed.

Prepare a YAML file for creating the conda virtual environment, like the following:
# pyspark_env.yaml
name: pyspark-intro
channels:
- conda-forge
dependencies:
- python=3.11
- pyspark>=3.4
- openjdk=17
- pandas>=2.0.1
- pyarrow
# for testdata
- numpy
- seaborn
# jupyter
- jupyterlab

※ pandas 2.0 was released recently, and the removal of the deprecated iteritems broke part of the pandas-to-PySpark DataFrame conversion (with older PySpark you had to stay on the 1.5 series for the time being). This has been fixed in pyspark>=3.4.0, so pandas 2.x is used here.
Create the virtual environment with the mamba command (conda's dependency resolution is slow, so using mamba is recommended):
mamba env create -f pyspark_env.yaml
# Activate the resulting environment (pyspark-intro)
conda activate pyspark-intro
# or `mamba activate pyspark-intro`

Let's take a quick look at basic usage.
Data-processing details are omitted here, but since Spark's DataFrame API maps onto Spark SQL, you can think of it as being able to do roughly whatever SQL can do (indeed, the queries can also be written in SQL).
# Set up the directory for the test data (invoking Linux commands from the notebook)
!rm -rf testdata && mkdir testdata

from pyspark.sql.session import SparkSession

# Start a Spark session
spark = (SparkSession.builder
.master("local[*]")
.appName("LocalTest")
.config("spark.sql.execution.arrow.pyspark.enabled", "true")
.config("spark.executor.memory", "4g") # TMP (実際はハードコートはあまり良くない)
.getOrCreate()
)
spark
your 131072x1 screen size is bogus. expect trouble
23/04/25 00:12:59 WARN Utils: Your hostname, XXXXXXX-XXXXXXX resolves to a loopback address: 127.0.1.1; using 172.24.210.1 instead (on interface eth0)
23/04/25 00:12:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/25 00:12:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
SparkSession - in-memory

SparkContext

Spark UI: http://172.24.210.1:4040

Version: v3.4.0
Master: local[*]
AppName: LocalTest
↑ About the config section: the settings are adjusted for interoperability with pandas (pyarrow must be installed).

Reference: https://learn.microsoft.com/ja-jp/azure/databricks/pandas/pyspark-pandas-conversion

Detailed and environment-specific settings are better kept in a separate config file, e.g. $SPARK_HOME/conf/spark-defaults.conf.

Config reference: https://spark.apache.org/docs/latest/configuration.html

As for the SPARK_HOME environment variable, which is needed to locate the config file and so on, see here for how to set it when Spark is installed via conda.
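As an illustration only (the values below are placeholders rather than tuned recommendations), a minimal spark-defaults.conf could look like this:

# $SPARK_HOME/conf/spark-defaults.conf (illustrative values)
spark.master                                  local[*]
spark.executor.memory                         4g
spark.sql.execution.arrow.pyspark.enabled     true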
When a Spark session is started, the Spark UI comes up at http://localhost:4040 by default, where you can monitor execution state (handy for debugging as well).
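The UI address can also be read off the running session; a quick check (the port shifts to 4041, 4042, ... if 4040 is already taken):

# Print the URL of the Spark UI for this session
print(spark.sparkContext.uiWebUrl)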
# Create the test data
import seaborn as sns
iris = sns.load_dataset("iris")
display(iris)
iris.to_csv("testdata/iris.csv", index=False)
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
# Read the CSV with PySpark
df_iris = spark.read.csv(
"testdata/iris.csv",
header=True,
inferSchema=True,
)
# Because evaluation is lazy, only the column schema is displayed
display(df_iris)
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
(↑ Since the data comes from a CSV file, options are passed to indicate that a header row is present and to infer the schema automatically.)
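If you want to skip the extra pass over the data that inferSchema triggers, an explicit schema can be supplied instead; a minimal sketch (column names follow the iris CSV used here):

from pyspark.sql.types import StructType, StructField, DoubleType, StringType

# Explicit schema instead of inferSchema=True (avoids an extra scan of the CSV)
iris_schema = StructType([
    StructField("sepal_length", DoubleType()),
    StructField("sepal_width", DoubleType()),
    StructField("petal_length", DoubleType()),
    StructField("petal_width", DoubleType()),
    StructField("species", StringType()),
])
df_iris_typed = spark.read.csv("testdata/iris.csv", header=True, schema=iris_schema)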
※ (Supplementary note):
# Evaluation only actually happens when an action such as show() is called
df_iris.show(5)  # show the first 5 rows
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
| 5.1| 3.5| 1.4| 0.2| setosa|
| 4.9| 3.0| 1.4| 0.2| setosa|
| 4.7| 3.2| 1.3| 0.2| setosa|
| 4.6| 3.1| 1.5| 0.2| setosa|
| 5.0| 3.6| 1.4| 0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows
# Selecting columns with select
(df_iris
.select(["sepal_length", "sepal_width"])
.show(3)
)
+------------+-----------+
|sepal_length|sepal_width|
+------------+-----------+
| 5.1| 3.5|
| 4.9| 3.0|
| 4.7| 3.2|
+------------+-----------+
only showing top 3 rows
# A simple query using filter
(df_iris
.filter(df_iris.species == "virginica")
.filter(df_iris.sepal_length > 5.0)
.show(3)
)
+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+---------+
| 6.3| 3.3| 6.0| 2.5|virginica|
| 5.8| 2.7| 5.1| 1.9|virginica|
| 7.1| 3.0| 5.9| 2.1|virginica|
+------------+-----------+------------+-----------+---------+
only showing top 3 rows
# Aggregation with groupBy (1)
(df_iris
.groupBy('species')
.count()
.show()
)
+----------+-----+
| species|count|
+----------+-----+
| virginica| 50|
|versicolor| 50|
| setosa| 50|
+----------+-----+
# Aggregation with groupBy (2)
(df_iris
.groupBy('species')
.agg({
'sepal_length': 'mean',
'sepal_width': 'mean',
'petal_length': 'mean',
'petal_width': 'mean',
})
.show()
)
+----------+------------------+------------------+-----------------+------------------+
| species| avg(sepal_width)| avg(petal_width)|avg(sepal_length)| avg(petal_length)|
+----------+------------------+------------------+-----------------+------------------+
| virginica|2.9739999999999998| 2.026|6.587999999999998| 5.552|
|versicolor|2.7700000000000005|1.3259999999999998| 5.936| 4.26|
| setosa| 3.428000000000001|0.2459999999999999|5.005999999999999|1.4620000000000002|
+----------+------------------+------------------+-----------------+------------------+
# Summary of the DataFrame
df_iris.describe().show()
23/04/25 00:13:09 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+-------+------------------+-------------------+------------------+------------------+---------+
|summary| sepal_length| sepal_width| petal_length| petal_width| species|
+-------+------------------+-------------------+------------------+------------------+---------+
| count| 150| 150| 150| 150| 150|
| mean| 5.843333333333335| 3.057333333333334|3.7580000000000027| 1.199333333333334| null|
| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467| null|
| min| 4.3| 2.0| 1.0| 0.1| setosa|
| max| 7.9| 4.4| 6.9| 2.5|virginica|
+-------+------------------+-------------------+------------------+------------------+---------+
# Queries can also be written in SQL
df_iris.createOrReplaceTempView('iris')
spark.sql("show tables").show()
tmp = spark.sql("SELECT * FROM iris WHERE species = 'setosa'")
tmp.show()
+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
| | iris| true|
+---------+---------+-----------+
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
| 5.1| 3.5| 1.4| 0.2| setosa|
| 4.9| 3.0| 1.4| 0.2| setosa|
| 4.7| 3.2| 1.3| 0.2| setosa|
| 4.6| 3.1| 1.5| 0.2| setosa|
| 5.0| 3.6| 1.4| 0.2| setosa|
| 5.4| 3.9| 1.7| 0.4| setosa|
| 4.6| 3.4| 1.4| 0.3| setosa|
| 5.0| 3.4| 1.5| 0.2| setosa|
| 4.4| 2.9| 1.4| 0.2| setosa|
| 4.9| 3.1| 1.5| 0.1| setosa|
| 5.4| 3.7| 1.5| 0.2| setosa|
| 4.8| 3.4| 1.6| 0.2| setosa|
| 4.8| 3.0| 1.4| 0.1| setosa|
| 4.3| 3.0| 1.1| 0.1| setosa|
| 5.8| 4.0| 1.2| 0.2| setosa|
| 5.7| 4.4| 1.5| 0.4| setosa|
| 5.4| 3.9| 1.3| 0.4| setosa|
| 5.1| 3.5| 1.4| 0.3| setosa|
| 5.7| 3.8| 1.7| 0.3| setosa|
| 5.1| 3.8| 1.5| 0.3| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 20 rows
For details, see the User Guide, the API Reference, and the Spark SQL reference.

Next, let's look at reading and writing data.

# Earlier we read from CSV for convenience, but type information gets lost that way; also, to look at pandas interop, convert a pandas DataFrame directly into a PySpark one
display(iris.__class__)  # iris is a pandas DataFrame
df = spark.createDataFrame(iris)
display(df)
df.show()
pandas.core.frame.DataFrame
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
| 5.1| 3.5| 1.4| 0.2| setosa|
| 4.9| 3.0| 1.4| 0.2| setosa|
| 4.7| 3.2| 1.3| 0.2| setosa|
| 4.6| 3.1| 1.5| 0.2| setosa|
| 5.0| 3.6| 1.4| 0.2| setosa|
| 5.4| 3.9| 1.7| 0.4| setosa|
| 4.6| 3.4| 1.4| 0.3| setosa|
| 5.0| 3.4| 1.5| 0.2| setosa|
| 4.4| 2.9| 1.4| 0.2| setosa|
| 4.9| 3.1| 1.5| 0.1| setosa|
| 5.4| 3.7| 1.5| 0.2| setosa|
| 4.8| 3.4| 1.6| 0.2| setosa|
| 4.8| 3.0| 1.4| 0.1| setosa|
| 4.3| 3.0| 1.1| 0.1| setosa|
| 5.8| 4.0| 1.2| 0.2| setosa|
| 5.7| 4.4| 1.5| 0.4| setosa|
| 5.4| 3.9| 1.3| 0.4| setosa|
| 5.1| 3.5| 1.4| 0.3| setosa|
| 5.7| 3.8| 1.7| 0.3| setosa|
| 5.1| 3.8| 1.5| 0.3| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 20 rows
# Convert a Spark DataFrame to a pandas DataFrame
display(df.__class__) # pyspark.sql.dataframe.DataFrame
pdf = df.toPandas()
display(pdf)
pyspark.sql.dataframe.DataFrame
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
↑ A typical workflow: use PySpark to process large-to-medium data down to a smaller size, and once it fits in memory, call toPandas to convert it into the easier-to-handle pandas.
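As a minimal sketch of that workflow using the df defined above:

# Reduce the data with PySpark first, then bring the small aggregated result into pandas
summary_pdf = (df
    .groupBy("species")
    .agg({"sepal_length": "mean", "petal_length": "mean"})
    .toPandas()
)
display(summary_pdf)  # a small (3-row) pandas DataFrame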
Finally, let's write and read data in a variety of formats.

As mentioned earlier, the storage format has a direct impact on Spark's processing performance, so handling it well is important.
# df is a Spark DataFrame
# csv
df.write.save("testdata/iris_csv", format="csv")
# Instead of the format option, this can also be written as:
# df.write.csv("testdata/iris_csv", compression="gzip")  # with compression this yields csv.gz
!ls testdata/iris_csv
_SUCCESS
part-00000-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00001-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00002-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00003-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00004-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00005-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00006-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00007-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
!head testdata/iris_csv/part-00000-*.csv
5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5.0,3.6,1.4,0.2,setosa
5.4,3.9,1.7,0.4,setosa
4.6,3.4,1.4,0.3,setosa
5.0,3.4,1.5,0.2,setosa
4.4,2.9,1.4,0.2,setosa
4.9,3.1,1.5,0.1,setosa
↑ Unlike pandas, you don't point at a single file; you specify a directory to save into (the data is written as multiple split files).

When reading, you can also point at individual files (e.g. files produced outside Spark), but normally you specify the directory you saved into.
df_csv = spark.read.load("testdata/iris_csv", format="csv")
# Instead of specifying format, this can also be written as:
# df_csv = spark.read.csv("testdata/iris_csv")
df_csv.show(3)
+---+---+---+---+----------+
|_c0|_c1|_c2|_c3| _c4|
+---+---+---+---+----------+
|6.3|3.3|4.7|1.6|versicolor|
|4.9|2.4|3.3|1.0|versicolor|
|6.6|2.9|4.6|1.3|versicolor|
+---+---+---+---+----------+
only showing top 3 rows
(↑ With CSV, information such as column names tends to get lost, which makes it awkward to work with.)
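One way to work around this, as a small sketch: rename the columns after reading (alternatively, pass an explicit schema= to spark.read.csv):

# Restore meaningful column names on the header-less CSV data (types stay as read)
df_csv_named = df_csv.toDF("sepal_length", "sepal_width", "petal_length", "petal_width", "species")
df_csv_named.show(3)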
# For text formats, json and others can be used besides csv
# The compression option selects the compression codec
df.write.json("testdata/iris_json", compression='gzip')
!ls testdata/iris_json
_SUCCESS
part-00000-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00001-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00002-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00003-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00004-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00005-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00006-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00007-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
# Reading works the same way
df_json = spark.read.json("testdata/iris_json")
# or df_json = spark.read.load("testdata/iris_json", format="json")
display(df_json)
df_json.show(3)
DataFrame[petal_length: double, petal_width: double, sepal_length: double, sepal_width: double, species: string]
+------------+-----------+------------+-----------+----------+
|petal_length|petal_width|sepal_length|sepal_width| species|
+------------+-----------+------------+-----------+----------+
| 3.3| 1.0| 5.0| 2.3|versicolor|
| 4.2| 1.3| 5.6| 2.7|versicolor|
| 4.2| 1.2| 5.7| 3.0|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows
(↑ JSON preserves type information better than CSV does.)

# Formats better suited to data analysis/processing, such as snappy-compressed parquet and zlib-compressed orc
df.write.parquet("testdata/iris_parquet", compression="snappy")
!ls testdata/iris_parquet
df.write.orc("testdata/iris_orc", compression="zlib")
!ls testdata/iris_orc
23/04/25 00:13:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
_SUCCESS
part-00000-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00001-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00002-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00003-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00004-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00005-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00006-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00007-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
_SUCCESS
part-00000-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00001-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00002-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00003-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00004-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00005-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00006-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00007-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
# Reading
# parquet
print("parquet:")
df_parquet = spark.read.parquet("testdata/iris_parquet")
display(df_parquet)
df_parquet.show(3)
print("\n------------------\n")
# orc
print("orc:")
df_orc = spark.read.orc("testdata/iris_orc")
display(df_orc)
df_orc.show(3)
parquet:
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+----------+
| 5.0| 2.3| 3.3| 1.0|versicolor|
| 5.6| 2.7| 4.2| 1.3|versicolor|
| 5.7| 3.0| 4.2| 1.2|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows
------------------
orc:
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+----------+
| 5.0| 2.3| 3.3| 1.0|versicolor|
| 5.6| 2.7| 4.2| 1.3|versicolor|
| 5.7| 3.0| 4.2| 1.2|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows
# Data can also be partitioned when saving
df.write.save(
"testdata/iris_with_partition",
format="parquet",
compression="snappy",
partitionBy="species"
)
!cd testdata/iris_with_partition && tree
.
├── _SUCCESS
├── species=setosa
│ ├── part-00000-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│ ├── part-00001-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│ └── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
├── species=versicolor
│ ├── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│ ├── part-00003-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│ ├── part-00004-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│ └── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
└── species=virginica
├── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
├── part-00006-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
└── part-00007-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
4 directories, 11 files
A separate directory is created for each value of the "species" column, and the data is stored under it.

As with indexes in an RDB, choosing partitions to match how the data will be accessed improves performance (and a poor choice degrades it).

In the example above, if analyses target a specific species, only the species of interest has to be read, which limits the amount of data accessed. If, on the other hand, most queries span multiple species, no improvement can be expected and performance may even get worse.
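To see the benefit concretely: a filter on the partition column lets Spark prune partitions, so only the matching directory is scanned. A small sketch (the exact plan output depends on the Spark version):

# Only testdata/iris_with_partition/species=setosa should be scanned here
df_setosa = (spark.read
    .parquet("testdata/iris_with_partition")
    .filter("species = 'setosa'")
)
df_setosa.explain()  # the physical plan should list PartitionFilters on species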
# Reading works the same way even when the data is partitioned
df_partition = spark.read.load("testdata/iris_with_partition")
display(df_partition)
df_partition.show(3)
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+---------+
| 7.9| 3.8| 6.4| 2.0|virginica|
| 6.4| 2.8| 5.6| 2.2|virginica|
| 6.3| 2.8| 5.1| 1.5|virginica|
+------------+-----------+------------+-----------+---------+
only showing top 3 rows
You can also write the data out as a persisted table with saveAsTable.

As listed under Data Sources, many other storage backends are supported as well, such as relational databases (via JDBC) and Hive tables.
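As a minimal sketch of saveAsTable (the table name iris_table is arbitrary; in this local setup the table data ends up under spark.sql.warehouse.dir, ./spark-warehouse by default):

# Persist df as a managed table and query it by name
df.write.mode("overwrite").format("parquet").saveAsTable("iris_table")
spark.sql("SELECT species, COUNT(*) AS n FROM iris_table GROUP BY species").show()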
import pyspark.pandas as ps
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.
warnings.warn(
Reference: https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/index.html

Recently, an API that lets you drive Spark with a pandas-like interface has been maturing.

There used to be a separate tool called Koalas that made Spark usable in a pandas-like way; judging from here and elsewhere, it appears to have been officially folded into PySpark.
import pyspark.pandas as ps
# Create a pandas-on-Spark DataFrame
psdf = ps.read_csv("testdata/iris.csv")
display(psdf.__class__) # pyspark.pandas.frame.DataFrame
psdf.head(5)  # head shows the first rows, just like a pandas DataFrame
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: If `index_col` is not specified for `read_csv`, the default index is attached which can cause additional overhead.
warnings.warn(message, PandasAPIOnSparkAdviceWarning)
pyspark.pandas.frame.DataFrame
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# Filtering and display also work like regular pandas
psdf[psdf['sepal_length'] > 7.5]
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 105 | 7.6 | 3.0 | 6.6 | 2.1 | virginica |
| 117 | 7.7 | 3.8 | 6.7 | 2.2 | virginica |
| 118 | 7.7 | 2.6 | 6.9 | 2.3 | virginica |
| 122 | 7.7 | 2.8 | 6.7 | 2.0 | virginica |
| 131 | 7.9 | 3.8 | 6.4 | 2.0 | virginica |
| 135 | 7.7 | 3.0 | 6.1 | 2.3 | virginica |
# Convert to a PySpark DataFrame
sdf = psdf.to_spark(index_col=["index"])
sdf.show(5)
+-----+------------+-----------+------------+-----------+-------+
|index|sepal_length|sepal_width|petal_length|petal_width|species|
+-----+------------+-----------+------------+-----------+-------+
| 0| 5.1| 3.5| 1.4| 0.2| setosa|
| 1| 4.9| 3.0| 1.4| 0.2| setosa|
| 2| 4.7| 3.2| 1.3| 0.2| setosa|
| 3| 4.6| 3.1| 1.5| 0.2| setosa|
| 4| 5.0| 3.6| 1.4| 0.2| setosa|
+-----+------------+-----------+------------+-----------+-------+
only showing top 5 rows
# Convert to a regular pandas DataFrame
pdf = psdf.to_pandas()
display(pdf.__class__) # pandas.core.frame.DataFrame
pdf.head()
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: `to_pandas` loads all data into the driver's memory. It should only be used if the resulting pandas DataFrame is expected to be small.
warnings.warn(message, PandasAPIOnSparkAdviceWarning)
pandas.core.frame.DataFrame
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
dask is a library with an API highly compatible with pandas and numpy that also offers lazy evaluation and parallel, distributed processing.

For example, you can do almost exactly what we did with PySpark here: process a large table with a dask DataFrame using lazy evaluation and parallel, distributed execution, then convert it to a pandas DataFrame once the data has become small enough.
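A rough sketch of that dask workflow, assuming dask is installed (it is not part of the environment defined above):

# Hypothetical dask equivalent of the "shrink, then convert to pandas" pattern
import dask.dataframe as dd

ddf = dd.read_csv("testdata/iris.csv")                 # lazy dask DataFrame
means = ddf.groupby("species")["sepal_length"].mean()  # still lazy
pdf = means.compute()                                  # evaluation happens here; the result is a pandas object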
Compared with dask, PySpark...