{
"cells": [
{
"cell_type": "markdown",
"id": "5ca5061d-4d62-4212-8d33-efd64655c3a0",
"metadata": {
"tags": []
},
"source": [
"# Parquet Performance Comparisons\n",
"\n",
"In March 2022 we kicked off an effort to improve the parquet user experience in Dask.\n",
"This notebook is intended to measure how we did, both in terms of improved default parameters and in terms of raw performance."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2af778b2-2a1a-4d8f-baae-56d2d54640d2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'2022.02.1'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import contextlib\n",
"import datetime  # used by the scheduler plugin below\n",
"import time\n",
"\n",
"import coiled\n",
"import dask\n",
"import dask.dataframe as dd\n",
"import distributed\n",
"import pandas\n",
"import s3fs\n",
"from dask.datasets import timeseries\n",
"\n",
"dask.__version__"
]
},
{
"cell_type": "markdown",
"id": "b474bd73-f8e9-4d16-a99f-010894a2e630",
"metadata": {
"tags": []
},
"source": [
"## Utilities and setup\n",
"\n",
"Here we create two utilities: one for collecting task group timing information from the scheduler (roughly speaking, thread-seconds: task durations are summed across worker threads, so they can exceed wall-clock time), and one for measuring wall-clock time from the client."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b89843d6-fb86-4e09-b2fa-71b407b0dbd0",
"metadata": {},
"outputs": [],
"source": [
"from distributed.diagnostics import SchedulerPlugin\n",
"from distributed.utils import key_split, key_split_group\n",
"\n",
"class TaskGroupStatistics(SchedulerPlugin):\n",
"    \"\"\"\n",
"    A plugin for collecting task group timing information\n",
"    from the scheduler.\n",
"    \"\"\"\n",
"    def __init__(self):\n",
"        \"\"\"Initialize the plugin\"\"\"\n",
"        self.groups = {}\n",
"        self.scheduler = None\n",
"\n",
"    def start(self, scheduler):\n",
"        \"\"\"Called on scheduler start as well as at registration time\"\"\"\n",
"        self.scheduler = scheduler\n",
"        scheduler.handlers[\"get_task_groups\"] = self.get_task_groups\n",
"\n",
"    def transition(self, key, start, finish, *args, **kwargs):\n",
"        \"\"\"On key transition to memory, update the task group data\"\"\"\n",
"        if self.scheduler is None:\n",
"            # Should not get here if initialization has happened correctly\n",
"            return\n",
"\n",
"        if start == \"processing\" and finish == \"memory\":\n",
"            prefix_name = key_split(key)\n",
"            group_name = key_split_group(key)\n",
"\n",
"            if group_name not in self.groups:\n",
"                self.groups[group_name] = {}\n",
"\n",
"            group = self.scheduler.task_groups[group_name]\n",
"            self.groups[group_name][\"prefix\"] = prefix_name\n",
"            self.groups[group_name][\"duration\"] = group.duration\n",
"            self.groups[group_name][\"start\"] = str(\n",
"                datetime.datetime.fromtimestamp(group.start)\n",
"            )\n",
"            self.groups[group_name][\"stop\"] = str(\n",
"                datetime.datetime.fromtimestamp(group.stop)\n",
"            )\n",
"            self.groups[group_name][\"nbytes\"] = group.nbytes_total\n",
"\n",
"    async def get_task_groups(self, comm):\n",
"        return self.groups\n",
"\n",
"    def restart(self, scheduler):\n",
"        self.groups = {}\n",
"\n",
"def get_tg_data(client):\n",
"    \"\"\"Fetch the collected task group data as a pandas DataFrame\"\"\"\n",
"    tg_data = client.sync(client.scheduler.get_task_groups)\n",
"\n",
"    df = pandas.DataFrame.from_dict(tg_data, orient=\"index\")\n",
"    df.index.name = \"group\"\n",
"    return df"
]
},
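{
"cell_type": "markdown",
"id": "3f0a9c2e-1b7d-4e85-9c21-aaf0d1c5e901",
"metadata": {},
"source": [
"As a quick illustration of the key-splitting helpers the plugin relies on: `key_split` extracts a task prefix, while `key_split_group` extracts a task group name. The example key below is made up for demonstration and plays no part in the benchmarks."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d41f2aa-6c3e-4f09-b7d2-5b1e9c0d4a22",
"metadata": {},
"outputs": [],
"source": [
"# A made-up key of the kind the scheduler passes to the plugin's transition hook\n",
"example_key = \"('read-parquet-31043b51c86561bc40fb6007a1fdc33a', 0)\"\n",
"key_split(example_key), key_split_group(example_key)"
]
},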
{
"cell_type": "code",
"execution_count": 3,
"id": "780290e6-2626-47b5-aed3-7aca17581fe6",
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Markdown, display\n",
"\n",
"@contextlib.contextmanager\n",
"def timer(label=\"Block\"):\n",
"    \"\"\"\n",
"    Time a block of code and print out the result when done.\n",
"    \"\"\"\n",
"    start = time.time()\n",
"    yield\n",
"    end = time.time()\n",
"    display(Markdown(f\"**{label}** took **{end-start:.2f}** seconds\"))"
]
},
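{
"cell_type": "markdown",
"id": "c7e2b4d9-0f13-4a6e-8b55-2d9e7f1a3c60",
"metadata": {},
"source": [
"A quick local smoke test of the `timer` helper (the timed block just sleeps, so it should report roughly one second):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5a1d3f7-92b8-4c4e-a1d0-7f3b6c2e8d14",
"metadata": {},
"outputs": [],
"source": [
"# Trivial local check of the timer context manager\n",
"with timer(\"Sleeping for one second\"):\n",
"    time.sleep(1)"
]
},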
{
"cell_type": "markdown",
"id": "82c7e406-dd6a-4142-a2fa-ba843b3799a5",
"metadata": {},
"source": [
"## Software environments\n",
"\n",
"Create two software environments: one pinned to the February release (2022.2.1) and one pinned to the May release (2022.5.0)."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f5075232-55cb-44a9-a74f-bff37eaff538",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found existing software environment build, returning\n",
"Found existing software environment build, returning\n"
]
}
],
"source": [
"old = \"2022.2.1\"\n",
"new = \"2022.5.0\"\n",
"\n",
"coiled.create_software_environment(\n",
"    f\"parquet-{old.replace('.', '-')}\",\n",
"    conda={\n",
"        \"channels\": [\"conda-forge\"],\n",
"        \"dependencies\": [\n",
"            \"python=3.9\",\n",
"            f\"dask=={old}\",\n",
"            f\"distributed=={old}\",\n",
"            \"s3fs\",\n",
"            \"pyarrow=7\",\n",
"            \"fastparquet=0.8.0\",\n",
"        ],\n",
"    },\n",
")\n",
"\n",
"coiled.create_software_environment(\n",
"    f\"parquet-{new.replace('.', '-')}\",\n",
"    conda={\n",
"        \"channels\": [\"conda-forge\"],\n",
"        \"dependencies\": [\n",
"            \"python=3.9\",\n",
"            f\"dask=={new}\",\n",
"            f\"distributed=={new}\",\n",
"            \"s3fs\",\n",
"            \"pyarrow=7\",\n",
"            \"fastparquet=0.8.1\",\n",
"        ],\n",
"    },\n",
")\n",
"\n",
"# Run the rest of the notebook against the old release first\n",
"kind = old"
]
},
{
"cell_type": "markdown",
"id": "0be911c1-4ad3-4ecd-920f-d83eb3715d84",
"metadata": {
"tags": []
},
"source": [
"## Test #1: naive data read\n",
"\n",
"Let's read a single year of NYC taxi parquet data from Ursa Labs with no changes to default parameters (except to specify that the engine is `pyarrow`, so we can make apples-to-apples comparisons)."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a07a30f6-0c8a-4e96-9f49-125f31bd8423",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f139c042e490473296d08f5921da2f85",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Output()"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ian/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/client.py:1278: VersionMismatchWarning: Mismatched versions found\n",
"\n",
"+---------+--------+-----------+---------+\n",
"| Package | client | scheduler | workers |\n",
"+---------+--------+-----------+---------+\n",
"| lz4     | 4.0.0  | None      | None    |\n",
"+---------+--------+-----------+---------+\n",
"  warnings.warn(version_module.VersionMismatchWarning(msg[0][\"warning\"]))\n"
]
},
{
"data": {
"text/markdown": [
"**Naive read** took **136.77** seconds"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"### Task Group timing"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
"    .dataframe tbody tr th:only-of-type {\n",
"        vertical-align: middle;\n",
"    }\n",
"\n",
"    .dataframe tbody tr th {\n",
"        vertical-align: top;\n",
"    }\n",
"\n",
"    .dataframe thead th {\n",
"        text-align: right;\n",
"    }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>prefix</th>\n",
"      <th>duration</th>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>group</th>\n",
"      <th></th>\n",
"      <th></th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>read-parquet-31043b51c86561bc40fb6007a1fdc33a</th>\n",
"      <td>read-parquet</td>\n",
"      <td>1426.325892</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"                                                     prefix     duration\n",
"group                                                                   \n",
"read-parquet-31043b51c86561bc40fb6007a1fdc33a  read-parquet  1426.325892"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [ | |
"# Create a separate cluster for ursa data in a the same region\n", | |
"cluster = coiled.Cluster(\n", | |
" name=f\"parquet-{kind.replace('.', '-')}\",\n", | |
" software=f\"parquet-{kind.replace('.', '-')}\",\n", | |
" n_workers=12,\n", | |
" worker_vm_types=[\"t3.xlarge\"],\n", | |
" scheduler_vm_types=[\"t3.large\"],\n", | |
" backend_options={\"region\": \"us-east-2\"},\n", | |
")\n", | |
" \n", | |
"client = distributed.Client(cluster)\n", | |
"client.register_scheduler_plugin(TaskGroupStatistics())\n", | |
"\n", | |
"# Note: reads from `s3://ursa-labs-taxi-data` seem to be particularly slow!\n", | |
"# Other buckets don't seem to have that feature (including ones with what are\n", | |
"# notionally the same data). I don't understand why right now.\n", | |
"with timer(\"Naive read\"):\n", | |
" ddf = dd.read_parquet(\"s3://ursa-labs-taxi-data/2012/**.parquet\", engine=\"pyarrow\")\n", | |
"\n", | |
" ddf = ddf.persist()\n", | |
" distributed.wait(ddf)\n", | |
"\n", | |
"display(Markdown(\"### Task Group timing\"))\n", | |
"display(get_tg_data(client))\n", | |
"\n", | |
"client.close()\n", | |
"cluster.close()" | |
] | |
}, | |
{
"cell_type": "markdown",
"id": "951d679f-0b7c-411a-ba3e-34159cff11ec",
"metadata": {
"tags": []
},
"source": [
"## Create a new shared cluster\n",
"\n",
"We create it in `us-east-1` to have fast communication with our test S3 bucket `s3://dask-io`."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c1519363-da48-48ba-aba6-197a8ec0b00b",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"Running with **Dask==2022.2.1**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "050e488c039e4800b9994841d9a060ce",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Output()"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ian/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/client.py:1278: VersionMismatchWarning: Mismatched versions found\n",
"\n",
"+---------+--------+-----------+---------+\n",
"| Package | client | scheduler | workers |\n",
"+---------+--------+-----------+---------+\n",
"| lz4     | 4.0.0  | None      | None    |\n",
"+---------+--------+-----------+---------+\n",
"  warnings.warn(version_module.VersionMismatchWarning(msg[0][\"warning\"]))\n"
]
}
],
"source": [
"display(Markdown(f\"Running with **Dask=={kind}**\"))\n",
"\n",
"cluster = coiled.Cluster(\n",
"    name=f\"parquet-{kind.replace('.', '-')}\",\n",
"    software=f\"parquet-{kind.replace('.', '-')}\",\n",
"    n_workers=25,\n",
"    worker_vm_types=[\"m5.2xlarge\"],\n",
"    scheduler_vm_types=[\"t3.large\"],\n",
"    backend_options={\"region\": \"us-east-1\"},\n",
")\n",
"\n",
"client = distributed.Client(cluster)\n",
"client.register_scheduler_plugin(TaskGroupStatistics())"
]
},
{
"cell_type": "markdown",
"id": "560617ac-ed8d-4fa6-88d4-7ff8b213d4b9",
"metadata": {},
"source": [
"## Test #2: naive data write\n",
"\n",
"Let's write a ~200 GB, ~700-partition dataset with 100 columns. In older Dask versions, writing the shared `_metadata` file overwhelms and kills workers, resulting in a frustrating failure at the very end of the computation."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c825ae34-7a90-473f-b3f6-81defaf2192c",
"metadata": {},
"outputs": [
{
"ename": "KilledWorker",
"evalue": "(\"('metadata-to-parquet-69009ad3ec5fb7b2e237fcddd27d22e2', 0)\", <WorkerState 'tls://10.4.3.14:41219', name: parquet-2022-2-1-worker-126f060ae1, status: closed, memory: 0, processing: 1>)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKilledWorker\u001b[0m Traceback (most recent call last)",
"Input \u001b[0;32mIn [7]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m timer(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNaive write 200 GB\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 2\u001b[0m ddf \u001b[38;5;241m=\u001b[39m timeseries(\n\u001b[1;32m 3\u001b[0m dtypes\u001b[38;5;241m=\u001b[39m{\n\u001b[1;32m 4\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m{\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname-\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;241m25\u001b[39m)},\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 12\u001b[0m partition_freq\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m1H\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 13\u001b[0m )\n\u001b[0;32m---> 14\u001b[0m \u001b[43mddf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_parquet\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ms3://dask-io/parquet-performance-200GB/\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/dask/dataframe/core.py:4825\u001b[0m, in \u001b[0;36mDataFrame.to_parquet\u001b[0;34m(self, path, *args, **kwargs)\u001b[0m\n\u001b[1;32m 4822\u001b[0m \u001b[38;5;124;03m\"\"\"See dd.to_parquet docstring for more information\"\"\"\u001b[39;00m\n\u001b[1;32m 4823\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mio\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m to_parquet\n\u001b[0;32m-> 4825\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mto_parquet\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:840\u001b[0m, in \u001b[0;36mto_parquet\u001b[0;34m(df, path, engine, compression, write_index, append, overwrite, ignore_divisions, partition_on, storage_options, custom_metadata, write_metadata_file, compute, compute_kwargs, schema, name_function, **kwargs)\u001b[0m\n\u001b[1;32m 838\u001b[0m graph \u001b[38;5;241m=\u001b[39m HighLevelGraph\u001b[38;5;241m.\u001b[39mfrom_collections(meta_name, dsk, dependencies\u001b[38;5;241m=\u001b[39m(data_write,))\n\u001b[1;32m 839\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m compute:\n\u001b[0;32m--> 840\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompute_as_if_collection\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 841\u001b[0m \u001b[43m \u001b[49m\u001b[43mScalar\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgraph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmeta_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcompute_kwargs\u001b[49m\n\u001b[1;32m 842\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 843\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 844\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m Scalar(graph, meta_name, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/dask/base.py:317\u001b[0m, in \u001b[0;36mcompute_as_if_collection\u001b[0;34m(cls, dsk, keys, scheduler, get, **kwargs)\u001b[0m\n\u001b[1;32m 315\u001b[0m schedule \u001b[38;5;241m=\u001b[39m get_scheduler(scheduler\u001b[38;5;241m=\u001b[39mscheduler, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mcls\u001b[39m, get\u001b[38;5;241m=\u001b[39mget)\n\u001b[1;32m 316\u001b[0m dsk2 \u001b[38;5;241m=\u001b[39m optimization_function(\u001b[38;5;28mcls\u001b[39m)(dsk, keys, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 317\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mschedule\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdsk2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/client.py:3010\u001b[0m, in \u001b[0;36mClient.get\u001b[0;34m(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)\u001b[0m\n\u001b[1;32m 3008\u001b[0m should_rejoin \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 3009\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3010\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgather\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpacked\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43masynchronous\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43masynchronous\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdirect\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdirect\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3011\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 3012\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m futures\u001b[38;5;241m.\u001b[39mvalues():\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/client.py:2162\u001b[0m, in \u001b[0;36mClient.gather\u001b[0;34m(self, futures, errors, direct, asynchronous)\u001b[0m\n\u001b[1;32m 2160\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2161\u001b[0m local_worker \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 2162\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msync\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2163\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_gather\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2164\u001b[0m \u001b[43m \u001b[49m\u001b[43mfutures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2165\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2166\u001b[0m \u001b[43m \u001b[49m\u001b[43mdirect\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdirect\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2167\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_worker\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_worker\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2168\u001b[0m \u001b[43m \u001b[49m\u001b[43masynchronous\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43masynchronous\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2169\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/utils.py:311\u001b[0m, in \u001b[0;36mSyncMethodMixin.sync\u001b[0;34m(self, func, asynchronous, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m future\n\u001b[1;32m 310\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 311\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msync\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 312\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcallback_timeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallback_timeout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 313\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/utils.py:378\u001b[0m, in \u001b[0;36msync\u001b[0;34m(loop, func, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 376\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error:\n\u001b[1;32m 377\u001b[0m typ, exc, tb \u001b[38;5;241m=\u001b[39m error\n\u001b[0;32m--> 378\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mwith_traceback(tb)\n\u001b[1;32m 379\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 380\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/utils.py:351\u001b[0m, in \u001b[0;36msync.<locals>.f\u001b[0;34m()\u001b[0m\n\u001b[1;32m 349\u001b[0m future \u001b[38;5;241m=\u001b[39m asyncio\u001b[38;5;241m.\u001b[39mwait_for(future, callback_timeout)\n\u001b[1;32m 350\u001b[0m future \u001b[38;5;241m=\u001b[39m asyncio\u001b[38;5;241m.\u001b[39mensure_future(future)\n\u001b[0;32m--> 351\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m future\n\u001b[1;32m 352\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 353\u001b[0m error \u001b[38;5;241m=\u001b[39m sys\u001b[38;5;241m.\u001b[39mexc_info()\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/tornado/gen.py:762\u001b[0m, in \u001b[0;36mRunner.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 759\u001b[0m exc_info \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 761\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 762\u001b[0m value \u001b[38;5;241m=\u001b[39m \u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 763\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 764\u001b[0m exc_info \u001b[38;5;241m=\u001b[39m sys\u001b[38;5;241m.\u001b[39mexc_info()\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/client.py:2025\u001b[0m, in \u001b[0;36mClient._gather\u001b[0;34m(self, futures, errors, direct, local_worker)\u001b[0m\n\u001b[1;32m 2023\u001b[0m exc \u001b[38;5;241m=\u001b[39m CancelledError(key)\n\u001b[1;32m 2024\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 2025\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exception\u001b[38;5;241m.\u001b[39mwith_traceback(traceback)\n\u001b[1;32m 2026\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc\n\u001b[1;32m 2027\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mskip\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
"\u001b[0;31mKilledWorker\u001b[0m: (\"('metadata-to-parquet-69009ad3ec5fb7b2e237fcddd27d22e2', 0)\", <WorkerState 'tls://10.4.3.14:41219', name: parquet-2022-2-1-worker-126f060ae1, status: closed, memory: 0, processing: 1>)"
]
}
],
"source": [
"with timer(\"Naive write 200 GB\"):\n",
"    ddf = timeseries(\n",
"        dtypes={\n",
"            **{f\"name-{i}\": str for i in range(25)},\n",
"            **{f\"price-{i}\": float for i in range(25)},\n",
"            **{f\"id-{i}\": int for i in range(25)},\n",
"            **{f\"cat-{i}\": \"category\" for i in range(25)},\n",
"        },\n",
"        start=\"2021-01-01\",\n",
"        end=\"2021-02-01\",\n",
"        freq=\"10ms\",\n",
"        partition_freq=\"1H\",\n",
"    )\n",
"    ddf.to_parquet(\"s3://dask-io/parquet-performance-200GB/\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "441d824c-399e-4ac0-85de-9cd98ee643d6",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"### Task Group timing"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
"    .dataframe tbody tr th:only-of-type {\n",
"        vertical-align: middle;\n",
"    }\n",
"\n",
"    .dataframe tbody tr th {\n",
"        vertical-align: top;\n",
"    }\n",
"\n",
"    .dataframe thead th {\n",
"        text-align: right;\n",
"    }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>prefix</th>\n",
"      <th>duration</th>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>group</th>\n",
"      <th></th>\n",
"      <th></th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>to-parquet-69009ad3ec5fb7b2e237fcddd27d22e2</th>\n",
"      <td>to-parquet</td>\n",
"      <td>37270.498014</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"                                                 prefix      duration\n",
"group                                                                \n",
"to-parquet-69009ad3ec5fb7b2e237fcddd27d22e2  to-parquet  37270.498014"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Clean up the partially-written output from the failed run\n",
"try:\n",
"    fs = s3fs.S3FileSystem()\n",
"    fs.rm(\"s3://dask-io/parquet-performance-200GB/\", recursive=True)\n",
"except Exception:\n",
"    pass\n",
"\n",
"display(Markdown(\"### Task Group timing\"))\n",
"get_tg_data(client)"
]
},
{
"cell_type": "markdown",
"id": "7af0dc12-c01d-4b71-a8a4-052d18cb3fe4",
"metadata": {},
"source": [
"## Test #3: writing terabyte-scale data\n",
"\n",
"Let's go bigger, writing ~2TB of data with ~8,800 partitions (one per hour for a year), this time specifying `write_metadata_file=False`, which is the default in newer versions."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "d36a1ca5-c1b8-4e9a-87ea-f382b78ee7a5",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"**Writing 2TB of data** took **937.07** seconds"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"client.restart()\n",
"\n",
"ddf = timeseries(\n",
"    dtypes={\n",
"        **{f\"name-{i}\": str for i in range(25)},\n",
"        **{f\"price-{i}\": float for i in range(25)},\n",
"        **{f\"id-{i}\": int for i in range(25)},\n",
"        **{f\"cat-{i}\": \"category\" for i in range(25)},\n",
"    },\n",
"    start=\"2021-01-01\",\n",
"    end=\"2022-01-01\",\n",
"    freq=\"10ms\",\n",
"    partition_freq=\"1H\",\n",
")\n",
"\n",
"with timer(\"Writing 2TB of data\"):\n",
"    ddf.to_parquet(\n",
"        \"s3://dask-io/parquet-performance-2TB/\",\n",
"        engine=\"pyarrow\",\n",
"        write_metadata_file=False,\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7febef9b-e228-41aa-a034-8f065af32942",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"### Task Group timing"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
"    .dataframe tbody tr th:only-of-type {\n",
"        vertical-align: middle;\n",
"    }\n",
"\n",
"    .dataframe tbody tr th {\n",
"        vertical-align: top;\n",
"    }\n",
"\n",
"    .dataframe thead th {\n",
"        text-align: right;\n",
"    }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>prefix</th>\n",
"      <th>duration</th>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>group</th>\n",
"      <th></th>\n",
"      <th></th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>to-parquet-478097459de7526b73d5d29490a0c6f0</th>\n",
"      <td>to-parquet</td>\n",
"      <td>181527.293594</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>metadata-to-parquet-478097459de7526b73d5d29490a0c6f0</th>\n",
"      <td>metadata-to-parquet</td>\n",
"      <td>3.640549</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"                                                                 prefix  \\\n",
"group                                                                     \n",
"to-parquet-478097459de7526b73d5d29490a0c6f0                  to-parquet   \n",
"metadata-to-parquet-478097459de7526b73d5d29490a...  metadata-to-parquet   \n",
"\n",
"                                                         duration  \n",
"group                                                              \n",
"to-parquet-478097459de7526b73d5d29490a0c6f0         181527.293594  \n",
"metadata-to-parquet-478097459de7526b73d5d29490a...       3.640549  "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"display(Markdown(\"### Task Group timing\"))\n",
"get_tg_data(client)"
]
},
{
"cell_type": "markdown",
"id": "9148e115-0b4b-4553-ae2c-e0e7cc5f7d5d",
"metadata": {},
"source": [
"## Test #4: an ETL workflow on the previous data"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "088541d7-7837-41f9-ab16-29d0d7f95320",
"metadata": {},
"outputs": [],
"source": [
"def transform(df):\n",
"    \"\"\"\n",
"    Transform the data: uppercase strings, double ints, halve floats.\n",
"    \"\"\"\n",
"    transform = {}\n",
"    for c in df.columns:\n",
"        dtype = str(df[c].dtype)\n",
"        if dtype == \"object\":\n",
"            transform[c] = df[c].str.upper()\n",
"        elif dtype == \"int64\":\n",
"            transform[c] = df[c] * 2\n",
"        elif dtype == \"float64\":\n",
"            transform[c] = df[c] / 2\n",
"    return df.assign(**transform)"
]
},
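{
"cell_type": "markdown",
"id": "a9b3c1d5-4e2f-4b08-9d67-1c8e5f2a7b30",
"metadata": {},
"source": [
"Before (optionally) mapping `transform` over the cluster, we can sanity-check it locally on a tiny made-up pandas frame; this toy frame plays no part in the benchmark timings."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f2c8e6a4-3d19-47b5-8e02-6a9d4b1c7e58",
"metadata": {},
"outputs": [],
"source": [
"# A tiny made-up frame covering the three dtypes that transform handles\n",
"small = pandas.DataFrame({\"name-0\": [\"a\", \"b\"], \"id-0\": [1, 2], \"price-0\": [2.0, 4.0]})\n",
"transform(small)  # strings uppercased, ints doubled, floats halved"
]
},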
{
"cell_type": "code",
"execution_count": 13,
"id": "235263ec-d809-4154-ac7f-be8fbbf4255a",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"**ETL 2TB** took **1026.25** seconds"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"client.restart()\n",
"\n",
"with timer(\"ETL 2TB\"):\n",
"    ddf2 = dd.read_parquet(\"s3://dask-io/parquet-performance-2TB/\", engine=\"pyarrow\")\n",
"    # This transform does pretty badly! Probably GIL related, and may be fixed\n",
"    # by pyarrow strings.\n",
"    # ddf3 = ddf2.map_partitions(transform, meta=ddf2._meta)\n",
"    ddf3 = ddf2.assign(date=pandas.Timestamp.now())\n",
"    ddf3.to_parquet(\n",
"        \"s3://dask-io/parquet-performance-2TB-transform/\",\n",
"        engine=\"pyarrow\",\n",
"        write_metadata_file=False,\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "3c841703-f101-4cbf-b694-2c259aded2bf",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"### Task Group timing"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
"    .dataframe tbody tr th:only-of-type {\n",
"        vertical-align: middle;\n",
"    }\n",
"\n",
"    .dataframe tbody tr th {\n",
"        vertical-align: top;\n",
"    }\n",
"\n",
"    .dataframe thead th {\n",
"        text-align: right;\n",
"    }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>prefix</th>\n",
"      <th>duration</th>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>group</th>\n",
"      <th></th>\n",
"      <th></th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>to-parquet-95473033641b0cab61c08dcb81033d9e</th>\n",
"      <td>to-parquet</td>\n",
"      <td>199761.530804</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>metadata-to-parquet-95473033641b0cab61c08dcb81033d9e</th>\n",
"      <td>metadata-to-parquet</td>\n",
"      <td>3.647704</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"                                                                 prefix  \\\n",
"group                                                                     \n",
"to-parquet-95473033641b0cab61c08dcb81033d9e                  to-parquet   \n",
"metadata-to-parquet-95473033641b0cab61c08dcb810...  metadata-to-parquet   \n",
"\n",
"                                                         duration  \n",
"group                                                              \n",
"to-parquet-95473033641b0cab61c08dcb81033d9e         199761.530804  \n",
"metadata-to-parquet-95473033641b0cab61c08dcb810...       3.647704  "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"display(Markdown(\"### Task Group timing\"))\n",
"get_tg_data(client)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "011c2e61-d6b2-48f9-9559-075fcdfde653",
"metadata": {},
"outputs": [],
"source": [
"# Cleanup\n",
"try:\n",
"    fs = s3fs.S3FileSystem()\n",
"    fs.rm(\"s3://dask-io/parquet-performance-2TB/\", recursive=True)\n",
"    fs.rm(\"s3://dask-io/parquet-performance-2TB-transform/\", recursive=True)\n",
"except Exception:\n",
"    pass\n",
"\n",
"client.close()\n",
"cluster.close()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
} |