{
"cells": [
{
"cell_type": "markdown",
"id": "5ca5061d-4d62-4212-8d33-efd64655c3a0",
"metadata": {
"tags": []
},
"source": [
"# Parquet Performance Comparisons\n",
"\n",
"In March 2022 we kicked off an effort to improve the parquet user experience in Dask.\n",
"This notebook is intended to measure how we did, both in terms of improved default parameters and in terms of raw performance."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2af778b2-2a1a-4d8f-baae-56d2d54640d2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'2022.02.1'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import contextlib\n",
"import datetime  # used by the scheduler plugin below\n",
"import time\n",
"\n",
"import coiled\n",
"import dask\n",
"import dask.dataframe as dd\n",
"import distributed\n",
"import pandas\n",
"import s3fs\n",
"from dask.datasets import timeseries\n",
"\n",
"dask.__version__"
]
},
{
"cell_type": "markdown",
"id": "b474bd73-f8e9-4d16-a99f-010894a2e630",
"metadata": {
"tags": []
},
"source": [
"## Utilities and setup\n",
"\n",
"Here we create two utilities: one for collecting task group timing information from the scheduler (roughly speaking, thread-seconds: task durations are summed across worker threads, so they can exceed wall-clock time), and one for measuring wall-clock time from the client."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b89843d6-fb86-4e09-b2fa-71b407b0dbd0",
"metadata": {},
"outputs": [],
"source": [
"from distributed.diagnostics import SchedulerPlugin\n",
"from distributed.utils import key_split, key_split_group\n",
"\n",
"class TaskGroupStatistics(SchedulerPlugin):\n",
"    \"\"\"\n",
"    A plugin for collecting task group timing information\n",
"    from the scheduler.\n",
"    \"\"\"\n",
"    def __init__(self):\n",
"        \"\"\"Initialize the plugin\"\"\"\n",
"        self.groups = {}\n",
"        self.scheduler = None\n",
"\n",
"    def start(self, scheduler):\n",
"        \"\"\"Called on scheduler start as well as at registration time\"\"\"\n",
"        self.scheduler = scheduler\n",
"        scheduler.handlers[\"get_task_groups\"] = self.get_task_groups\n",
"\n",
"    def transition(self, key, start, finish, *args, **kwargs):\n",
"        \"\"\"On key transition to memory, update the task group data\"\"\"\n",
"        if self.scheduler is None:\n",
"            # Should not get here if initialization has happened correctly\n",
"            return\n",
"\n",
"        if start == \"processing\" and finish == \"memory\":\n",
"            prefix_name = key_split(key)\n",
"            group_name = key_split_group(key)\n",
"\n",
"            if group_name not in self.groups:\n",
"                self.groups[group_name] = {}\n",
"\n",
"            group = self.scheduler.task_groups[group_name]\n",
"            self.groups[group_name][\"prefix\"] = prefix_name\n",
"            self.groups[group_name][\"duration\"] = group.duration\n",
"            self.groups[group_name][\"start\"] = str(\n",
"                datetime.datetime.fromtimestamp(group.start)\n",
"            )\n",
"            self.groups[group_name][\"stop\"] = str(\n",
"                datetime.datetime.fromtimestamp(group.stop)\n",
"            )\n",
"            self.groups[group_name][\"nbytes\"] = group.nbytes_total\n",
"\n",
"    async def get_task_groups(self, comm):\n",
"        return self.groups\n",
"\n",
"    def restart(self, scheduler):\n",
"        self.groups = {}\n",
"\n",
"def get_tg_data(client):\n",
"    \"\"\"Fetch the collected task group data as a pandas DataFrame\"\"\"\n",
"    tg_data = client.sync(client.scheduler.get_task_groups)\n",
"\n",
"    df = pandas.DataFrame.from_dict(tg_data, orient=\"index\")\n",
"    df.index.name = \"group\"\n",
"    return df"
]
},
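{
"cell_type": "markdown",
"id": "3f0a9c2e-1b7d-4e85-9c21-aaf0d1c5e901",
"metadata": {},
"source": [
"As a quick illustration of the key-splitting helpers the plugin relies on: `key_split` extracts a task prefix, while `key_split_group` extracts a task group name. The example key below is made up for demonstration and plays no part in the benchmarks."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d41f2aa-6c3e-4f09-b7d2-5b1e9c0d4a22",
"metadata": {},
"outputs": [],
"source": [
"# A made-up key of the kind the scheduler passes to the plugin's transition hook\n",
"example_key = \"('read-parquet-31043b51c86561bc40fb6007a1fdc33a', 0)\"\n",
"key_split(example_key), key_split_group(example_key)"
]
},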
{
"cell_type": "code",
"execution_count": 3,
"id": "780290e6-2626-47b5-aed3-7aca17581fe6",
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Markdown, display\n",
"\n",
"@contextlib.contextmanager\n",
"def timer(label=\"Block\"):\n",
"    \"\"\"\n",
"    Time a block of code and print out the result when done.\n",
"    \"\"\"\n",
"    start = time.time()\n",
"    yield\n",
"    end = time.time()\n",
"    display(Markdown(f\"**{label}** took **{end-start:.2f}** seconds\"))"
]
},
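{
"cell_type": "markdown",
"id": "c7e2b4d9-0f13-4a6e-8b55-2d9e7f1a3c60",
"metadata": {},
"source": [
"A quick local smoke test of the `timer` helper (the timed block just sleeps, so it should report roughly one second):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5a1d3f7-92b8-4c4e-a1d0-7f3b6c2e8d14",
"metadata": {},
"outputs": [],
"source": [
"# Trivial local check of the timer context manager\n",
"with timer(\"Sleeping for one second\"):\n",
"    time.sleep(1)"
]
},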
{
"cell_type": "markdown",
"id": "82c7e406-dd6a-4142-a2fa-ba843b3799a5",
"metadata": {},
"source": [
"## Software environments\n",
"\n",
"Create two software environments: one pinned to the February release (2022.2.1) and one pinned to the May release (2022.5.0)."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f5075232-55cb-44a9-a74f-bff37eaff538",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found existing software environment build, returning\n",
"Found existing software environment build, returning\n"
]
}
],
"source": [
"old = \"2022.2.1\"\n",
"new = \"2022.5.0\"\n",
"\n",
"coiled.create_software_environment(\n",
"    f\"parquet-{old.replace('.', '-')}\",\n",
"    conda={\n",
"        \"channels\": [\"conda-forge\"],\n",
"        \"dependencies\": [\n",
"            \"python=3.9\",\n",
"            f\"dask=={old}\",\n",
"            f\"distributed=={old}\",\n",
"            \"s3fs\",\n",
"            \"pyarrow=7\",\n",
"            \"fastparquet=0.8.0\",\n",
"        ],\n",
"    },\n",
")\n",
"\n",
"coiled.create_software_environment(\n",
"    f\"parquet-{new.replace('.', '-')}\",\n",
"    conda={\n",
"        \"channels\": [\"conda-forge\"],\n",
"        \"dependencies\": [\n",
"            \"python=3.9\",\n",
"            f\"dask=={new}\",\n",
"            f\"distributed=={new}\",\n",
"            \"s3fs\",\n",
"            \"pyarrow=7\",\n",
"            \"fastparquet=0.8.1\",\n",
"        ],\n",
"    },\n",
")\n",
"\n",
"# Run the rest of the notebook against the old release first\n",
"kind = old"
]
},
{
"cell_type": "markdown",
"id": "0be911c1-4ad3-4ecd-920f-d83eb3715d84",
"metadata": {
"tags": []
},
"source": [
"## Test #1: naive data read\n",
"\n",
"Let's read a single year of NYC taxi parquet data from Ursa Labs with no changes to default parameters (except to specify that the engine is `pyarrow`, so we can make apples-to-apples comparisons)."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a07a30f6-0c8a-4e96-9f49-125f31bd8423",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f139c042e490473296d08f5921da2f85",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Output()"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ian/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/client.py:1278: VersionMismatchWarning: Mismatched versions found\n",
"\n",
"+---------+--------+-----------+---------+\n",
"| Package | client | scheduler | workers |\n",
"+---------+--------+-----------+---------+\n",
"| lz4     | 4.0.0  | None      | None    |\n",
"+---------+--------+-----------+---------+\n",
"  warnings.warn(version_module.VersionMismatchWarning(msg[0][\"warning\"]))\n"
]
},
{
"data": {
"text/markdown": [
"**Naive read** took **136.77** seconds"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"### Task Group timing"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
"    .dataframe tbody tr th:only-of-type {\n",
"        vertical-align: middle;\n",
"    }\n",
"\n",
"    .dataframe tbody tr th {\n",
"        vertical-align: top;\n",
"    }\n",
"\n",
"    .dataframe thead th {\n",
"        text-align: right;\n",
"    }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>prefix</th>\n",
"      <th>duration</th>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>group</th>\n",
"      <th></th>\n",
"      <th></th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>read-parquet-31043b51c86561bc40fb6007a1fdc33a</th>\n",
"      <td>read-parquet</td>\n",
"      <td>1426.325892</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"                                                     prefix     duration\n",
"group                                                                   \n",
"read-parquet-31043b51c86561bc40fb6007a1fdc33a  read-parquet  1426.325892"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [ | |
"# Create a separate cluster for ursa data in a the same region\n", | |
"cluster = coiled.Cluster(\n", | |
" name=f\"parquet-{kind.replace('.', '-')}\",\n", | |
" software=f\"parquet-{kind.replace('.', '-')}\",\n", | |
" n_workers=12,\n", | |
" worker_vm_types=[\"t3.xlarge\"],\n", | |
" scheduler_vm_types=[\"t3.large\"],\n", | |
" backend_options={\"region\": \"us-east-2\"},\n", | |
")\n", | |
" \n", | |
"client = distributed.Client(cluster)\n", | |
"client.register_scheduler_plugin(TaskGroupStatistics())\n", | |
"\n", | |
"# Note: reads from `s3://ursa-labs-taxi-data` seem to be particularly slow!\n", | |
"# Other buckets don't seem to have that feature (including ones with what are\n", | |
"# notionally the same data). I don't understand why right now.\n", | |
"with timer(\"Naive read\"):\n", | |
" ddf = dd.read_parquet(\"s3://ursa-labs-taxi-data/2012/**.parquet\", engine=\"pyarrow\")\n", | |
"\n", | |
" ddf = ddf.persist()\n", | |
" distributed.wait(ddf)\n", | |
"\n", | |
"display(Markdown(\"### Task Group timing\"))\n", | |
"display(get_tg_data(client))\n", | |
"\n", | |
"client.close()\n", | |
"cluster.close()" | |
] | |
}, | |
{
"cell_type": "markdown",
"id": "951d679f-0b7c-411a-ba3e-34159cff11ec",
"metadata": {
"tags": []
},
"source": [
"## Create a new shared cluster\n",
"\n",
"We create it in `us-east-1` to have fast communication with our test S3 bucket `s3://dask-io`."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c1519363-da48-48ba-aba6-197a8ec0b00b",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"Running with **Dask==2022.2.1**"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "050e488c039e4800b9994841d9a060ce",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Output()"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ian/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/client.py:1278: VersionMismatchWarning: Mismatched versions found\n",
"\n",
"+---------+--------+-----------+---------+\n",
"| Package | client | scheduler | workers |\n",
"+---------+--------+-----------+---------+\n",
"| lz4     | 4.0.0  | None      | None    |\n",
"+---------+--------+-----------+---------+\n",
"  warnings.warn(version_module.VersionMismatchWarning(msg[0][\"warning\"]))\n"
]
}
],
"source": [
"display(Markdown(f\"Running with **Dask=={kind}**\"))\n",
"\n",
"cluster = coiled.Cluster(\n",
"    name=f\"parquet-{kind.replace('.', '-')}\",\n",
"    software=f\"parquet-{kind.replace('.', '-')}\",\n",
"    n_workers=25,\n",
"    worker_vm_types=[\"m5.2xlarge\"],\n",
"    scheduler_vm_types=[\"t3.large\"],\n",
"    backend_options={\"region\": \"us-east-1\"},\n",
")\n",
"\n",
"client = distributed.Client(cluster)\n",
"client.register_scheduler_plugin(TaskGroupStatistics())"
]
},
{
"cell_type": "markdown",
"id": "560617ac-ed8d-4fa6-88d4-7ff8b213d4b9",
"metadata": {},
"source": [
"## Test #2: naive data write\n",
"\n",
"Let's write a ~200 GB, ~700-partition dataset with 100 columns. In older Dask versions, writing the shared `_metadata` file overwhelms and kills workers, resulting in a frustrating failure at the very end of the computation."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c825ae34-7a90-473f-b3f6-81defaf2192c",
"metadata": {},
"outputs": [
{
"ename": "KilledWorker",
"evalue": "(\"('metadata-to-parquet-69009ad3ec5fb7b2e237fcddd27d22e2', 0)\", <WorkerState 'tls://10.4.3.14:41219', name: parquet-2022-2-1-worker-126f060ae1, status: closed, memory: 0, processing: 1>)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKilledWorker\u001b[0m Traceback (most recent call last)",
"Input \u001b[0;32mIn [7]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m timer(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNaive write 200 GB\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 2\u001b[0m ddf \u001b[38;5;241m=\u001b[39m timeseries(\n\u001b[1;32m 3\u001b[0m dtypes\u001b[38;5;241m=\u001b[39m{\n\u001b[1;32m 4\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m{\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname-\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;241m25\u001b[39m)},\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 12\u001b[0m partition_freq\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m1H\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 13\u001b[0m )\n\u001b[0;32m---> 14\u001b[0m \u001b[43mddf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_parquet\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43ms3://dask-io/parquet-performance-200GB/\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/dask/dataframe/core.py:4825\u001b[0m, in \u001b[0;36mDataFrame.to_parquet\u001b[0;34m(self, path, *args, **kwargs)\u001b[0m\n\u001b[1;32m 4822\u001b[0m \u001b[38;5;124;03m\"\"\"See dd.to_parquet docstring for more information\"\"\"\u001b[39;00m\n\u001b[1;32m 4823\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mio\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m to_parquet\n\u001b[0;32m-> 4825\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mto_parquet\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:840\u001b[0m, in \u001b[0;36mto_parquet\u001b[0;34m(df, path, engine, compression, write_index, append, overwrite, ignore_divisions, partition_on, storage_options, custom_metadata, write_metadata_file, compute, compute_kwargs, schema, name_function, **kwargs)\u001b[0m\n\u001b[1;32m 838\u001b[0m graph \u001b[38;5;241m=\u001b[39m HighLevelGraph\u001b[38;5;241m.\u001b[39mfrom_collections(meta_name, dsk, dependencies\u001b[38;5;241m=\u001b[39m(data_write,))\n\u001b[1;32m 839\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m compute:\n\u001b[0;32m--> 840\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcompute_as_if_collection\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 841\u001b[0m \u001b[43m \u001b[49m\u001b[43mScalar\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgraph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmeta_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcompute_kwargs\u001b[49m\n\u001b[1;32m 842\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 843\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 844\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m Scalar(graph, meta_name, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/dask/base.py:317\u001b[0m, in \u001b[0;36mcompute_as_if_collection\u001b[0;34m(cls, dsk, keys, scheduler, get, **kwargs)\u001b[0m\n\u001b[1;32m 315\u001b[0m schedule \u001b[38;5;241m=\u001b[39m get_scheduler(scheduler\u001b[38;5;241m=\u001b[39mscheduler, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mcls\u001b[39m, get\u001b[38;5;241m=\u001b[39mget)\n\u001b[1;32m 316\u001b[0m dsk2 \u001b[38;5;241m=\u001b[39m optimization_function(\u001b[38;5;28mcls\u001b[39m)(dsk, keys, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 317\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mschedule\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdsk2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/client.py:3010\u001b[0m, in \u001b[0;36mClient.get\u001b[0;34m(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)\u001b[0m\n\u001b[1;32m 3008\u001b[0m should_rejoin \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 3009\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3010\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgather\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpacked\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43masynchronous\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43masynchronous\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdirect\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdirect\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3011\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 3012\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m futures\u001b[38;5;241m.\u001b[39mvalues():\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/client.py:2162\u001b[0m, in \u001b[0;36mClient.gather\u001b[0;34m(self, futures, errors, direct, asynchronous)\u001b[0m\n\u001b[1;32m 2160\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2161\u001b[0m local_worker \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 2162\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msync\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2163\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_gather\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2164\u001b[0m \u001b[43m \u001b[49m\u001b[43mfutures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2165\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2166\u001b[0m \u001b[43m \u001b[49m\u001b[43mdirect\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdirect\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2167\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_worker\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_worker\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2168\u001b[0m \u001b[43m \u001b[49m\u001b[43masynchronous\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43masynchronous\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2169\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/utils.py:311\u001b[0m, in \u001b[0;36mSyncMethodMixin.sync\u001b[0;34m(self, func, asynchronous, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m future\n\u001b[1;32m 310\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 311\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msync\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 312\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcallback_timeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallback_timeout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 313\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/utils.py:378\u001b[0m, in \u001b[0;36msync\u001b[0;34m(loop, func, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 376\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error:\n\u001b[1;32m 377\u001b[0m typ, exc, tb \u001b[38;5;241m=\u001b[39m error\n\u001b[0;32m--> 378\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc\u001b[38;5;241m.\u001b[39mwith_traceback(tb)\n\u001b[1;32m 379\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 380\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/utils.py:351\u001b[0m, in \u001b[0;36msync.<locals>.f\u001b[0;34m()\u001b[0m\n\u001b[1;32m 349\u001b[0m future \u001b[38;5;241m=\u001b[39m asyncio\u001b[38;5;241m.\u001b[39mwait_for(future, callback_timeout)\n\u001b[1;32m 350\u001b[0m future \u001b[38;5;241m=\u001b[39m asyncio\u001b[38;5;241m.\u001b[39mensure_future(future)\n\u001b[0;32m--> 351\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m future\n\u001b[1;32m 352\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 353\u001b[0m error \u001b[38;5;241m=\u001b[39m sys\u001b[38;5;241m.\u001b[39mexc_info()\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/tornado/gen.py:762\u001b[0m, in \u001b[0;36mRunner.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 759\u001b[0m exc_info \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 761\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 762\u001b[0m value \u001b[38;5;241m=\u001b[39m \u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 763\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 764\u001b[0m exc_info \u001b[38;5;241m=\u001b[39m sys\u001b[38;5;241m.\u001b[39mexc_info()\n",
"File \u001b[0;32m~/miniconda3/envs/parquet/lib/python3.9/site-packages/distributed/client.py:2025\u001b[0m, in \u001b[0;36mClient._gather\u001b[0;34m(self, futures, errors, direct, local_worker)\u001b[0m\n\u001b[1;32m 2023\u001b[0m exc \u001b[38;5;241m=\u001b[39m CancelledError(key)\n\u001b[1;32m 2024\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 2025\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exception\u001b[38;5;241m.\u001b[39mwith_traceback(traceback)\n\u001b[1;32m 2026\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc\n\u001b[1;32m 2027\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mskip\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
"\u001b[0;31mKilledWorker\u001b[0m: (\"('metadata-to-parquet-69009ad3ec5fb7b2e237fcddd27d22e2', 0)\", <WorkerState 'tls://10.4.3.14:41219', name: parquet-2022-2-1-worker-126f060ae1, status: closed, memory: 0, processing: 1>)"
]
}
],
"source": [
"with timer(\"Naive write 200 GB\"):\n",
"    ddf = timeseries(\n",
"        dtypes={\n",
"            **{f\"name-{i}\": str for i in range(25)},\n",
"            **{f\"price-{i}\": float for i in range(25)},\n",
"            **{f\"id-{i}\": int for i in range(25)},\n",
"            **{f\"cat-{i}\": \"category\" for i in range(25)},\n",
"        },\n",
"        start=\"2021-01-01\",\n",
"        end=\"2021-02-01\",\n",
"        freq=\"10ms\",\n",
"        partition_freq=\"1H\",\n",
"    )\n",
"    ddf.to_parquet(\"s3://dask-io/parquet-performance-200GB/\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "441d824c-399e-4ac0-85de-9cd98ee643d6",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"### Task Group timing"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
"    .dataframe tbody tr th:only-of-type {\n",
"        vertical-align: middle;\n",
"    }\n",
"\n",
"    .dataframe tbody tr th {\n",
"        vertical-align: top;\n",
"    }\n",
"\n",
"    .dataframe thead th {\n",
"        text-align: right;\n",
"    }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>prefix</th>\n",
"      <th>duration</th>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>group</th>\n",
"      <th></th>\n",
"      <th></th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>to-parquet-69009ad3ec5fb7b2e237fcddd27d22e2</th>\n",
"      <td>to-parquet</td>\n",
"      <td>37270.498014</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"                                                 prefix      duration\n",
"group                                                                \n",
"to-parquet-69009ad3ec5fb7b2e237fcddd27d22e2  to-parquet  37270.498014"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Clean up the partially-written output from the failed run\n",
"try:\n",
"    fs = s3fs.S3FileSystem()\n",
"    fs.rm(\"s3://dask-io/parquet-performance-200GB/\", recursive=True)\n",
"except Exception:\n",
"    pass\n",
"\n",
"display(Markdown(\"### Task Group timing\"))\n",
"get_tg_data(client)"
]
},
{
"cell_type": "markdown",
"id": "7af0dc12-c01d-4b71-a8a4-052d18cb3fe4",
"metadata": {},
"source": [
"## Test #3: writing terabyte-scale data\n",
"\n",
"Let's go bigger, writing ~2TB of data with ~8,800 partitions (one per hour for a year), this time specifying `write_metadata_file=False`, which is the default in newer versions."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "d36a1ca5-c1b8-4e9a-87ea-f382b78ee7a5",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"**Writing 2TB of data** took **937.07** seconds"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"client.restart()\n",
"\n",
"ddf = timeseries(\n",
"    dtypes={\n",
"        **{f\"name-{i}\": str for i in range(25)},\n",
"        **{f\"price-{i}\": float for i in range(25)},\n",
"        **{f\"id-{i}\": int for i in range(25)},\n",
"        **{f\"cat-{i}\": \"category\" for i in range(25)},\n",
"    },\n",
"    start=\"2021-01-01\",\n",
"    end=\"2022-01-01\",\n",
"    freq=\"10ms\",\n",
"    partition_freq=\"1H\",\n",
")\n",
"\n",
"with timer(\"Writing 2TB of data\"):\n",
"    ddf.to_parquet(\n",
"        \"s3://dask-io/parquet-performance-2TB/\",\n",
"        engine=\"pyarrow\",\n",
"        write_metadata_file=False,\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7febef9b-e228-41aa-a034-8f065af32942",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"### Task Group timing"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
"    .dataframe tbody tr th:only-of-type {\n",
"        vertical-align: middle;\n",
"    }\n",
"\n",
"    .dataframe tbody tr th {\n",
"        vertical-align: top;\n",
"    }\n",
"\n",
"    .dataframe thead th {\n",
"        text-align: right;\n",
"    }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>prefix</th>\n",
"      <th>duration</th>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>group</th>\n",
"      <th></th>\n",
"      <th></th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>to-parquet-478097459de7526b73d5d29490a0c6f0</th>\n",
"      <td>to-parquet</td>\n",
"      <td>181527.293594</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>metadata-to-parquet-478097459de7526b73d5d29490a0c6f0</th>\n",
"      <td>metadata-to-parquet</td>\n",
"      <td>3.640549</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"                                                                 prefix  \\\n",
"group                                                                     \n",
"to-parquet-478097459de7526b73d5d29490a0c6f0                  to-parquet   \n",
"metadata-to-parquet-478097459de7526b73d5d29490a...  metadata-to-parquet   \n",
"\n",
"                                                         duration  \n",
"group                                                              \n",
"to-parquet-478097459de7526b73d5d29490a0c6f0         181527.293594  \n",
"metadata-to-parquet-478097459de7526b73d5d29490a...       3.640549  "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"display(Markdown(\"### Task Group timing\"))\n",
"get_tg_data(client)"
]
},
{
"cell_type": "markdown",
"id": "9148e115-0b4b-4553-ae2c-e0e7cc5f7d5d",
"metadata": {},
"source": [
"## Test #4: an ETL workflow on the previous data"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "088541d7-7837-41f9-ab16-29d0d7f95320",
"metadata": {},
"outputs": [],
"source": [
"def transform(df):\n",
"    \"\"\"\n",
"    Transform the data: uppercase strings, double ints, halve floats.\n",
"    \"\"\"\n",
"    transform = {}\n",
"    for c in df.columns:\n",
"        dtype = str(df[c].dtype)\n",
"        if dtype == \"object\":\n",
"            transform[c] = df[c].str.upper()\n",
"        elif dtype == \"int64\":\n",
"            transform[c] = df[c] * 2\n",
"        elif dtype == \"float64\":\n",
"            transform[c] = df[c] / 2\n",
"    return df.assign(**transform)"
]
},
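{
"cell_type": "markdown",
"id": "a9b3c1d5-4e2f-4b08-9d67-1c8e5f2a7b30",
"metadata": {},
"source": [
"Before (optionally) mapping `transform` over the cluster, we can sanity-check it locally on a tiny made-up pandas frame; this toy frame plays no part in the benchmark timings."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f2c8e6a4-3d19-47b5-8e02-6a9d4b1c7e58",
"metadata": {},
"outputs": [],
"source": [
"# A tiny made-up frame covering the three dtypes that transform handles\n",
"small = pandas.DataFrame({\"name-0\": [\"a\", \"b\"], \"id-0\": [1, 2], \"price-0\": [2.0, 4.0]})\n",
"transform(small)  # strings uppercased, ints doubled, floats halved"
]
},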
{
"cell_type": "code",
"execution_count": 13,
"id": "235263ec-d809-4154-ac7f-be8fbbf4255a",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"**ETL 2TB** took **1026.25** seconds"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"client.restart()\n",
"\n",
"with timer(\"ETL 2TB\"):\n",
"    ddf2 = dd.read_parquet(\"s3://dask-io/parquet-performance-2TB/\", engine=\"pyarrow\")\n",
"    # This transform does pretty badly! Probably GIL related, and may be fixed\n",
"    # by pyarrow strings.\n",
"    # ddf3 = ddf2.map_partitions(transform, meta=ddf2._meta)\n",
"    ddf3 = ddf2.assign(date=pandas.Timestamp.now())\n",
"    ddf3.to_parquet(\n",
"        \"s3://dask-io/parquet-performance-2TB-transform/\",\n",
"        engine=\"pyarrow\",\n",
"        write_metadata_file=False,\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "3c841703-f101-4cbf-b694-2c259aded2bf",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"### Task Group timing"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
"    .dataframe tbody tr th:only-of-type {\n",
"        vertical-align: middle;\n",
"    }\n",
"\n",
"    .dataframe tbody tr th {\n",
"        vertical-align: top;\n",
"    }\n",
"\n",
"    .dataframe thead th {\n",
"        text-align: right;\n",
"    }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>prefix</th>\n",
"      <th>duration</th>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>group</th>\n",
"      <th></th>\n",
"      <th></th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>to-parquet-95473033641b0cab61c08dcb81033d9e</th>\n",
"      <td>to-parquet</td>\n",
"      <td>199761.530804</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>metadata-to-parquet-95473033641b0cab61c08dcb81033d9e</th>\n",
"      <td>metadata-to-parquet</td>\n",
"      <td>3.647704</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"                                                                 prefix  \\\n",
"group                                                                     \n",
"to-parquet-95473033641b0cab61c08dcb81033d9e                  to-parquet   \n",
"metadata-to-parquet-95473033641b0cab61c08dcb810...  metadata-to-parquet   \n",
"\n",
"                                                         duration  \n",
"group                                                              \n",
"to-parquet-95473033641b0cab61c08dcb81033d9e         199761.530804  \n",
"metadata-to-parquet-95473033641b0cab61c08dcb810...       3.647704  "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"display(Markdown(\"### Task Group timing\"))\n",
"get_tg_data(client)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "011c2e61-d6b2-48f9-9559-075fcdfde653",
"metadata": {},
"outputs": [],
"source": [
"# Cleanup\n",
"try:\n",
"    fs = s3fs.S3FileSystem()\n",
"    fs.rm(\"s3://dask-io/parquet-performance-2TB/\", recursive=True)\n",
"    fs.rm(\"s3://dask-io/parquet-performance-2TB-transform/\", recursive=True)\n",
"except Exception:\n",
"    pass\n",
"\n",
"client.close()\n",
"cluster.close()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
} |