pmav99 · September 8, 2023 08:18
diff --git a/append_to_parquet.ipynb b/append_to_parquet.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c8a4fdc4-18b1-445c-979d-ea4c65e9c18e",
   "metadata": {},
   "source": [
    "## Appending to parquet files\n",
    "\n",
    "Requires `engine == \"fastparquet\"`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3aed73de-de12-407c-9837-9ac3996bf722",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T08:18:10.141307Z",
     "iopub.status.busy": "2023-09-08T08:18:10.141091Z",
     "iopub.status.idle": "2023-09-08T08:18:10.404449Z",
     "shell.execute_reply": "2023-09-08T08:18:10.403983Z",
     "shell.execute_reply.started": "2023-09-08T08:18:10.141284Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import fastparquet"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b44c87d-66be-4c21-b9e7-dadf5cefd1fc",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-07T13:20:28.580984Z",
     "iopub.status.busy": "2023-09-07T13:20:28.580723Z",
     "iopub.status.idle": "2023-09-07T13:20:28.840869Z",
     "shell.execute_reply": "2023-09-07T13:20:28.840281Z",
     "shell.execute_reply.started": "2023-09-07T13:20:28.580965Z"
    },
    "tags": []
   },
   "source": [
    "Create data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "17c1f106-9299-4b55-b316-eac12d07c915",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T08:18:10.405753Z",
     "iopub.status.busy": "2023-09-08T08:18:10.405523Z",
     "iopub.status.idle": "2023-09-08T08:18:10.410473Z",
     "shell.execute_reply": "2023-09-08T08:18:10.410019Z",
     "shell.execute_reply.started": "2023-09-08T08:18:10.405736Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "filename = \"/tmp/df.parquet\"\n",
    "\n",
    "data = [1, 2, 3, 4, 5, 4, 4, 3, 2, 4]\n",
    "index1 = pd.date_range(\"2001-01-01T00:00:00\", \"2001-01-01T00:09:00\", freq=\"min\")\n",
    "index2 = pd.date_range(\"2001-01-02T00:00:00\", \"2001-01-02T00:09:00\", freq=\"min\")\n",
    "\n",
    "initial_df = pd.DataFrame(\n",
    "    data={\"rad\": data},\n",
    "    index=index1,\n",
    "    dtype=float,\n",
    ")\n",
    "initial_df.index.name = \"time\"\n",
    "initial_df.attrs.update(dict(a=1, b=2, c=3))\n",
    "\n",
    "to_be_appended_df = pd.DataFrame(\n",
    "    data={\"rad\": data},\n",
    "    index=index2,\n",
    "    dtype=int,\n",
    ")\n",
    "to_be_appended_df.index.name = \"time\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f4fd8d33-d064-4b90-90b8-e6b4bb3ba5cd",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-07T13:34:33.464552Z",
     "iopub.status.busy": "2023-09-07T13:34:33.464279Z",
     "iopub.status.idle": "2023-09-07T13:34:33.699552Z",
     "shell.execute_reply": "2023-09-07T13:34:33.698907Z",
     "shell.execute_reply.started": "2023-09-07T13:34:33.464530Z"
    },
    "tags": []
   },
   "source": [
    "Write initial file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f31da3ab-356e-4b84-bbf8-a633fa0eb2bd",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T08:18:10.411171Z",
     "iopub.status.busy": "2023-09-08T08:18:10.410998Z",
     "iopub.status.idle": "2023-09-08T08:18:10.647088Z",
     "shell.execute_reply": "2023-09-08T08:18:10.646477Z",
     "shell.execute_reply.started": "2023-09-08T08:18:10.411155Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-rw-r--r-- 1 panos panos 949 Sep  8 11:18 /tmp/df.parquet\n"
     ]
    }
   ],
   "source": [
    "!rm -rf {filename}\n",
    "initial_df.to_parquet(filename, engine=\"fastparquet\")\n",
    "!ls -lah {filename}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6d9866e0-d06b-4247-b93e-d33052067315",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-07T13:36:19.214367Z",
     "iopub.status.busy": "2023-09-07T13:36:19.214138Z",
     "iopub.status.idle": "2023-09-07T13:36:19.454709Z",
     "shell.execute_reply": "2023-09-07T13:36:19.453971Z",
     "shell.execute_reply.started": "2023-09-07T13:36:19.214347Z"
    },
    "tags": []
   },
   "source": [
    "Read data from disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "77ed6e89-916b-45a4-bf53-ccdb9f9d0dcf",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T08:18:10.648075Z",
     "iopub.status.busy": "2023-09-08T08:18:10.647910Z",
     "iopub.status.idle": "2023-09-08T08:18:10.744014Z",
     "shell.execute_reply": "2023-09-08T08:18:10.743527Z",
     "shell.execute_reply.started": "2023-09-08T08:18:10.648059Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "read_df = pd.read_parquet(filename)\n",
    "assert initial_df.equals(read_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "681207dd-c146-44c1-b252-9ba4650b1980",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-07T13:44:43.906015Z",
     "iopub.status.busy": "2023-09-07T13:44:43.905870Z",
     "iopub.status.idle": "2023-09-07T13:44:44.139911Z",
     "shell.execute_reply": "2023-09-07T13:44:44.138945Z",
     "shell.execute_reply.started": "2023-09-07T13:44:43.905997Z"
    },
    "tags": []
   },
   "source": [
    "### Warning! - Attributes don't seem to be preserved"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "30f725db-e149-4c68-94f0-86c31bc3f8e8",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T08:18:10.744632Z",
     "iopub.status.busy": "2023-09-08T08:18:10.744502Z",
     "iopub.status.idle": "2023-09-08T08:18:10.749195Z",
     "shell.execute_reply": "2023-09-08T08:18:10.748816Z",
     "shell.execute_reply.started": "2023-09-08T08:18:10.744610Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "read_df.attrs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2b16734-320a-4be6-bca9-acfb63646894",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-07T13:38:33.671948Z",
     "iopub.status.busy": "2023-09-07T13:38:33.671833Z",
     "iopub.status.idle": "2023-09-07T13:38:33.908914Z",
     "shell.execute_reply": "2023-09-07T13:38:33.908171Z",
     "shell.execute_reply.started": "2023-09-07T13:38:33.671936Z"
    },
    "tags": []
   },
   "source": [
    "### Append data to disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9fa09491-c95d-4b5e-8f85-a3a2e745b767",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T08:18:10.749816Z",
     "iopub.status.busy": "2023-09-08T08:18:10.749690Z",
     "iopub.status.idle": "2023-09-08T08:18:10.991365Z",
     "shell.execute_reply": "2023-09-08T08:18:10.990017Z",
     "shell.execute_reply.started": "2023-09-08T08:18:10.749804Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-rw-r--r-- 1 panos panos 949 Sep  8 11:18 /tmp/df.parquet\n",
      "-rw-r--r-- 1 panos panos 1.3K Sep  8 11:18 /tmp/df.parquet\n"
     ]
    }
   ],
   "source": [
    "!ls -lah {filename}\n",
    "to_be_appended_df.to_parquet(filename, engine=\"fastparquet\", append=True)\n",
    "!ls -lah {filename}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f988b009-9504-4a8e-803d-5a52a36e5a28",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T08:18:10.995429Z",
     "iopub.status.busy": "2023-09-08T08:18:10.995019Z",
     "iopub.status.idle": "2023-09-08T08:18:11.007706Z",
     "shell.execute_reply": "2023-09-08T08:18:11.006968Z",
     "shell.execute_reply.started": "2023-09-08T08:18:10.995395Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "read_appended_df = pd.read_parquet(filename)\n",
    "assert pd.concat([initial_df, to_be_appended_df]).equals(read_appended_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8cf4af61-1e10-4f61-b068-807c278b501c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-07T13:39:16.153860Z",
     "iopub.status.busy": "2023-09-07T13:39:16.153627Z",
     "iopub.status.idle": "2023-09-07T13:39:16.161716Z",
     "shell.execute_reply": "2023-09-07T13:39:16.161183Z",
     "shell.execute_reply.started": "2023-09-07T13:39:16.153840Z"
    },
    "tags": []
   },
   "source": [
    "## Warning\n",
    "\n",
    "It is possible to append multiple times the same data, i.e. you can end up with duplicates in the index.\n",
    "\n",
    "It is up to us to ensure that we have a monotonic index."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9e68162e-28a5-49e4-b75a-ed89ff3191a7",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T08:18:11.009039Z",
     "iopub.status.busy": "2023-09-08T08:18:11.008763Z",
     "iopub.status.idle": "2023-09-08T08:18:11.738002Z",
     "shell.execute_reply": "2023-09-08T08:18:11.737137Z",
     "shell.execute_reply.started": "2023-09-08T08:18:11.009014Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-rw-r--r-- 1 panos panos 1.3K Sep  8 11:18 /tmp/df.parquet\n",
      "-rw-r--r-- 1 panos panos 1.6K Sep  8 11:18 /tmp/df.parquet\n",
      "-rw-r--r-- 1 panos panos 1.9K Sep  8 11:18 /tmp/df.parquet\n",
      "-rw-r--r-- 1 panos panos 2.2K Sep  8 11:18 /tmp/df.parquet\n",
      "-rw-r--r-- 1 panos panos 2.5K Sep  8 11:18 /tmp/df.parquet\n",
      "-rw-r--r-- 1 panos panos 2.8K Sep  8 11:18 /tmp/df.parquet\n"
     ]
    }
   ],
   "source": [
    "for i in range(5):\n",
    "    !ls -lah {filename}\n",
    "    to_be_appended_df.to_parquet(filename, engine=\"fastparquet\", append=True)\n",
    "!ls -lah {filename}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a5d39d7d-8454-42b4-88f6-0a49fcdcd6f8",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-07T14:16:07.432437Z",
     "iopub.status.busy": "2023-09-07T14:16:07.432113Z",
     "iopub.status.idle": "2023-09-07T14:16:07.438124Z",
     "shell.execute_reply": "2023-09-07T14:16:07.437070Z",
     "shell.execute_reply.started": "2023-09-07T14:16:07.432412Z"
    },
    "tags": []
   },
   "source": [
    "### How parquet stores data to disk\n",
    "\n",
    "`parquet` stores data column-wise. The data of each column are stored in chunks named \"row-groups\".\n",
    "\n",
    "The default value for each \"row-group\" in `fastparquet` is 50_000_000, but this value can be changed,\n",
    "and, this value can be different for each \"row-group\".\n",
    "This can be useful if for example you want each \"row-group\" to contain the data of a whole year.\n",
    "\n",
    "###  Efficient reading of metadata\n",
    "\n",
    "Parquet files contain statistics for each \"row-group\". You can access these statistics like this:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "ac54109e-f3fb-4f38-953a-3f77a39cd138",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T08:18:11.739389Z",
     "iopub.status.busy": "2023-09-08T08:18:11.739165Z",
     "iopub.status.idle": "2023-09-08T08:18:11.745377Z",
     "shell.execute_reply": "2023-09-08T08:18:11.744861Z",
     "shell.execute_reply.started": "2023-09-08T08:18:11.739367Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['min', 'max', 'null_count', 'distinct_count'])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pf = fastparquet.ParquetFile(filename)\n",
    "pf.statistics.keys()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1a742201-6d78-4fe7-8c8b-12146e6527c1",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-07T14:16:07.432437Z",
     "iopub.status.busy": "2023-09-07T14:16:07.432113Z",
     "iopub.status.idle": "2023-09-07T14:16:07.438124Z",
     "shell.execute_reply": "2023-09-07T14:16:07.437070Z",
     "shell.execute_reply.started": "2023-09-07T14:16:07.432412Z"
    },
    "tags": []
   },
   "source": [
    "For example, the maximum value of the first two \"row-groups\" of column `rad` are:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "fd7f0b30-d8f4-4e8c-b50b-f103d699281d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T08:18:11.746539Z",
     "iopub.status.busy": "2023-09-08T08:18:11.746102Z",
     "iopub.status.idle": "2023-09-08T08:18:11.758707Z",
     "shell.execute_reply": "2023-09-08T08:18:11.758066Z",
     "shell.execute_reply.started": "2023-09-08T08:18:11.746519Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5.0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "5.0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pf.statistics[\"max\"][\"rad\"][0]\n",
    "pf.statistics[\"max\"][\"rad\"][1]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ad546f83-35cc-461a-b572-2b0831ea6350",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-07T14:16:07.432437Z",
     "iopub.status.busy": "2023-09-07T14:16:07.432113Z",
     "iopub.status.idle": "2023-09-07T14:16:07.438124Z",
     "shell.execute_reply": "2023-09-07T14:16:07.437070Z",
     "shell.execute_reply.started": "2023-09-07T14:16:07.432412Z"
    },
    "tags": []
   },
   "source": [
    "The maximum rad value is:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "ee9b7ace-9b81-481e-92e8-ad2c9a262a91",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T08:18:11.759638Z",
     "iopub.status.busy": "2023-09-08T08:18:11.759440Z",
     "iopub.status.idle": "2023-09-08T08:18:11.763719Z",
     "shell.execute_reply": "2023-09-08T08:18:11.763172Z",
     "shell.execute_reply.started": "2023-09-08T08:18:11.759618Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5.0"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "max(pf.statistics[\"max\"][\"rad\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7d0828d2-27fc-44a1-b865-e542e55116dd",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-07T14:16:07.432437Z",
     "iopub.status.busy": "2023-09-07T14:16:07.432113Z",
     "iopub.status.idle": "2023-09-07T14:16:07.438124Z",
     "shell.execute_reply": "2023-09-07T14:16:07.437070Z",
     "shell.execute_reply.started": "2023-09-07T14:16:07.432412Z"
    },
    "tags": []
   },
   "source": [
    "This can be very useful in our case because it allows us to get the most recent timestamp in a very efficient way (without reading/decompressing the full file from the disk):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "5663631c-b2d0-48dc-9fe7-57e8706bf48a",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T08:18:11.764608Z",
     "iopub.status.busy": "2023-09-08T08:18:11.764410Z",
     "iopub.status.idle": "2023-09-08T08:18:11.769240Z",
     "shell.execute_reply": "2023-09-08T08:18:11.768688Z",
     "shell.execute_reply.started": "2023-09-08T08:18:11.764579Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "numpy.datetime64('2001-01-02T00:09:00.000000000')"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "most_recent_ts = pf.statistics[\"max\"][\"time\"][-1]\n",
    "most_recent_ts"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab4cdfc9-58a4-4dbf-8c41-f3c2ce462db3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-07T14:16:07.432437Z",
     "iopub.status.busy": "2023-09-07T14:16:07.432113Z",
     "iopub.status.idle": "2023-09-07T14:16:07.438124Z",
     "shell.execute_reply": "2023-09-07T14:16:07.437070Z",
     "shell.execute_reply.started": "2023-09-07T14:16:07.432412Z"
    },
    "tags": []
   },
   "source": [
    "## Efficient reading of \"row-groups\"\n",
    "\n",
    "If you know that you only need the data of a specific \"row-group\" then you can avoid reading/decompressing the whole file from disk:\n",
    "\n",
    "For that we need to ditch the `pandas` interface and directly use the `fastparquet` one, though:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "088651c4-cc38-45fc-a994-66699d0ebb6a",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T08:18:11.770025Z",
     "iopub.status.busy": "2023-09-08T08:18:11.769846Z",
     "iopub.status.idle": "2023-09-08T08:18:11.780828Z",
     "shell.execute_reply": "2023-09-08T08:18:11.780252Z",
     "shell.execute_reply.started": "2023-09-08T08:18:11.770007Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(70, 10)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pf = fastparquet.ParquetFile(filename)\n",
    "\n",
    "full_df = pf.to_pandas()\n",
    "last_row_group_df = pf[-1].to_pandas()\n",
    "\n",
    "len(full_df), len(last_row_group_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8c551352-d52a-4369-952c-7bc2b485adc7",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-08T07:17:33.142315Z",
     "iopub.status.busy": "2023-09-08T07:17:33.142063Z",
     "iopub.status.idle": "2023-09-08T07:17:33.150592Z",
     "shell.execute_reply": "2023-09-08T07:17:33.150098Z",
     "shell.execute_reply.started": "2023-09-08T07:17:33.142297Z"
    },
    "tags": []
   },
   "source": [
    "Parquet supports additional ways that can speed up reading of files like filters etc, but I don't think that they are relevant in our case.\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "scraper",
   "language": "python",
   "name": "scraper"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "c8a4fdc4-18b1-445c-979d-ea4c65e9c18e",
	"metadata": {},
	"source": [
	"## Appending to parquet files\n",
	"\n",
	"Requires `engine == \"fastparquet\"`"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "3aed73de-de12-407c-9837-9ac3996bf722",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-08T08:18:10.141307Z",
	"iopub.status.busy": "2023-09-08T08:18:10.141091Z",
	"iopub.status.idle": "2023-09-08T08:18:10.404449Z",
	"shell.execute_reply": "2023-09-08T08:18:10.403983Z",
	"shell.execute_reply.started": "2023-09-08T08:18:10.141284Z"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import fastparquet"
	]
	},
	{
	"cell_type": "markdown",
	"id": "7b44c87d-66be-4c21-b9e7-dadf5cefd1fc",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-07T13:20:28.580984Z",
	"iopub.status.busy": "2023-09-07T13:20:28.580723Z",
	"iopub.status.idle": "2023-09-07T13:20:28.840869Z",
	"shell.execute_reply": "2023-09-07T13:20:28.840281Z",
	"shell.execute_reply.started": "2023-09-07T13:20:28.580965Z"
	},
	"tags": []
	},
	"source": [
	"Create data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "17c1f106-9299-4b55-b316-eac12d07c915",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-08T08:18:10.405753Z",
	"iopub.status.busy": "2023-09-08T08:18:10.405523Z",
	"iopub.status.idle": "2023-09-08T08:18:10.410473Z",
	"shell.execute_reply": "2023-09-08T08:18:10.410019Z",
	"shell.execute_reply.started": "2023-09-08T08:18:10.405736Z"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"filename = \"/tmp/df.parquet\"\n",
	"\n",
	"data = [1, 2, 3, 4, 5, 4, 4, 3, 2, 4]\n",
	"index1 = pd.date_range(\"2001-01-01T00:00:00\", \"2001-01-01T00:09:00\", freq=\"min\")\n",
	"index2 = pd.date_range(\"2001-01-02T00:00:00\", \"2001-01-02T00:09:00\", freq=\"min\")\n",
	"\n",
	"initial_df = pd.DataFrame(\n",
	" data={\"rad\": data},\n",
	" index=index1,\n",
	" dtype=float,\n",
	")\n",
	"initial_df.index.name = \"time\"\n",
	"initial_df.attrs.update(dict(a=1, b=2, c=3))\n",
	"\n",
	"to_be_appended_df = pd.DataFrame(\n",
	" data={\"rad\": data},\n",
	" index=index2,\n",
	" dtype=int,\n",
	")\n",
	"to_be_appended_df.index.name = \"time\""
	]
	},
	{
	"cell_type": "markdown",
	"id": "f4fd8d33-d064-4b90-90b8-e6b4bb3ba5cd",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-07T13:34:33.464552Z",
	"iopub.status.busy": "2023-09-07T13:34:33.464279Z",
	"iopub.status.idle": "2023-09-07T13:34:33.699552Z",
	"shell.execute_reply": "2023-09-07T13:34:33.698907Z",
	"shell.execute_reply.started": "2023-09-07T13:34:33.464530Z"
	},
	"tags": []
	},
	"source": [
	"Write initial file"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "f31da3ab-356e-4b84-bbf8-a633fa0eb2bd",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-08T08:18:10.411171Z",
	"iopub.status.busy": "2023-09-08T08:18:10.410998Z",
	"iopub.status.idle": "2023-09-08T08:18:10.647088Z",
	"shell.execute_reply": "2023-09-08T08:18:10.646477Z",
	"shell.execute_reply.started": "2023-09-08T08:18:10.411155Z"
	},
	"tags": []
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"-rw-r--r-- 1 panos panos 949 Sep 8 11:18 /tmp/df.parquet\n"
	]
	}
	],
	"source": [
	"!rm -rf {filename}\n",
	"initial_df.to_parquet(filename, engine=\"fastparquet\")\n",
	"!ls -lah {filename}"
	]
	},
	{
	"cell_type": "markdown",
	"id": "6d9866e0-d06b-4247-b93e-d33052067315",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-07T13:36:19.214367Z",
	"iopub.status.busy": "2023-09-07T13:36:19.214138Z",
	"iopub.status.idle": "2023-09-07T13:36:19.454709Z",
	"shell.execute_reply": "2023-09-07T13:36:19.453971Z",
	"shell.execute_reply.started": "2023-09-07T13:36:19.214347Z"
	},
	"tags": []
	},
	"source": [
	"Read data from disk"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "77ed6e89-916b-45a4-bf53-ccdb9f9d0dcf",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-08T08:18:10.648075Z",
	"iopub.status.busy": "2023-09-08T08:18:10.647910Z",
	"iopub.status.idle": "2023-09-08T08:18:10.744014Z",
	"shell.execute_reply": "2023-09-08T08:18:10.743527Z",
	"shell.execute_reply.started": "2023-09-08T08:18:10.648059Z"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"read_df = pd.read_parquet(filename)\n",
	"assert initial_df.equals(read_df)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "681207dd-c146-44c1-b252-9ba4650b1980",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-07T13:44:43.906015Z",
	"iopub.status.busy": "2023-09-07T13:44:43.905870Z",
	"iopub.status.idle": "2023-09-07T13:44:44.139911Z",
	"shell.execute_reply": "2023-09-07T13:44:44.138945Z",
	"shell.execute_reply.started": "2023-09-07T13:44:43.905997Z"
	},
	"tags": []
	},
	"source": [
	"### Warning! - Attributes don't seem to be preserved"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "30f725db-e149-4c68-94f0-86c31bc3f8e8",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-08T08:18:10.744632Z",
	"iopub.status.busy": "2023-09-08T08:18:10.744502Z",
	"iopub.status.idle": "2023-09-08T08:18:10.749195Z",
	"shell.execute_reply": "2023-09-08T08:18:10.748816Z",
	"shell.execute_reply.started": "2023-09-08T08:18:10.744610Z"
	},
	"tags": []
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{}"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"read_df.attrs"
	]
	},
	{
	"cell_type": "markdown",
	"id": "b2b16734-320a-4be6-bca9-acfb63646894",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-07T13:38:33.671948Z",
	"iopub.status.busy": "2023-09-07T13:38:33.671833Z",
	"iopub.status.idle": "2023-09-07T13:38:33.908914Z",
	"shell.execute_reply": "2023-09-07T13:38:33.908171Z",
	"shell.execute_reply.started": "2023-09-07T13:38:33.671936Z"
	},
	"tags": []
	},
	"source": [
	"### Append data to disk"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "9fa09491-c95d-4b5e-8f85-a3a2e745b767",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-08T08:18:10.749816Z",
	"iopub.status.busy": "2023-09-08T08:18:10.749690Z",
	"iopub.status.idle": "2023-09-08T08:18:10.991365Z",
	"shell.execute_reply": "2023-09-08T08:18:10.990017Z",
	"shell.execute_reply.started": "2023-09-08T08:18:10.749804Z"
	},
	"tags": []
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"-rw-r--r-- 1 panos panos 949 Sep 8 11:18 /tmp/df.parquet\n",
	"-rw-r--r-- 1 panos panos 1.3K Sep 8 11:18 /tmp/df.parquet\n"
	]
	}
	],
	"source": [
	"!ls -lah {filename}\n",
	"to_be_appended_df.to_parquet(filename, engine=\"fastparquet\", append=True)\n",
	"!ls -lah {filename}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "f988b009-9504-4a8e-803d-5a52a36e5a28",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-08T08:18:10.995429Z",
	"iopub.status.busy": "2023-09-08T08:18:10.995019Z",
	"iopub.status.idle": "2023-09-08T08:18:11.007706Z",
	"shell.execute_reply": "2023-09-08T08:18:11.006968Z",
	"shell.execute_reply.started": "2023-09-08T08:18:10.995395Z"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"read_appended_df = pd.read_parquet(filename)\n",
	"assert pd.concat([initial_df, to_be_appended_df]).equals(read_appended_df)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "8cf4af61-1e10-4f61-b068-807c278b501c",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-07T13:39:16.153860Z",
	"iopub.status.busy": "2023-09-07T13:39:16.153627Z",
	"iopub.status.idle": "2023-09-07T13:39:16.161716Z",
	"shell.execute_reply": "2023-09-07T13:39:16.161183Z",
	"shell.execute_reply.started": "2023-09-07T13:39:16.153840Z"
	},
	"tags": []
	},
	"source": [
	"## Warning\n",
	"\n",
	"It is possible to append multiple times the same data, i.e. you can end up with duplicates in the index.\n",
	"\n",
	"It is up to us to ensure that we have a monotonic index."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "9e68162e-28a5-49e4-b75a-ed89ff3191a7",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-08T08:18:11.009039Z",
	"iopub.status.busy": "2023-09-08T08:18:11.008763Z",
	"iopub.status.idle": "2023-09-08T08:18:11.738002Z",
	"shell.execute_reply": "2023-09-08T08:18:11.737137Z",
	"shell.execute_reply.started": "2023-09-08T08:18:11.009014Z"
	},
	"tags": []
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"-rw-r--r-- 1 panos panos 1.3K Sep 8 11:18 /tmp/df.parquet\n",
	"-rw-r--r-- 1 panos panos 1.6K Sep 8 11:18 /tmp/df.parquet\n",
	"-rw-r--r-- 1 panos panos 1.9K Sep 8 11:18 /tmp/df.parquet\n",
	"-rw-r--r-- 1 panos panos 2.2K Sep 8 11:18 /tmp/df.parquet\n",
	"-rw-r--r-- 1 panos panos 2.5K Sep 8 11:18 /tmp/df.parquet\n",
	"-rw-r--r-- 1 panos panos 2.8K Sep 8 11:18 /tmp/df.parquet\n"
	]
	}
	],
	"source": [
	"for i in range(5):\n",
	" !ls -lah {filename}\n",
	" to_be_appended_df.to_parquet(filename, engine=\"fastparquet\", append=True)\n",
	"!ls -lah {filename}"
	]
	},
	{
	"cell_type": "markdown",
	"id": "a5d39d7d-8454-42b4-88f6-0a49fcdcd6f8",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-07T14:16:07.432437Z",
	"iopub.status.busy": "2023-09-07T14:16:07.432113Z",
	"iopub.status.idle": "2023-09-07T14:16:07.438124Z",
	"shell.execute_reply": "2023-09-07T14:16:07.437070Z",
	"shell.execute_reply.started": "2023-09-07T14:16:07.432412Z"
	},
	"tags": []
	},
	"source": [
	"### How parquet stores data to disk\n",
	"\n",
	"`parquet` stores data column-wise. The data of each column are stored in chunks named \"row-groups\".\n",
	"\n",
	"The default value for each \"row-group\" in `fastparquet` is 50_000_000, but this value can be changed,\n",
	"and, this value can be different for each \"row-group\".\n",
	"This can be useful if for example you want each \"row-group\" to contain the data of a whole year.\n",
	"\n",
	"### Efficient reading of metadata\n",
	"\n",
	"Parquet files contain statistics for each \"row-group\". You can access these statistics like this:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "ac54109e-f3fb-4f38-953a-3f77a39cd138",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-08T08:18:11.739389Z",
	"iopub.status.busy": "2023-09-08T08:18:11.739165Z",
	"iopub.status.idle": "2023-09-08T08:18:11.745377Z",
	"shell.execute_reply": "2023-09-08T08:18:11.744861Z",
	"shell.execute_reply.started": "2023-09-08T08:18:11.739367Z"
	},
	"tags": []
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"dict_keys(['min', 'max', 'null_count', 'distinct_count'])"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pf = fastparquet.ParquetFile(filename)\n",
	"pf.statistics.keys()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "1a742201-6d78-4fe7-8c8b-12146e6527c1",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-07T14:16:07.432437Z",
	"iopub.status.busy": "2023-09-07T14:16:07.432113Z",
	"iopub.status.idle": "2023-09-07T14:16:07.438124Z",
	"shell.execute_reply": "2023-09-07T14:16:07.437070Z",
	"shell.execute_reply.started": "2023-09-07T14:16:07.432412Z"
	},
	"tags": []
	},
	"source": [
	"For example, the maximum value of the first two \"row-groups\" of column `rad` are:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"id": "fd7f0b30-d8f4-4e8c-b50b-f103d699281d",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-08T08:18:11.746539Z",
	"iopub.status.busy": "2023-09-08T08:18:11.746102Z",
	"iopub.status.idle": "2023-09-08T08:18:11.758707Z",
	"shell.execute_reply": "2023-09-08T08:18:11.758066Z",
	"shell.execute_reply.started": "2023-09-08T08:18:11.746519Z"
	},
	"tags": []
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"5.0"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	},
	{
	"data": {
	"text/plain": [
	"5.0"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pf.statistics[\"max\"][\"rad\"][0]\n",
	"pf.statistics[\"max\"][\"rad\"][1]"
	]
	},
	{
	"cell_type": "markdown",
	"id": "ad546f83-35cc-461a-b572-2b0831ea6350",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-07T14:16:07.432437Z",
	"iopub.status.busy": "2023-09-07T14:16:07.432113Z",
	"iopub.status.idle": "2023-09-07T14:16:07.438124Z",
	"shell.execute_reply": "2023-09-07T14:16:07.437070Z",
	"shell.execute_reply.started": "2023-09-07T14:16:07.432412Z"
	},
	"tags": []
	},
	"source": [
	"The maximum rad value is:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"id": "ee9b7ace-9b81-481e-92e8-ad2c9a262a91",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-08T08:18:11.759638Z",
	"iopub.status.busy": "2023-09-08T08:18:11.759440Z",
	"iopub.status.idle": "2023-09-08T08:18:11.763719Z",
	"shell.execute_reply": "2023-09-08T08:18:11.763172Z",
	"shell.execute_reply.started": "2023-09-08T08:18:11.759618Z"
	},
	"tags": []
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"5.0"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"max(pf.statistics[\"max\"][\"rad\"])"
	]
	},
	{
	"cell_type": "markdown",
	"id": "7d0828d2-27fc-44a1-b865-e542e55116dd",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-07T14:16:07.432437Z",
	"iopub.status.busy": "2023-09-07T14:16:07.432113Z",
	"iopub.status.idle": "2023-09-07T14:16:07.438124Z",
	"shell.execute_reply": "2023-09-07T14:16:07.437070Z",
	"shell.execute_reply.started": "2023-09-07T14:16:07.432412Z"
	},
	"tags": []
	},
	"source": [
	"This can be very useful in our case because it allows us to get the most recent timestamp in a very efficient way (without reading/decompressing the full file from the disk):"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"id": "5663631c-b2d0-48dc-9fe7-57e8706bf48a",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-08T08:18:11.764608Z",
	"iopub.status.busy": "2023-09-08T08:18:11.764410Z",
	"iopub.status.idle": "2023-09-08T08:18:11.769240Z",
	"shell.execute_reply": "2023-09-08T08:18:11.768688Z",
	"shell.execute_reply.started": "2023-09-08T08:18:11.764579Z"
	},
	"tags": []
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"numpy.datetime64('2001-01-02T00:09:00.000000000')"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"most_recent_ts = pf.statistics[\"max\"][\"time\"][-1]\n",
	"most_recent_ts"
	]
	},
	{
	"cell_type": "markdown",
	"id": "ab4cdfc9-58a4-4dbf-8c41-f3c2ce462db3",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-07T14:16:07.432437Z",
	"iopub.status.busy": "2023-09-07T14:16:07.432113Z",
	"iopub.status.idle": "2023-09-07T14:16:07.438124Z",
	"shell.execute_reply": "2023-09-07T14:16:07.437070Z",
	"shell.execute_reply.started": "2023-09-07T14:16:07.432412Z"
	},
	"tags": []
	},
	"source": [
	"## Efficient reading of \"row-groups\"\n",
	"\n",
	"If you know that you only need the data of a specific \"row-group\" then you can avoid reading/decompressing the whole file from disk:\n",
	"\n",
	"For that we need to ditch the `pandas` interface and directly use the `fastparquet` one, though:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"id": "088651c4-cc38-45fc-a994-66699d0ebb6a",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-08T08:18:11.770025Z",
	"iopub.status.busy": "2023-09-08T08:18:11.769846Z",
	"iopub.status.idle": "2023-09-08T08:18:11.780828Z",
	"shell.execute_reply": "2023-09-08T08:18:11.780252Z",
	"shell.execute_reply.started": "2023-09-08T08:18:11.770007Z"
	},
	"tags": []
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(70, 10)"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pf = fastparquet.ParquetFile(filename)\n",
	"\n",
	"full_df = pf.to_pandas()\n",
	"last_row_group_df = pf[-1].to_pandas()\n",
	"\n",
	"len(full_df), len(last_row_group_df)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "8c551352-d52a-4369-952c-7bc2b485adc7",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2023-09-08T07:17:33.142315Z",
	"iopub.status.busy": "2023-09-08T07:17:33.142063Z",
	"iopub.status.idle": "2023-09-08T07:17:33.150592Z",
	"shell.execute_reply": "2023-09-08T07:17:33.150098Z",
	"shell.execute_reply.started": "2023-09-08T07:17:33.142297Z"
	},
	"tags": []
	},
	"source": [
	"Parquet supports additional ways that can speed up reading of files like filters etc, but I don't think that they are relevant in our case.\n"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "scraper",
	"language": "python",
	"name": "scraper"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.10.12"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}