jorisvandenbossche · October 13, 2023 10:55
diff --git a/pyogrio-read-fids.ipynb b/pyogrio-read-fids.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# pyogrio - performance of reading with array of FIDs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ERROR 1: PROJ: proj_create_from_database: Open of /home/joris/miniconda3/envs/geo-dev2/share/proj failed\n"
     ]
    }
   ],
   "source": [
    "import geopandas\n",
    "import geopandas.testing\n",
    "\n",
    "import pyogrio\n",
    "\n",
    "# import pyproj\n",
    "# pyproj.datadir.set_data_dir(\"/home/joris/miniconda3/envs/geo-dev2/share/proj/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'0.6.0+27.gecfcb85'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pyogrio.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3, 6, 2)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pyogrio.__gdal_version__"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Creating data\n",
    "\n",
    "Creating a very simple file (point geometries, single integer attribute field), so that we mostly measure the overhead from reading all vs reading with FIDs (limiting the time to parse geometries and fields):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "N = 100_000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "gdf = geopandas.GeoDataFrame({\"col\": range(N), \"geometry\": geopandas.points_from_xy(np.random.randn(N), np.random.randn(N))}, crs=\"EPSG:4326\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "pyogrio.write_dataframe(gdf, \"benchmark-data/test_points.gpkg\", driver=\"GPKG\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "pyogrio.write_dataframe(gdf, \"benchmark-data/test_points.shp\", driver=\"ESRI Shapefile\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "pyogrio.write_dataframe(gdf, \"benchmark-data/test_points.geojson\", driver=\"GeoJSON\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "pyogrio.write_dataframe(gdf, \"benchmark-data/test_points.fgb\", driver=\"FlatGeobuf\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Capabilities of the different file formats:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>shp</th>\n",
       "      <th>gpkg</th>\n",
       "      <th>geojson</th>\n",
       "      <th>fgb</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>random_read</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fast_set_next_by_index</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fast_spatial_filter</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fast_feature_count</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fast_total_bounds</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                          shp   gpkg  geojson    fgb\n",
       "random_read              True   True     True   True\n",
       "fast_set_next_by_index   True  False     True  False\n",
       "fast_spatial_filter     False   True    False   True\n",
       "fast_feature_count       True   True     True   True\n",
       "fast_total_bounds        True   True    False   True"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "capabilities = {}\n",
    "for ext in [\"shp\", \"gpkg\", \"geojson\", \"fgb\"]:\n",
    "    capabilities[ext] = pyogrio.read_info(f\"benchmark-data/test_points.{ext}\")[\"capabilities\"]\n",
    "pd.DataFrame(capabilities)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Benchmark reading performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "N = 100_000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "indices = np.arange(N, dtype=\"int32\")\n",
    "indices_shuffled = np.arange(N, dtype=\"int32\")\n",
    "np.random.shuffle(indices_shuffled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "res1 = pyogrio.read_dataframe(\"benchmark-data/test_points.shp\")\n",
    "res2 = pyogrio.read_dataframe(\"benchmark-data/test_points.shp\", fids=indices)\n",
    "geopandas.testing.assert_geodataframe_equal(res1, res2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Reading Shapefile:**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "181 ms ± 30.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.shp\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "133 ms ± 14.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.shp\", fids=indices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "401 ms ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.shp\", fids=indices_shuffled)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Reading with `where` clause: only with subset of indices, as with all the full array we get *\"Invalid SQL query for layer 'b'test_points'': FID in ...\"*, and it's also very slow anyway:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "where_clause = f\"FID in ({', '.join(map(str, indices[:4000].tolist()))})\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.49 s ± 608 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.shp\", where=where_clause)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Reading GeoPackage:**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "184 ms ± 22.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.gpkg\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# GPKG starts to count at 1\n",
    "indices_gpkg = indices + 1\n",
    "indices_shuffled_gpkg = indices_shuffled + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "720 ms ± 36.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.gpkg\", fids=indices_gpkg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "976 ms ± 105 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.gpkg\", fids=indices_shuffled_gpkg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "where_clause = f\"FID in ({', '.join(map(str, (indices_gpkg).tolist()))})\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "443 ms ± 61.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.gpkg\", where=where_clause)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Reading GeoJSON:**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.1 s ± 112 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.geojson\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.82 s ± 119 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.geojson\", fids=indices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.87 s ± 117 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.geojson\", fids=indices_shuffled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "where_clause = f\"FID in ({', '.join(map(str, indices[:1000].tolist()))})\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.5 s ± 55.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.geojson\", where=where_clause)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Reading FlatGeobuf:**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "137 ms ± 5.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.fgb\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "362 ms ± 17.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.fgb\", fids=indices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "425 ms ± 21.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.fgb\", fids=indices_shuffled)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Only subset, with >5000 values, you get invalid SQL query, and it's very slow anyway:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "where_clause = f\"FID in ({', '.join(map(str, indices[:4000].tolist()))})\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.28 s ± 132 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.fgb\", where=where_clause)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reading batches + filtering on the fly for Shapefile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "indices_subset = np.sort(indices_shuffled[:1000])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "58.9 ms ± 2.73 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "import pyarrow as pa\n",
    "import pyarrow.compute as pc\n",
    "\n",
    "fids_set = pa.array(indices_subset, pa.int64())\n",
    "\n",
    "with pyogrio.raw.open_arrow(\"benchmark-data/test_points.shp\", return_fids=True) as source:\n",
    "    meta, reader = source\n",
    "    \n",
    "    batches = []\n",
    "    count = 0\n",
    "    while True:\n",
    "        try:\n",
    "            batch = reader.read_next_batch()\n",
    "            batch = batch.filter(pc.is_in(batch[\"OGC_FID\"], fids_set))\n",
    "            batches.append(batch)\n",
    "\n",
    "            # ount += len(batch)\n",
    "            # if count >= (skip_features + max_features):\n",
    "            #     break\n",
    "\n",
    "        except StopIteration:\n",
    "            break\n",
    "\n",
    "    # use combine_chunks to release the original memory that included\n",
    "    # too many features\n",
    "    table = (\n",
    "        pa.Table.from_batches(batches, schema=reader.schema)\n",
    "        # .slice(skip_features, max_features)\n",
    "        .combine_chunks()\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "where_clause = f\"FID in ({', '.join(map(str, indices_subset.tolist()))})\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "541 ms ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.raw.read_arrow(\"benchmark-data/test_points.shp\", where=where_clause)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.59 ms ± 456 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.raw.read(\"benchmark-data/test_points.shp\", fids=indices_subset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reading with bbox vs FIDs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "gdf = pyogrio.read_dataframe(\"benchmark-data/test_points.shp\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "from shapely.geometry import box"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "subset = gdf[gdf.intersects(box(0, 0, 1, 1))].reset_index(drop=True)\n",
    "indices_subset = np.asarray(subset[\"col\"], dtype=\"int32\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.11667"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(subset) / len(gdf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = pyogrio.read_dataframe(\"benchmark-data/test_points.shp\", bbox=(0, 0, 1, 1))\n",
    "geopandas.testing.assert_geodataframe_equal(res, subset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Shapefile:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "88.2 ms ± 4.34 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.shp\", bbox=(0, 0, 1, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "43.6 ms ± 1.33 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.shp\", fids=indices_subset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "GeoPackage:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "81.6 ms ± 2.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.gpkg\", bbox=(0, 0, 1, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "301 ms ± 6.43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.gpkg\", fids=indices_subset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "GeoJSON:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.99 s ± 28.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.geojson\", bbox=(0, 0, 1, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.24 s ± 7.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.geojson\", fids=indices_subset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Benchmark with complex geometries"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `tl_2019_us_zcta510` shapefile rewritten as GeoPackage: this dataset has complex polygon geometries and several attribute fields."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "239 ms ± 4.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_us_zcta.gpkg\", max_features=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "282 ms ± 4.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit pyogrio.read_dataframe(\"benchmark-data/test_us_zcta.gpkg\", fids=np.arange(1, 1001, dtype=\"int32\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So in this case the overhead of reading by FID is less significant."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (geo-dev2)",
   "language": "python",
   "name": "geo-dev2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# pyogrio - performance of reading with array of FIDs"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"ERROR 1: PROJ: proj_create_from_database: Open of /home/joris/miniconda3/envs/geo-dev2/share/proj failed\n"
	]
	}
	],
	"source": [
	"import geopandas\n",
	"import geopandas.testing\n",
	"\n",
	"import pyogrio\n",
	"\n",
	"# import pyproj\n",
	"# pyproj.datadir.set_data_dir(\"/home/joris/miniconda3/envs/geo-dev2/share/proj/\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"tags": []
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'0.6.0+27.gecfcb85'"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pyogrio.__version__"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"tags": []
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(3, 6, 2)"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pyogrio.__gdal_version__"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Creating data\n",
	"\n",
	"Creating a very simple file (point geometries, single integer attribute field), so that we mostly measure the overhead from reading all vs reading with FIDs (limiting the time to parse geometries and fields):"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"N = 100_000"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"gdf = geopandas.GeoDataFrame({\"col\": range(N), \"geometry\": geopandas.points_from_xy(np.random.randn(N), np.random.randn(N))}, crs=\"EPSG:4326\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"pyogrio.write_dataframe(gdf, \"benchmark-data/test_points.gpkg\", driver=\"GPKG\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"pyogrio.write_dataframe(gdf, \"benchmark-data/test_points.shp\", driver=\"ESRI Shapefile\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"pyogrio.write_dataframe(gdf, \"benchmark-data/test_points.geojson\", driver=\"GeoJSON\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"tags": []
	},
	"outputs": [],
	"source": [
	"pyogrio.write_dataframe(gdf, \"benchmark-data/test_points.fgb\", driver=\"FlatGeobuf\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Capabilities of the different file formats:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>shp</th>\n",
	" <th>gpkg</th>\n",
	" <th>geojson</th>\n",
	" <th>fgb</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>random_read</th>\n",
	" <td>True</td>\n",
	" <td>True</td>\n",
	" <td>True</td>\n",
	" <td>True</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>fast_set_next_by_index</th>\n",
	" <td>True</td>\n",
	" <td>False</td>\n",
	" <td>True</td>\n",
	" <td>False</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>fast_spatial_filter</th>\n",
	" <td>False</td>\n",
	" <td>True</td>\n",
	" <td>False</td>\n",
	" <td>True</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>fast_feature_count</th>\n",
	" <td>True</td>\n",
	" <td>True</td>\n",
	" <td>True</td>\n",
	" <td>True</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>fast_total_bounds</th>\n",
	" <td>True</td>\n",
	" <td>True</td>\n",
	" <td>False</td>\n",
	" <td>True</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" shp gpkg geojson fgb\n",
	"random_read True True True True\n",
	"fast_set_next_by_index True False True False\n",
	"fast_spatial_filter False True False True\n",
	"fast_feature_count True True True True\n",
	"fast_total_bounds True True False True"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"capabilities = {}\n",
	"for ext in [\"shp\", \"gpkg\", \"geojson\", \"fgb\"]:\n",
	" capabilities[ext] = pyogrio.read_info(f\"benchmark-data/test_points.{ext}\")[\"capabilities\"]\n",
	"pd.DataFrame(capabilities)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Benchmark reading performance"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"tags": []
	},
	"outputs": [],
	"source": [
	"N = 100_000"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"indices = np.arange(N, dtype=\"int32\")\n",
	"indices_shuffled = np.arange(N, dtype=\"int32\")\n",
	"np.random.shuffle(indices_shuffled)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"res1 = pyogrio.read_dataframe(\"benchmark-data/test_points.shp\")\n",
	"res2 = pyogrio.read_dataframe(\"benchmark-data/test_points.shp\", fids=indices)\n",
	"geopandas.testing.assert_geodataframe_equal(res1, res2)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Reading Shapefile:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"181 ms ± 30.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.shp\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"133 ms ± 14.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.shp\", fids=indices)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"401 ms ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.shp\", fids=indices_shuffled)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Reading with `where` clause: only with subset of indices, as with all the full array we get \"Invalid SQL query for layer 'b'test_points'': FID in ...\", and it's also very slow anyway:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {},
	"outputs": [],
	"source": [
	"where_clause = f\"FID in ({', '.join(map(str, indices[:4000].tolist()))})\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"3.49 s ± 608 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.shp\", where=where_clause)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Reading GeoPackage:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"184 ms ± 22.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.gpkg\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [],
	"source": [
	"# FIDs in GPKG start counting at 1 (not 0), so shift the indices by one\n",
	"indices_gpkg = indices + 1\n",
	"indices_shuffled_gpkg = indices_shuffled + 1"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"720 ms ± 36.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.gpkg\", fids=indices_gpkg)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"976 ms ± 105 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.gpkg\", fids=indices_shuffled_gpkg)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"tags": []
	},
	"outputs": [],
	"source": [
	"where_clause = f\"FID in ({', '.join(map(str, (indices_gpkg).tolist()))})\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {
	"tags": []
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"443 ms ± 61.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.gpkg\", where=where_clause)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Reading GeoJSON:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1.1 s ± 112 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.geojson\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1.82 s ± 119 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.geojson\", fids=indices)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1.87 s ± 117 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.geojson\", fids=indices_shuffled)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {
	"tags": []
	},
	"outputs": [],
	"source": [
	"where_clause = f\"FID in ({', '.join(map(str, indices[:1000].tolist()))})\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {
	"tags": []
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1.5 s ± 55.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.geojson\", where=where_clause)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Reading FlatGeobuf:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"137 ms ± 5.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.fgb\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"362 ms ± 17.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.fgb\", fids=indices)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"425 ms ± 21.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.fgb\", fids=indices_shuffled)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Only with a subset of the indices: with >5000 values you get an invalid SQL query error, and it's very slow anyway:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {
	"tags": []
	},
	"outputs": [],
	"source": [
	"where_clause = f\"FID in ({', '.join(map(str, indices[:4000].tolist()))})\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {
	"tags": []
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"3.28 s ± 132 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.fgb\", where=where_clause)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Reading batches + filtering on the fly for Shapefile"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {
	"tags": []
	},
	"outputs": [],
	"source": [
	"indices_subset = np.sort(indices_shuffled[:1000])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"metadata": {
	"tags": []
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"58.9 ms ± 2.73 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"import pyarrow as pa\n",
	"import pyarrow.compute as pc\n",
	"\n",
	"fids_set = pa.array(indices_subset, pa.int64())\n",
	"\n",
	"with pyogrio.raw.open_arrow(\"benchmark-data/test_points.shp\", return_fids=True) as source:\n",
	" meta, reader = source\n",
	" \n",
	" batches = []\n",
	" count = 0\n",
	" while True:\n",
	" try:\n",
	" batch = reader.read_next_batch()\n",
	" batch = batch.filter(pc.is_in(batch[\"OGC_FID\"], fids_set))\n",
	" batches.append(batch)\n",
	"\n",
	" # count += len(batch)\n",
	" # if count >= (skip_features + max_features):\n",
	" # break\n",
	"\n",
	" except StopIteration:\n",
	" break\n",
	"\n",
	" # use combine_chunks to release the original memory that included\n",
	" # too many features\n",
	" table = (\n",
	" pa.Table.from_batches(batches, schema=reader.schema)\n",
	" # .slice(skip_features, max_features)\n",
	" .combine_chunks()\n",
	" )\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {},
	"outputs": [],
	"source": [
	"where_clause = f\"FID in ({', '.join(map(str, indices_subset.tolist()))})\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"541 ms ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.raw.read_arrow(\"benchmark-data/test_points.shp\", where=where_clause)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 34,
	"metadata": {
	"tags": []
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"2.59 ms ± 456 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.raw.read(\"benchmark-data/test_points.shp\", fids=indices_subset)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Reading with bbox vs FIDs"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {},
	"outputs": [],
	"source": [
	"gdf = pyogrio.read_dataframe(\"benchmark-data/test_points.shp\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {},
	"outputs": [],
	"source": [
	"from shapely.geometry import box"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {},
	"outputs": [],
	"source": [
	"subset = gdf[gdf.intersects(box(0, 0, 1, 1))].reset_index(drop=True)\n",
	"indices_subset = np.asarray(subset[\"col\"], dtype=\"int32\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.11667"
	]
	},
	"execution_count": 27,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(subset) / len(gdf)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {},
	"outputs": [],
	"source": [
	"res = pyogrio.read_dataframe(\"benchmark-data/test_points.shp\", bbox=(0, 0, 1, 1))\n",
	"geopandas.testing.assert_geodataframe_equal(res, subset)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Shapefile:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"88.2 ms ± 4.34 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.shp\", bbox=(0, 0, 1, 1))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"43.6 ms ± 1.33 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.shp\", fids=indices_subset)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"GeoPackage:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"81.6 ms ± 2.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.gpkg\", bbox=(0, 0, 1, 1))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"301 ms ± 6.43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.gpkg\", fids=indices_subset)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"GeoJSON:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1.99 s ± 28.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.geojson\", bbox=(0, 0, 1, 1))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 34,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1.24 s ± 7.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_points.geojson\", fids=indices_subset)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Benchmark with complex geometries"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"The `tl_2019_us_zcta510` shapefile rewritten as GeoPackage: this dataset has complex polygon geometries and several attribute fields."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 35,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"239 ms ± 4.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_us_zcta.gpkg\", max_features=1000)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 36,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"282 ms ± 4.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%timeit pyogrio.read_dataframe(\"benchmark-data/test_us_zcta.gpkg\", fids=np.arange(1, 1001, dtype=\"int32\"))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"So in this case, with complex geometries and several attribute fields, the overhead of reading by FID is less significant than for the simple point dataset above."
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python (geo-dev2)",
	"language": "python",
	"name": "geo-dev2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.11.0"
	},
	"widgets": {
	"application/vnd.jupyter.widget-state+json": {
	"state": {},
	"version_major": 2,
	"version_minor": 0
	}
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}