ajfriend · August 27, 2024 04:36 · nwh · Aug 28, 2024
diff --git a/monkey_pipes.ipynb b/monkey_pipes.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0f439479-2283-4817-b7a6-b527a5cd4195",
   "metadata": {},
   "source": [
    "# Almost recreating Google \"pipe syntax\" SQL with DuckDB and Python monkey patching\n",
    "\n",
    "- Inspired by [SQL Has Problems. We Can Fix Them: Pipe Syntax In SQL](https://research.google/pubs/sql-has-problems-we-can-fix-them-pipe-syntax-in-sql/)\n",
    "- How close can we get to a similar \"pipe syntax\" using Python monkey patching and DuckDB?\n",
    "- Pretty close! Each \"operator\" is a SQL snippet, where `select ...` and `from ...` can usually be omitted\n",
    "- Note: This is pretty similar to DuckDB's [relational API](https://duckdb.org/docs/guides/python/relational_api_pandas.html), but what I like about this approach is that I can just write SQL without having to learn a new Python API. Also, that SQL can be arbitrarily complex.\n",
    "\n",
    "## Open questions\n",
    "\n",
    "- This was a quick hack. Can we improve the syntax?\n",
    "- Multiple tables/relations? Joins?\n",
    "- Are these expressions being built up lazily?\n",
    "- I'd really like to not have to create a random name for the intermediate table views? Is there a better way?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "aa088c73-0d48-4f93-bc12-91556e461c40",
   "metadata": {},
   "outputs": [],
   "source": [
    "import duckdb\n",
    "import pandas as pd\n",
    "\n",
    "con = duckdb.connect(\n",
    "    database = ':memory:',\n",
    "    config = {'enable_external_access': False},\n",
    ")\n",
    "\n",
    "iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "02cfcdfb-10bd-411f-8dd3-c7e741d25b4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# do some monkey patching to provide an \"or\" operator for `DuckDBPyRelation`s\n",
    "\n",
    "from duckdb import DuckDBPyRelation\n",
    "import random\n",
    "import string\n",
    "\n",
    "def _my_or(rel, sql_string):\n",
    "    # we need a random name for referring to the table. better way?\n",
    "    name = '_tlb_' + ''.join(random.choices(string.ascii_lowercase, k=10))\n",
    "    return rel.query(name, f'from {name} ' + sql_string)\n",
    "\n",
    "DuckDBPyRelation.__or__ = _my_or"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "57a8c8b3-a515-4170-90f1-ef64dac8dddf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐\n",
       "│ sepal_length │ sepal_width │ petal_length │ petal_width │  species  │\n",
       "│    double    │   double    │    double    │   double    │  varchar  │\n",
       "├──────────────┼─────────────┼──────────────┼─────────────┼───────────┤\n",
       "│          5.1 │         3.5 │          1.4 │         0.2 │ setosa    │\n",
       "│          4.9 │         3.0 │          1.4 │         0.2 │ setosa    │\n",
       "│          4.7 │         3.2 │          1.3 │         0.2 │ setosa    │\n",
       "│          4.6 │         3.1 │          1.5 │         0.2 │ setosa    │\n",
       "│          5.0 │         3.6 │          1.4 │         0.2 │ setosa    │\n",
       "│          5.4 │         3.9 │          1.7 │         0.4 │ setosa    │\n",
       "│          4.6 │         3.4 │          1.4 │         0.3 │ setosa    │\n",
       "│          5.0 │         3.4 │          1.5 │         0.2 │ setosa    │\n",
       "│          4.4 │         2.9 │          1.4 │         0.2 │ setosa    │\n",
       "│          4.9 │         3.1 │          1.5 │         0.1 │ setosa    │\n",
       "│           ·  │          ·  │           ·  │          ·  │   ·       │\n",
       "│           ·  │          ·  │           ·  │          ·  │   ·       │\n",
       "│           ·  │          ·  │           ·  │          ·  │   ·       │\n",
       "│          6.7 │         3.1 │          5.6 │         2.4 │ virginica │\n",
       "│          6.9 │         3.1 │          5.1 │         2.3 │ virginica │\n",
       "│          5.8 │         2.7 │          5.1 │         1.9 │ virginica │\n",
       "│          6.8 │         3.2 │          5.9 │         2.3 │ virginica │\n",
       "│          6.7 │         3.3 │          5.7 │         2.5 │ virginica │\n",
       "│          6.7 │         3.0 │          5.2 │         2.3 │ virginica │\n",
       "│          6.3 │         2.5 │          5.0 │         1.9 │ virginica │\n",
       "│          6.5 │         3.0 │          5.2 │         2.0 │ virginica │\n",
       "│          6.2 │         3.4 │          5.4 │         2.3 │ virginica │\n",
       "│          5.9 │         3.0 │          5.1 │         1.8 │ virginica │\n",
       "├──────────────┴─────────────┴──────────────┴─────────────┴───────────┤\n",
       "│ 150 rows (20 shown)                                       5 columns │\n",
       "└─────────────────────────────────────────────────────────────────────┘"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rel = con.from_df(iris)\n",
    "rel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "cb7c7353-c9fa-42ef-bfb2-204196c0c5d8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "┌────────────┬───────────────────┬─────────┐\n",
       "│  species   │      avg(sl)      │ max(sw) │\n",
       "│  varchar   │      double       │ double  │\n",
       "├────────────┼───────────────────┼─────────┤\n",
       "│ setosa     │              4.98 │     3.5 │\n",
       "│ versicolor │ 6.487500000000001 │     3.4 │\n",
       "└────────────┴───────────────────┴─────────┘"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We can chain together SQL expressions in \"pipe syntax\" style.\n",
    "# We can often use short SQL expressions/snippets because\n",
    "# we insert `from {tbl}`, and `select *` is optional in DuckDB.\n",
    "\n",
    "(rel\n",
    "| 'select species, sepal_length as sl, sepal_width as sw'\n",
    "| 'where sl > 4.5'\n",
    "| 'where sw between 3.1 and 3.5'\n",
    "| 'select species, avg(sl), max(sw) group by all'\n",
    "| 'order by 2'\n",
    "| 'limit 2'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4226db36-9811-4ba8-84cd-f4645d7e88b9",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "0f439479-2283-4817-b7a6-b527a5cd4195",
	"metadata": {},
	"source": [
	"# Almost recreating Google \"pipe syntax\" SQL with DuckDB and Python monkey patching\n",
	"\n",
	"- Inspired by [SQL Has Problems. We Can Fix Them: Pipe Syntax In SQL](https://research.google/pubs/sql-has-problems-we-can-fix-them-pipe-syntax-in-sql/)\n",
	"- How close can we get to a similar \"pipe syntax\" using Python monkey patching and DuckDB?\n",
	"- Pretty close! Each \"operator\" is a SQL snippet, where `select ...` and `from ...` can usually be omitted\n",
	"- Note: This is pretty similar to DuckDB's [relational API](https://duckdb.org/docs/guides/python/relational_api_pandas.html), but what I like about this approach is that I can just write SQL without having to learn a new Python API. Also, that SQL can be arbitrarily complex.\n",
	"\n",
	"## Open questions\n",
	"\n",
	"- This was a quick hack. Can we improve the syntax?\n",
	"- Multiple tables/relations? Joins?\n",
	"- Are these expressions being built up lazily?\n",
	"- I'd really like to not have to create a random name for the intermediate table views? Is there a better way?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "aa088c73-0d48-4f93-bc12-91556e461c40",
	"metadata": {},
	"outputs": [],
	"source": [
	"import duckdb\n",
	"import pandas as pd\n",
	"\n",
	"con = duckdb.connect(\n",
	" database = ':memory:',\n",
	" config = {'enable_external_access': False},\n",
	")\n",
	"\n",
	"iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "02cfcdfb-10bd-411f-8dd3-c7e741d25b4e",
	"metadata": {},
	"outputs": [],
	"source": [
	"# do some monkey patching to provide an \"or\" operator for `DuckDBPyRelation`s\n",
	"\n",
	"from duckdb import DuckDBPyRelation\n",
	"import random\n",
	"import string\n",
	"\n",
	"def _my_or(rel, sql_string):\n",
	" # we need a random name for referring to the table. better way?\n",
	" name = '_tlb_' + ''.join(random.choices(string.ascii_lowercase, k=10))\n",
	" return rel.query(name, f'from {name} ' + sql_string)\n",
	"\n",
	"DuckDBPyRelation.__or__ = _my_or"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "57a8c8b3-a515-4170-90f1-ef64dac8dddf",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐\n",
	"│ sepal_length │ sepal_width │ petal_length │ petal_width │ species │\n",
	"│ double │ double │ double │ double │ varchar │\n",
	"├──────────────┼─────────────┼──────────────┼─────────────┼───────────┤\n",
	"│ 5.1 │ 3.5 │ 1.4 │ 0.2 │ setosa │\n",
	"│ 4.9 │ 3.0 │ 1.4 │ 0.2 │ setosa │\n",
	"│ 4.7 │ 3.2 │ 1.3 │ 0.2 │ setosa │\n",
	"│ 4.6 │ 3.1 │ 1.5 │ 0.2 │ setosa │\n",
	"│ 5.0 │ 3.6 │ 1.4 │ 0.2 │ setosa │\n",
	"│ 5.4 │ 3.9 │ 1.7 │ 0.4 │ setosa │\n",
	"│ 4.6 │ 3.4 │ 1.4 │ 0.3 │ setosa │\n",
	"│ 5.0 │ 3.4 │ 1.5 │ 0.2 │ setosa │\n",
	"│ 4.4 │ 2.9 │ 1.4 │ 0.2 │ setosa │\n",
	"│ 4.9 │ 3.1 │ 1.5 │ 0.1 │ setosa │\n",
	"│ · │ · │ · │ · │ · │\n",
	"│ · │ · │ · │ · │ · │\n",
	"│ · │ · │ · │ · │ · │\n",
	"│ 6.7 │ 3.1 │ 5.6 │ 2.4 │ virginica │\n",
	"│ 6.9 │ 3.1 │ 5.1 │ 2.3 │ virginica │\n",
	"│ 5.8 │ 2.7 │ 5.1 │ 1.9 │ virginica │\n",
	"│ 6.8 │ 3.2 │ 5.9 │ 2.3 │ virginica │\n",
	"│ 6.7 │ 3.3 │ 5.7 │ 2.5 │ virginica │\n",
	"│ 6.7 │ 3.0 │ 5.2 │ 2.3 │ virginica │\n",
	"│ 6.3 │ 2.5 │ 5.0 │ 1.9 │ virginica │\n",
	"│ 6.5 │ 3.0 │ 5.2 │ 2.0 │ virginica │\n",
	"│ 6.2 │ 3.4 │ 5.4 │ 2.3 │ virginica │\n",
	"│ 5.9 │ 3.0 │ 5.1 │ 1.8 │ virginica │\n",
	"├──────────────┴─────────────┴──────────────┴─────────────┴───────────┤\n",
	"│ 150 rows (20 shown) 5 columns │\n",
	"└─────────────────────────────────────────────────────────────────────┘"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"rel = con.from_df(iris)\n",
	"rel"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "cb7c7353-c9fa-42ef-bfb2-204196c0c5d8",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"┌────────────┬───────────────────┬─────────┐\n",
	"│ species │ avg(sl) │ max(sw) │\n",
	"│ varchar │ double │ double │\n",
	"├────────────┼───────────────────┼─────────┤\n",
	"│ setosa │ 4.98 │ 3.5 │\n",
	"│ versicolor │ 6.487500000000001 │ 3.4 │\n",
	"└────────────┴───────────────────┴─────────┘"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# We can chain together SQL expressions in \"pipe syntax\" style.\n",
	"# We can often use short SQL expressions/snippets because\n",
	"# we insert `from {tbl}`, and `select *` is optional in DuckDB.\n",
	"\n",
	"(rel\n",
	"\| 'select species, sepal_length as sl, sepal_width as sw'\n",
	"\| 'where sl > 4.5'\n",
	"\| 'where sw between 3.1 and 3.5'\n",
	"\| 'select species, avg(sl), max(sw) group by all'\n",
	"\| 'order by 2'\n",
	"\| 'limit 2'\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "4226db36-9811-4ba8-84cd-f4645d7e88b9",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.11.8"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}