Last active
December 19, 2024 21:55
-
-
Save dataders/ef982a2e6eae5d125705717fcce80111 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "extensionUris": [ | |
| { | |
| "extensionUriAnchor": 1, | |
| "uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml" | |
| } | |
| ], | |
| "extensions": [ | |
| { | |
| "extensionFunction": { | |
| "extensionUriReference": 1, | |
| "functionAnchor": 1, | |
| "name": "multiply:i32_i32" | |
| } | |
| } | |
| ], | |
| "relations": [ | |
| { | |
| "root": { | |
| "input": { | |
| "project": { | |
| "input": { | |
| "read": { | |
| "baseSchema": { | |
| "names": [ | |
| "unit_price", | |
| "quantity" | |
| ], | |
| "struct": { | |
| "types": [ | |
| { | |
| "i32": { | |
| "nullability": "NULLABILITY_NULLABLE" | |
| } | |
| }, | |
| { | |
| "i32": { | |
| "nullability": "NULLABILITY_NULLABLE" | |
| } | |
| } | |
| ], | |
| "nullability": "NULLABILITY_REQUIRED" | |
| } | |
| }, | |
| "projection": { | |
| "select": { | |
| "structItems": [ | |
| {}, | |
| { | |
| "field": 1 | |
| } | |
| ] | |
| }, | |
| "maintainSingularStruct": true | |
| }, | |
| "namedTable": { | |
| "names": [ | |
| "payments" | |
| ] | |
| } | |
| } | |
| }, | |
| "expressions": [ | |
| { | |
| "scalarFunction": { | |
| "functionReference": 1, | |
| "outputType": { | |
| "i32": { | |
| "nullability": "NULLABILITY_NULLABLE" | |
| } | |
| }, | |
| "arguments": [ | |
| { | |
| "value": { | |
| "selection": { | |
| "directReference": { | |
| "structField": {} | |
| }, | |
| "rootReference": {} | |
| } | |
| } | |
| }, | |
| { | |
| "value": { | |
| "selection": { | |
| "directReference": { | |
| "structField": { | |
| "field": 1 | |
| } | |
| }, | |
| "rootReference": {} | |
| } | |
| } | |
| } | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| "names": [ | |
| "total_cost" | |
| ] | |
| } | |
| } | |
| ], | |
| "version": { | |
| "minorNumber": 53, | |
| "producer": "DuckDB" | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# reprex: duckdb -> pyarrow\n", | |
| "\n", | |
| "trying to use a DuckDB-genrated subait plan on a PyArrow table, but it fails with the following error:\n", | |
| "\n", | |
| "```py\n", | |
| "ArrowNotImplementedError: substrait::ReadRel::projection\n", | |
| "```" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import duckdb\n", | |
| "import json\n", | |
| "import pyarrow as pa\n", | |
| "from pyarrow.lib import tobytes\n", | |
| "import pyarrow.substrait as substrait" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "con = duckdb.connect()\n", | |
| "# con = duckdb.connect('sandbox_substrait.db')\n", | |
| "con.install_extension(\"substrait\")\n", | |
| "con.load_extension(\"substrait\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## v1 DuckDB query -> Substrait plan -> Arrow table" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "queries = {\n", | |
| " \"create\": \"CREATE OR REPLACE TABLE payments (unit_price int, quantity int);\",\n", | |
| " \"insert\": \"INSERT INTO payments VALUES (10, 1000), (5,2000), (3456,100);\",\n", | |
| " \"select\": \"select * from payments;\",\n", | |
| " \"transform\": \"select unit_price * quantity as total_cost from payments;\"\n", | |
| "}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "pyarrow.Table\n", | |
| "unit_price: int32\n", | |
| "quantity: int32\n", | |
| "----\n", | |
| "unit_price: [[10,5,3456]]\n", | |
| "quantity: [[1000,2000,100]]" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "con.execute(f'{queries[\"create\"]} {queries[\"insert\"]}')\n", | |
| "ar_payments = con.execute(queries['select']).fetch_arrow_table()\n", | |
| "ar_payments" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "pyarrow.Table\n", | |
| "explain_key: string\n", | |
| "explain_value: string\n", | |
| "----\n", | |
| "explain_key: [[\"physical_plan\"]]\n", | |
| "explain_value: [[\"┌───────────────────────────┐\n", | |
| "│ PROJECTION │\n", | |
| "│ ──────────────────── │\n", | |
| "│ total_cost │\n", | |
| "│ │\n", | |
| "│ ~3 Rows │\n", | |
| "└─────────────┬─────────────┘\n", | |
| "┌─────────────┴─────────────┐\n", | |
| "│ SEQ_SCAN │\n", | |
| "│ ──────────────────── │\n", | |
| "│ payments │\n", | |
| "│ │\n", | |
| "│ Projections: │\n", | |
| "│ unit_price │\n", | |
| "│ quantity │\n", | |
| "│ │\n", | |
| "│ ~3 Rows │\n", | |
| "└───────────────────────────┘\n", | |
| "\"]]" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "con.execute(f\"EXPLAIN {queries['transform']}\").fetch_arrow_table()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'extensionUris': [{'extensionUriAnchor': 1,\n", | |
| " 'uri': 'https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml'}],\n", | |
| " 'extensions': [{'extensionFunction': {'extensionUriReference': 1,\n", | |
| " 'functionAnchor': 1,\n", | |
| " 'name': 'multiply:i32_i32'}}],\n", | |
| " 'relations': [{'root': {'input': {'project': {'input': {'read': {'baseSchema': {'names': ['unit_price',\n", | |
| " 'quantity'],\n", | |
| " 'struct': {'types': [{'i32': {'nullability': 'NULLABILITY_NULLABLE'}},\n", | |
| " {'i32': {'nullability': 'NULLABILITY_NULLABLE'}}],\n", | |
| " 'nullability': 'NULLABILITY_REQUIRED'}},\n", | |
| " 'projection': {'select': {'structItems': [{}, {'field': 1}]},\n", | |
| " 'maintainSingularStruct': True},\n", | |
| " 'namedTable': {'names': ['payments']}}},\n", | |
| " 'expressions': [{'scalarFunction': {'functionReference': 1,\n", | |
| " 'outputType': {'i32': {'nullability': 'NULLABILITY_NULLABLE'}},\n", | |
| " 'arguments': [{'value': {'selection': {'directReference': {'structField': {}},\n", | |
| " 'rootReference': {}}}},\n", | |
| " {'value': {'selection': {'directReference': {'structField': {'field': 1}},\n", | |
| " 'rootReference': {}}}}]}}]}},\n", | |
| " 'names': ['total_cost']}}],\n", | |
| " 'version': {'minorNumber': 53, 'producer': 'DuckDB'}}" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "plan_str = con.get_substrait_json(query=queries['transform']).fetchone()[0]\n", | |
| "plan_json_full = json.loads(plan_str)\n", | |
| "plan_json_full" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'project': {'input': {'read': {'baseSchema': {'names': ['unit_price',\n", | |
| " 'quantity'],\n", | |
| " 'struct': {'types': [{'i32': {'nullability': 'NULLABILITY_NULLABLE'}},\n", | |
| " {'i32': {'nullability': 'NULLABILITY_NULLABLE'}}],\n", | |
| " 'nullability': 'NULLABILITY_REQUIRED'}},\n", | |
| " 'projection': {'select': {'structItems': [{}, {'field': 1}]},\n", | |
| " 'maintainSingularStruct': True},\n", | |
| " 'namedTable': {'names': ['payments']}}},\n", | |
| " 'expressions': [{'scalarFunction': {'functionReference': 1,\n", | |
| " 'outputType': {'i32': {'nullability': 'NULLABILITY_NULLABLE'}},\n", | |
| " 'arguments': [{'value': {'selection': {'directReference': {'structField': {}},\n", | |
| " 'rootReference': {}}}},\n", | |
| " {'value': {'selection': {'directReference': {'structField': {'field': 1}},\n", | |
| " 'rootReference': {}}}}]}}]}}" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# plan_json_full['relations'][0]['root'].keys()\n", | |
| "plan_json_full['relations'][0]['root']['input']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def table_provider(names, schema):\n", | |
| " if not names:\n", | |
| " raise Exception(\"No names provided\")\n", | |
| " elif names[0] == \"payments\":\n", | |
| " return ar_payments\n", | |
| " else:\n", | |
| " raise Exception(\"Unrecognized table name\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "ename": "ArrowNotImplementedError", | |
| "evalue": "substrait::ReadRel::projection", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[0;31mArrowNotImplementedError\u001b[0m Traceback (most recent call last)", | |
| "Cell \u001b[0;32mIn[10], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m buf \u001b[38;5;241m=\u001b[39m pa\u001b[38;5;241m.\u001b[39m_substrait\u001b[38;5;241m.\u001b[39m_parse_json_plan(tobytes(plan_str))\n\u001b[0;32m----> 2\u001b[0m reader \u001b[38;5;241m=\u001b[39m \u001b[43mpa\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msubstrait\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbuf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtable_provider\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable_provider\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m reader\u001b[38;5;241m.\u001b[39mread_all()\n", | |
| "File \u001b[0;32m~/Developer/query-plan-diff/.venv/lib/python3.11/site-packages/pyarrow/_substrait.pyx:156\u001b[0m, in \u001b[0;36mpyarrow._substrait.run_query\u001b[0;34m()\u001b[0m\n", | |
| "File \u001b[0;32m~/Developer/query-plan-diff/.venv/lib/python3.11/site-packages/pyarrow/error.pxi:155\u001b[0m, in \u001b[0;36mpyarrow.lib.pyarrow_internal_check_status\u001b[0;34m()\u001b[0m\n", | |
| "File \u001b[0;32m~/Developer/query-plan-diff/.venv/lib/python3.11/site-packages/pyarrow/error.pxi:92\u001b[0m, in \u001b[0;36mpyarrow.lib.check_status\u001b[0;34m()\u001b[0m\n", | |
| "\u001b[0;31mArrowNotImplementedError\u001b[0m: substrait::ReadRel::projection" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "buf = pa._substrait._parse_json_plan(tobytes(plan_str))\n", | |
| "reader = pa.substrait.run_query(buf, table_provider=table_provider)\n", | |
| "reader.read_all()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## V2 Ibis -> Substrait plan -> DuckDB & Arrow" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import ibis\n", | |
| "\n", | |
| "from ibis_substrait.compiler.core import SubstraitCompiler" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "unit_price: int32\n", | |
| "quantity: int32" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "pay_schema = con.table('payments').arrow().schema #\n", | |
| "pay_schema" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Define an abstract table (this example uses substrait without it being connected to a database)\n", | |
| "abstract_payments = (\n", | |
| " ibis.table(pay_schema, name='payments')\n", | |
| " .unbind() # Unbinding allows ibis to generate substrait without a catalog\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Define your query using ibis (not ibis-substrait)\n", | |
| "\n", | |
| "# I put this expression here for readability, but it can be in-lined in the query\n", | |
| "total_cost_expr = abstract_payments.unit_price * abstract_payments.quantity" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# This is the actual query, `select` is like \"SELECT\" and we give it an expression\n", | |
| "sample_query = (\n", | |
| " abstract_payments.select(\n", | |
| " (total_cost_expr).name('total_cost')\n", | |
| " )\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/markdown": [ | |
| "```sql\n", | |
| "SELECT\n", | |
| " \"t0\".\"unit_price\" * \"t0\".\"quantity\" AS \"total_cost\"\n", | |
| "FROM \"payments\" AS \"t0\"\n", | |
| "```" | |
| ], | |
| "text/plain": [ | |
| "\u001b[34mSELECT\u001b[39;49;00m\u001b[37m\u001b[39;49;00m\n", | |
| "\u001b[37m \u001b[39;49;00m\u001b[33m\"t0\"\u001b[39;49;00m.\u001b[33m\"unit_price\"\u001b[39;49;00m\u001b[37m \u001b[39;49;00m*\u001b[37m \u001b[39;49;00m\u001b[33m\"t0\"\u001b[39;49;00m.\u001b[33m\"quantity\"\u001b[39;49;00m\u001b[37m \u001b[39;49;00m\u001b[34mAS\u001b[39;49;00m\u001b[37m \u001b[39;49;00m\u001b[33m\"total_cost\"\u001b[39;49;00m\u001b[37m\u001b[39;49;00m\n", | |
| "\u001b[34mFROM\u001b[39;49;00m\u001b[37m \u001b[39;49;00m\u001b[33m\"payments\"\u001b[39;49;00m\u001b[37m \u001b[39;49;00m\u001b[34mAS\u001b[39;49;00m\u001b[37m \u001b[39;49;00m\u001b[33m\"t0\"\u001b[39;49;00m\u001b[37m\u001b[39;49;00m" | |
| ] | |
| }, | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# If you want to see the SQL\n", | |
| "ibis.to_sql(sample_query)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "extension_uris {\n", | |
| " extension_uri_anchor: 1\n", | |
| " uri: \"https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml\"\n", | |
| "}\n", | |
| "extensions {\n", | |
| " extension_function {\n", | |
| " extension_uri_reference: 1\n", | |
| " function_anchor: 1\n", | |
| " name: \"multiply:i32_i32\"\n", | |
| " }\n", | |
| "}\n", | |
| "relations {\n", | |
| " root {\n", | |
| " input {\n", | |
| " project {\n", | |
| " common {\n", | |
| " emit {\n", | |
| " output_mapping: 2\n", | |
| " }\n", | |
| " }\n", | |
| " input {\n", | |
| " read {\n", | |
| " common {\n", | |
| " direct {\n", | |
| " }\n", | |
| " }\n", | |
| " base_schema {\n", | |
| " names: \"unit_price\"\n", | |
| " names: \"quantity\"\n", | |
| " struct {\n", | |
| " types {\n", | |
| " i32 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " i32 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " nullability: NULLABILITY_REQUIRED\n", | |
| " }\n", | |
| " }\n", | |
| " named_table {\n", | |
| " names: \"payments\"\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " scalar_function {\n", | |
| " function_reference: 1\n", | |
| " output_type {\n", | |
| " i64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " arguments {\n", | |
| " value {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " arguments {\n", | |
| " value {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 1\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " names: \"total_cost\"\n", | |
| " }\n", | |
| "}\n", | |
| "version {\n", | |
| " minor_number: 56\n", | |
| " producer: \"ibis-substrait\"\n", | |
| "}" | |
| ] | |
| }, | |
| "execution_count": 21, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Ibis-substrait\n", | |
| "# This is a \"Plan\" message.\n", | |
| "# The protobuf description: https://github.com/substrait-io/substrait/blob/main/proto/substrait/plan.proto#L26\n", | |
| "# The API for any Message type: https://googleapis.dev/python/protobuf/latest/google/protobuf/message.html\n", | |
| "proto_msg = SubstraitCompiler().compile(sample_query)\n", | |
| "proto_msg" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# this gets us a string (see the protobuf Message API link above)\n", | |
| "# I believe you can pass this along for execution\n", | |
| "serialized_msg = proto_msg.SerializeToString()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 41, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>total_cost</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>10000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>10000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>345600</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " total_cost\n", | |
| "0 10000\n", | |
| "1 10000\n", | |
| "2 345600" | |
| ] | |
| }, | |
| "execution_count": 41, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# duckdb\n", | |
| "con.from_substrait(proto=serialized_msg).df()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>total_cost</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>10000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>10000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>345600</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " total_cost\n", | |
| "0 10000\n", | |
| "1 10000\n", | |
| "2 345600" | |
| ] | |
| }, | |
| "execution_count": 38, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# pyarrow\n", | |
| "pa.substrait.run_query(serialized_msg, table_provider=table_provider).read_all().to_pandas()\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## V3 Ibis -> Substrait plan -> Datafusion" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from datafusion import SessionContext\n", | |
| "from datafusion import substrait as ss" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "ctx = SessionContext()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table border='1'>\n", | |
| "</table>\n" | |
| ], | |
| "text/plain": [ | |
| "DataFrame()\n", | |
| "++\n", | |
| "++" | |
| ] | |
| }, | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "ctx.sql('drop table if exists payments')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table border='1'>\n", | |
| "</table>\n" | |
| ], | |
| "text/plain": [ | |
| "DataFrame()\n", | |
| "++\n", | |
| "++" | |
| ] | |
| }, | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "ctx.sql(queries[\"create\"])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table border='1'>\n", | |
| "<tr><th>count</td></tr>\n", | |
| "<tr><td>3</td></tr>\n", | |
| "</table>\n" | |
| ], | |
| "text/plain": [ | |
| "DataFrame()\n", | |
| "+-------+\n", | |
| "| count |\n", | |
| "+-------+\n", | |
| "| 3 |\n", | |
| "+-------+" | |
| ] | |
| }, | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "ctx.sql(queries[\"insert\"])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>unit_price</th>\n", | |
| " <th>quantity</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>10</td>\n", | |
| " <td>1000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>5</td>\n", | |
| " <td>2000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>3456</td>\n", | |
| " <td>100</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>10</td>\n", | |
| " <td>1000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>5</td>\n", | |
| " <td>2000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>3456</td>\n", | |
| " <td>100</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " unit_price quantity\n", | |
| "0 10 1000\n", | |
| "1 5 2000\n", | |
| "2 3456 100\n", | |
| "3 10 1000\n", | |
| "4 5 2000\n", | |
| "5 3456 100" | |
| ] | |
| }, | |
| "execution_count": 26, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "ctx.sql(queries[\"select\"]).to_pandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "b'\\x12\\x12\\x1a\\x10\\x08\\xff\\xff\\xff\\xff\\x0f\\x1a\\x08multiply\\x1as\\x12q\\nc:a\\n\\x05\\x12\\x03\\n\\x01\\x02\\x12>\\n<\\x12$\\n\\nunit_price\\n\\x08quantity\\x12\\x0c\\n\\x04*\\x02\\x10\\x01\\n\\x04*\\x02\\x10\\x01\"\\x08\\n\\x06\\n\\x00\\n\\x02\\x08\\x01:\\n\\n\\x08payments\\x1a\\x18\\x1a\\x16\"\\x08\\x1a\\x06\\x12\\x04\\n\\x02\\x12\\x00\"\\n\\x1a\\x08\\x12\\x06\\n\\x04\\x12\\x02\\x08\\x01\\x12\\ntotal_cost2\\x0e\\x10:*\\ndatafusion'" | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "substrait_plan = ss.Serde.serialize_to_plan(queries[\"transform\"], ctx)\n", | |
| "substrait_plan.encode()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_logical_plan = ss.Consumer.from_substrait_plan(ctx, substrait_plan)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>total_cost</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>10000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>10000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>345600</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>10000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>10000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>345600</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " total_cost\n", | |
| "0 10000\n", | |
| "1 10000\n", | |
| "2 345600\n", | |
| "3 10000\n", | |
| "4 10000\n", | |
| "5 345600" | |
| ] | |
| }, | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "ctx.create_dataframe_from_logical_plan(df_logical_plan).to_pandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.11.10" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment