Last active
December 19, 2024 21:55
-
-
Save dataders/ef982a2e6eae5d125705717fcce80111 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"extensionUris": [ | |
{ | |
"extensionUriAnchor": 1, | |
"uri": "https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml" | |
} | |
], | |
"extensions": [ | |
{ | |
"extensionFunction": { | |
"extensionUriReference": 1, | |
"functionAnchor": 1, | |
"name": "multiply:i32_i32" | |
} | |
} | |
], | |
"relations": [ | |
{ | |
"root": { | |
"input": { | |
"project": { | |
"input": { | |
"read": { | |
"baseSchema": { | |
"names": [ | |
"unit_price", | |
"quantity" | |
], | |
"struct": { | |
"types": [ | |
{ | |
"i32": { | |
"nullability": "NULLABILITY_NULLABLE" | |
} | |
}, | |
{ | |
"i32": { | |
"nullability": "NULLABILITY_NULLABLE" | |
} | |
} | |
], | |
"nullability": "NULLABILITY_REQUIRED" | |
} | |
}, | |
"projection": { | |
"select": { | |
"structItems": [ | |
{}, | |
{ | |
"field": 1 | |
} | |
] | |
}, | |
"maintainSingularStruct": true | |
}, | |
"namedTable": { | |
"names": [ | |
"payments" | |
] | |
} | |
} | |
}, | |
"expressions": [ | |
{ | |
"scalarFunction": { | |
"functionReference": 1, | |
"outputType": { | |
"i32": { | |
"nullability": "NULLABILITY_NULLABLE" | |
} | |
}, | |
"arguments": [ | |
{ | |
"value": { | |
"selection": { | |
"directReference": { | |
"structField": {} | |
}, | |
"rootReference": {} | |
} | |
} | |
}, | |
{ | |
"value": { | |
"selection": { | |
"directReference": { | |
"structField": { | |
"field": 1 | |
} | |
}, | |
"rootReference": {} | |
} | |
} | |
} | |
] | |
} | |
} | |
] | |
} | |
}, | |
"names": [ | |
"total_cost" | |
] | |
} | |
} | |
], | |
"version": { | |
"minorNumber": 53, | |
"producer": "DuckDB" | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# reprex: duckdb -> pyarrow\n", | |
"\n", | |
"trying to use a DuckDB-genrated subait plan on a PyArrow table, but it fails with the following error:\n", | |
"\n", | |
"```py\n", | |
"ArrowNotImplementedError: substrait::ReadRel::projection\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import duckdb\n", | |
"import json\n", | |
"import pyarrow as pa\n", | |
"from pyarrow.lib import tobytes\n", | |
"import pyarrow.substrait as substrait" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"con = duckdb.connect()\n", | |
"# con = duckdb.connect('sandbox_substrait.db')\n", | |
"con.install_extension(\"substrait\")\n", | |
"con.load_extension(\"substrait\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## v1 DuckDB query -> Substrait plan -> Arrow table" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"queries = {\n", | |
" \"create\": \"CREATE OR REPLACE TABLE payments (unit_price int, quantity int);\",\n", | |
" \"insert\": \"INSERT INTO payments VALUES (10, 1000), (5,2000), (3456,100);\",\n", | |
" \"select\": \"select * from payments;\",\n", | |
" \"transform\": \"select unit_price * quantity as total_cost from payments;\"\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"pyarrow.Table\n", | |
"unit_price: int32\n", | |
"quantity: int32\n", | |
"----\n", | |
"unit_price: [[10,5,3456]]\n", | |
"quantity: [[1000,2000,100]]" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.execute(f'{queries[\"create\"]} {queries[\"insert\"]}')\n", | |
"ar_payments = con.execute(queries['select']).fetch_arrow_table()\n", | |
"ar_payments" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"pyarrow.Table\n", | |
"explain_key: string\n", | |
"explain_value: string\n", | |
"----\n", | |
"explain_key: [[\"physical_plan\"]]\n", | |
"explain_value: [[\"┌───────────────────────────┐\n", | |
"│ PROJECTION │\n", | |
"│ ──────────────────── │\n", | |
"│ total_cost │\n", | |
"│ │\n", | |
"│ ~3 Rows │\n", | |
"└─────────────┬─────────────┘\n", | |
"┌─────────────┴─────────────┐\n", | |
"│ SEQ_SCAN │\n", | |
"│ ──────────────────── │\n", | |
"│ payments │\n", | |
"│ │\n", | |
"│ Projections: │\n", | |
"│ unit_price │\n", | |
"│ quantity │\n", | |
"│ │\n", | |
"│ ~3 Rows │\n", | |
"└───────────────────────────┘\n", | |
"\"]]" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"con.execute(f\"EXPLAIN {queries['transform']}\").fetch_arrow_table()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'extensionUris': [{'extensionUriAnchor': 1,\n", | |
" 'uri': 'https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml'}],\n", | |
" 'extensions': [{'extensionFunction': {'extensionUriReference': 1,\n", | |
" 'functionAnchor': 1,\n", | |
" 'name': 'multiply:i32_i32'}}],\n", | |
" 'relations': [{'root': {'input': {'project': {'input': {'read': {'baseSchema': {'names': ['unit_price',\n", | |
" 'quantity'],\n", | |
" 'struct': {'types': [{'i32': {'nullability': 'NULLABILITY_NULLABLE'}},\n", | |
" {'i32': {'nullability': 'NULLABILITY_NULLABLE'}}],\n", | |
" 'nullability': 'NULLABILITY_REQUIRED'}},\n", | |
" 'projection': {'select': {'structItems': [{}, {'field': 1}]},\n", | |
" 'maintainSingularStruct': True},\n", | |
" 'namedTable': {'names': ['payments']}}},\n", | |
" 'expressions': [{'scalarFunction': {'functionReference': 1,\n", | |
" 'outputType': {'i32': {'nullability': 'NULLABILITY_NULLABLE'}},\n", | |
" 'arguments': [{'value': {'selection': {'directReference': {'structField': {}},\n", | |
" 'rootReference': {}}}},\n", | |
" {'value': {'selection': {'directReference': {'structField': {'field': 1}},\n", | |
" 'rootReference': {}}}}]}}]}},\n", | |
" 'names': ['total_cost']}}],\n", | |
" 'version': {'minorNumber': 53, 'producer': 'DuckDB'}}" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"plan_str = con.get_substrait_json(query=queries['transform']).fetchone()[0]\n", | |
"plan_json_full = json.loads(plan_str)\n", | |
"plan_json_full" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'project': {'input': {'read': {'baseSchema': {'names': ['unit_price',\n", | |
" 'quantity'],\n", | |
" 'struct': {'types': [{'i32': {'nullability': 'NULLABILITY_NULLABLE'}},\n", | |
" {'i32': {'nullability': 'NULLABILITY_NULLABLE'}}],\n", | |
" 'nullability': 'NULLABILITY_REQUIRED'}},\n", | |
" 'projection': {'select': {'structItems': [{}, {'field': 1}]},\n", | |
" 'maintainSingularStruct': True},\n", | |
" 'namedTable': {'names': ['payments']}}},\n", | |
" 'expressions': [{'scalarFunction': {'functionReference': 1,\n", | |
" 'outputType': {'i32': {'nullability': 'NULLABILITY_NULLABLE'}},\n", | |
" 'arguments': [{'value': {'selection': {'directReference': {'structField': {}},\n", | |
" 'rootReference': {}}}},\n", | |
" {'value': {'selection': {'directReference': {'structField': {'field': 1}},\n", | |
" 'rootReference': {}}}}]}}]}}" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# plan_json_full['relations'][0]['root'].keys()\n", | |
"plan_json_full['relations'][0]['root']['input']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def table_provider(names, schema):\n", | |
" if not names:\n", | |
" raise Exception(\"No names provided\")\n", | |
" elif names[0] == \"payments\":\n", | |
" return ar_payments\n", | |
" else:\n", | |
" raise Exception(\"Unrecognized table name\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "ArrowNotImplementedError", | |
"evalue": "substrait::ReadRel::projection", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mArrowNotImplementedError\u001b[0m Traceback (most recent call last)", | |
"Cell \u001b[0;32mIn[10], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m buf \u001b[38;5;241m=\u001b[39m pa\u001b[38;5;241m.\u001b[39m_substrait\u001b[38;5;241m.\u001b[39m_parse_json_plan(tobytes(plan_str))\n\u001b[0;32m----> 2\u001b[0m reader \u001b[38;5;241m=\u001b[39m \u001b[43mpa\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msubstrait\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbuf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtable_provider\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable_provider\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m reader\u001b[38;5;241m.\u001b[39mread_all()\n", | |
"File \u001b[0;32m~/Developer/query-plan-diff/.venv/lib/python3.11/site-packages/pyarrow/_substrait.pyx:156\u001b[0m, in \u001b[0;36mpyarrow._substrait.run_query\u001b[0;34m()\u001b[0m\n", | |
"File \u001b[0;32m~/Developer/query-plan-diff/.venv/lib/python3.11/site-packages/pyarrow/error.pxi:155\u001b[0m, in \u001b[0;36mpyarrow.lib.pyarrow_internal_check_status\u001b[0;34m()\u001b[0m\n", | |
"File \u001b[0;32m~/Developer/query-plan-diff/.venv/lib/python3.11/site-packages/pyarrow/error.pxi:92\u001b[0m, in \u001b[0;36mpyarrow.lib.check_status\u001b[0;34m()\u001b[0m\n", | |
"\u001b[0;31mArrowNotImplementedError\u001b[0m: substrait::ReadRel::projection" | |
] | |
} | |
], | |
"source": [ | |
"buf = pa._substrait._parse_json_plan(tobytes(plan_str))\n", | |
"reader = pa.substrait.run_query(buf, table_provider=table_provider)\n", | |
"reader.read_all()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## V2 Ibis -> Substrait plan -> DuckDB & Arrow" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import ibis\n", | |
"\n", | |
"from ibis_substrait.compiler.core import SubstraitCompiler" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"unit_price: int32\n", | |
"quantity: int32" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pay_schema = con.table('payments').arrow().schema #\n", | |
"pay_schema" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Define an abstract table (this example uses substrait without it being connected to a database)\n", | |
"abstract_payments = (\n", | |
" ibis.table(pay_schema, name='payments')\n", | |
" .unbind() # Unbinding allows ibis to generate substrait without a catalog\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Define your query using ibis (not ibis-substrait)\n", | |
"\n", | |
"# I put this expression here for readability, but it can be in-lined in the query\n", | |
"total_cost_expr = abstract_payments.unit_price * abstract_payments.quantity" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# This is the actual query, `select` is like \"SELECT\" and we give it an expression\n", | |
"sample_query = (\n", | |
" abstract_payments.select(\n", | |
" (total_cost_expr).name('total_cost')\n", | |
" )\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/markdown": [ | |
"```sql\n", | |
"SELECT\n", | |
" \"t0\".\"unit_price\" * \"t0\".\"quantity\" AS \"total_cost\"\n", | |
"FROM \"payments\" AS \"t0\"\n", | |
"```" | |
], | |
"text/plain": [ | |
"\u001b[34mSELECT\u001b[39;49;00m\u001b[37m\u001b[39;49;00m\n", | |
"\u001b[37m \u001b[39;49;00m\u001b[33m\"t0\"\u001b[39;49;00m.\u001b[33m\"unit_price\"\u001b[39;49;00m\u001b[37m \u001b[39;49;00m*\u001b[37m \u001b[39;49;00m\u001b[33m\"t0\"\u001b[39;49;00m.\u001b[33m\"quantity\"\u001b[39;49;00m\u001b[37m \u001b[39;49;00m\u001b[34mAS\u001b[39;49;00m\u001b[37m \u001b[39;49;00m\u001b[33m\"total_cost\"\u001b[39;49;00m\u001b[37m\u001b[39;49;00m\n", | |
"\u001b[34mFROM\u001b[39;49;00m\u001b[37m \u001b[39;49;00m\u001b[33m\"payments\"\u001b[39;49;00m\u001b[37m \u001b[39;49;00m\u001b[34mAS\u001b[39;49;00m\u001b[37m \u001b[39;49;00m\u001b[33m\"t0\"\u001b[39;49;00m\u001b[37m\u001b[39;49;00m" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# If you want to see the SQL\n", | |
"ibis.to_sql(sample_query)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"extension_uris {\n", | |
" extension_uri_anchor: 1\n", | |
" uri: \"https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml\"\n", | |
"}\n", | |
"extensions {\n", | |
" extension_function {\n", | |
" extension_uri_reference: 1\n", | |
" function_anchor: 1\n", | |
" name: \"multiply:i32_i32\"\n", | |
" }\n", | |
"}\n", | |
"relations {\n", | |
" root {\n", | |
" input {\n", | |
" project {\n", | |
" common {\n", | |
" emit {\n", | |
" output_mapping: 2\n", | |
" }\n", | |
" }\n", | |
" input {\n", | |
" read {\n", | |
" common {\n", | |
" direct {\n", | |
" }\n", | |
" }\n", | |
" base_schema {\n", | |
" names: \"unit_price\"\n", | |
" names: \"quantity\"\n", | |
" struct {\n", | |
" types {\n", | |
" i32 {\n", | |
" nullability: NULLABILITY_NULLABLE\n", | |
" }\n", | |
" }\n", | |
" types {\n", | |
" i32 {\n", | |
" nullability: NULLABILITY_NULLABLE\n", | |
" }\n", | |
" }\n", | |
" nullability: NULLABILITY_REQUIRED\n", | |
" }\n", | |
" }\n", | |
" named_table {\n", | |
" names: \"payments\"\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" expressions {\n", | |
" scalar_function {\n", | |
" function_reference: 1\n", | |
" output_type {\n", | |
" i64 {\n", | |
" nullability: NULLABILITY_NULLABLE\n", | |
" }\n", | |
" }\n", | |
" arguments {\n", | |
" value {\n", | |
" selection {\n", | |
" direct_reference {\n", | |
" struct_field {\n", | |
" }\n", | |
" }\n", | |
" root_reference {\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" arguments {\n", | |
" value {\n", | |
" selection {\n", | |
" direct_reference {\n", | |
" struct_field {\n", | |
" field: 1\n", | |
" }\n", | |
" }\n", | |
" root_reference {\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" names: \"total_cost\"\n", | |
" }\n", | |
"}\n", | |
"version {\n", | |
" minor_number: 56\n", | |
" producer: \"ibis-substrait\"\n", | |
"}" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Ibis-substrait\n", | |
"# This is a \"Plan\" message.\n", | |
"# The protobuf description: https://github.com/substrait-io/substrait/blob/main/proto/substrait/plan.proto#L26\n", | |
"# The API for any Message type: https://googleapis.dev/python/protobuf/latest/google/protobuf/message.html\n", | |
"proto_msg = SubstraitCompiler().compile(sample_query)\n", | |
"proto_msg" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# this gets us a string (see the protobuf Message API link above)\n", | |
"# I believe you can pass this along for execution\n", | |
"serialized_msg = proto_msg.SerializeToString()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>total_cost</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>10000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>10000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>345600</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" total_cost\n", | |
"0 10000\n", | |
"1 10000\n", | |
"2 345600" | |
] | |
}, | |
"execution_count": 41, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# duckdb\n", | |
"con.from_substrait(proto=serialized_msg).df()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>total_cost</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>10000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>10000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>345600</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" total_cost\n", | |
"0 10000\n", | |
"1 10000\n", | |
"2 345600" | |
] | |
}, | |
"execution_count": 38, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# pyarrow\n", | |
"pa.substrait.run_query(serialized_msg, table_provider=table_provider).read_all().to_pandas()\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## V3 Ibis -> Substrait plan -> Datafusion" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from datafusion import SessionContext\n", | |
"from datafusion import substrait as ss" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ctx = SessionContext()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table border='1'>\n", | |
"</table>\n" | |
], | |
"text/plain": [ | |
"DataFrame()\n", | |
"++\n", | |
"++" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ctx.sql('drop table if exists payments')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table border='1'>\n", | |
"</table>\n" | |
], | |
"text/plain": [ | |
"DataFrame()\n", | |
"++\n", | |
"++" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ctx.sql(queries[\"create\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table border='1'>\n", | |
"<tr><th>count</td></tr>\n", | |
"<tr><td>3</td></tr>\n", | |
"</table>\n" | |
], | |
"text/plain": [ | |
"DataFrame()\n", | |
"+-------+\n", | |
"| count |\n", | |
"+-------+\n", | |
"| 3 |\n", | |
"+-------+" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ctx.sql(queries[\"insert\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>unit_price</th>\n", | |
" <th>quantity</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>10</td>\n", | |
" <td>1000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>5</td>\n", | |
" <td>2000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3456</td>\n", | |
" <td>100</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>10</td>\n", | |
" <td>1000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>2000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>3456</td>\n", | |
" <td>100</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" unit_price quantity\n", | |
"0 10 1000\n", | |
"1 5 2000\n", | |
"2 3456 100\n", | |
"3 10 1000\n", | |
"4 5 2000\n", | |
"5 3456 100" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ctx.sql(queries[\"select\"]).to_pandas()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'\\x12\\x12\\x1a\\x10\\x08\\xff\\xff\\xff\\xff\\x0f\\x1a\\x08multiply\\x1as\\x12q\\nc:a\\n\\x05\\x12\\x03\\n\\x01\\x02\\x12>\\n<\\x12$\\n\\nunit_price\\n\\x08quantity\\x12\\x0c\\n\\x04*\\x02\\x10\\x01\\n\\x04*\\x02\\x10\\x01\"\\x08\\n\\x06\\n\\x00\\n\\x02\\x08\\x01:\\n\\n\\x08payments\\x1a\\x18\\x1a\\x16\"\\x08\\x1a\\x06\\x12\\x04\\n\\x02\\x12\\x00\"\\n\\x1a\\x08\\x12\\x06\\n\\x04\\x12\\x02\\x08\\x01\\x12\\ntotal_cost2\\x0e\\x10:*\\ndatafusion'" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"substrait_plan = ss.Serde.serialize_to_plan(queries[\"transform\"], ctx)\n", | |
"substrait_plan.encode()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_logical_plan = ss.Consumer.from_substrait_plan(ctx, substrait_plan)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>total_cost</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>10000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>10000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>345600</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>10000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>10000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>345600</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" total_cost\n", | |
"0 10000\n", | |
"1 10000\n", | |
"2 345600\n", | |
"3 10000\n", | |
"4 10000\n", | |
"5 345600" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ctx.create_dataframe_from_logical_plan(df_logical_plan).to_pandas()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment