Skip to content

Instantly share code, notes, and snippets.

@tswast
Created September 6, 2024 21:16
Show Gist options
  • Save tswast/99b017b20386e324f5c7d2bd49f21b5f to your computer and use it in GitHub Desktop.
Save tswast/99b017b20386e324f5c7d2bd49f21b5f to your computer and use it in GitHub Desktop.
Notebooks demonstrating BigQuery and Polars integration without pyarrow
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Copyright 2024 Google LLC\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# https://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mWARNING: Skipping pyarrow as it is not installed.\u001b[0m\u001b[33m\n",
"\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Remove pyarrow first so everything below demonstrably runs without it.\n",
"%pip uninstall --yes pyarrow"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import google.cloud.bigquery_storage_v1\n",
"import polars"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Client for the BigQuery Storage Read API. No arguments are passed, so it\n",
"# presumably falls back to the environment's default credentials -- confirm.\n",
"bqread = google.cloud.bigquery_storage_v1.BigQueryReadClient()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Start from empty messages; their fields are filled in by the next cell.\n",
"read_session = google.cloud.bigquery_storage_v1.types.ReadSession()\n",
"read_request = google.cloud.bigquery_storage_v1.types.CreateReadSessionRequest()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# What to read: the source table, returned in Arrow format.\n",
"read_session.data_format = google.cloud.bigquery_storage_v1.types.DataFormat.ARROW\n",
"read_session.table = \"projects/swast-scratch/datasets/my_dataset/tables/my_table\"\n",
"\n",
"# How to read it. The session is attached only after its own fields are set.\n",
"# NOTE(review): message assignment may copy the session, so keep this order.\n",
"read_request.read_session = read_session\n",
"read_request.parent = \"projects/swast-scratch\"  # billing project\n",
"read_request.max_stream_count = 1  # single-threaded for proof-of-concept"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"session = bqread.create_read_session(read_request)\n",
"\n",
"# An empty table can produce a session with no streams, so indexing\n",
"# session.streams[0] unconditionally would raise IndexError. Fall back to an\n",
"# empty iterable so the batch loop below simply writes no record batches.\n",
"if session.streams:\n",
"    reader = bqread.read_rows(session.streams[0].name)\n",
"else:\n",
"    reader = ()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import io\n",
"\n",
"# In-memory stand-in for a file: the serialized Arrow schema and record batches\n",
"# are written here and then handed to polars as an IPC stream.\n",
"fake_stream = io.BytesIO()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# An Arrow IPC stream is the serialized schema followed by the record batches:\n",
"# https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format\n",
"fake_stream.write(session.arrow_schema.serialized_schema)\n",
"fake_stream.writelines(\n",
"    message.arrow_record_batch.serialized_record_batch for message in reader\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rewind to the start of the buffer so the reader begins at the schema message.\n",
"fake_stream.seek(0, io.SEEK_SET)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Parse the buffered bytes as an Arrow IPC stream using polars' own reader.\n",
"df = polars.read_ipc_stream(fake_stream)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (3, 5)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>letter</th><th>ts</th><th>my_string_col</th><th>bool_col</th><th>int64_col</th></tr><tr><td>str</td><td>datetime[μs, UTC]</td><td>str</td><td>bool</td><td>i64</td></tr></thead><tbody><tr><td>&quot;a&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>&quot;c&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>null</td><td>2020-11-19 11:01:17.123 UTC</td><td>null</td><td>null</td><td>null</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (3, 5)\n",
"┌────────┬─────────────────────────────┬───────────────┬──────────┬───────────┐\n",
"│ letter ┆ ts ┆ my_string_col ┆ bool_col ┆ int64_col │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ datetime[μs, UTC] ┆ str ┆ bool ┆ i64 │\n",
"╞════════╪═════════════════════════════╪═══════════════╪══════════╪═══════════╡\n",
"│ a ┆ null ┆ null ┆ null ┆ null │\n",
"│ c ┆ null ┆ null ┆ null ┆ null │\n",
"│ null ┆ 2020-11-19 11:01:17.123 UTC ┆ null ┆ null ┆ null │\n",
"└────────┴─────────────────────────────┴───────────────┴──────────┴───────────┘"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the frame to check that column types and rows came through intact.\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'pyarrow'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[13], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pyarrow'"
]
}
],
"source": [
"import pyarrow\n",
"# The import above is expected to raise ModuleNotFoundError: it shows pyarrow is\n",
"# not installed, so the steps earlier in this notebook did not depend on it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment