Skip to content

Instantly share code, notes, and snippets.

@tswast
Created December 13, 2024 17:19
Show Gist options
  • Save tswast/4e2fb2cca1c1fecf8fb697e94102358f to your computer and use it in GitHub Desktop.
Notebooks demonstrating BigQuery and polars integration without pyarrow.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Copyright 2024 Google LLC\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# https://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook demonstrates writing a polars DataFrame to a BigQuery table without the pyarrow package."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mWARNING: Skipping pyarrow as it is not installed.\u001b[0m\u001b[33m\n",
"\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Remove pyarrow so the rest of this notebook demonstrably runs without it\n",
"# (the warning in the output confirms it was already absent in this run).\n",
"%pip uninstall pyarrow -y"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'1.16.0'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Record the polars version this reproduction was run against.\n",
"import polars\n",
"\n",
"polars.__version__"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape: (3, 2)\n",
"┌─────┬────────────┐\n",
"│ id ┆ linked_ids │\n",
"│ --- ┆ --- │\n",
"│ i64 ┆ list[i64] │\n",
"╞═════╪════════════╡\n",
"│ 1 ┆ [1, 2] │\n",
"│ 2 ┆ [3, 4] │\n",
"│ 3 ┆ [5, 6] │\n",
"└─────┴────────────┘\n"
]
}
],
"source": [
"import polars as pl\n",
"\n",
"# A small frame with a list-typed column (list[i64]); the list column is\n",
"# what exercises BigQuery's parquet LIST handling in the load cells below.\n",
"df = pl.DataFrame(\n",
" {\n",
" \"id\": [1, 2, 3],\n",
" \"linked_ids\": [[1, 2], [3, 4], [5, 6]],\n",
" }\n",
")\n",
"\n",
"print(df)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Serialize with polars' native parquet writer; pyarrow was uninstalled\n",
"# above, so this proves no pyarrow is needed for the write.\n",
"df.write_parquet(\"linked_ids-polars.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[SchemaField('id', 'INTEGER', 'NULLABLE', None, None, (), None),\n",
" SchemaField('linked_ids', 'RECORD', 'NULLABLE', None, None, (SchemaField('list', 'RECORD', 'REPEATED', None, None, (SchemaField('element', 'INTEGER', 'NULLABLE', None, None, (), None),), None),), None)]\n",
"[Row((1, {'list': [{'element': 1}, {'element': 2}]}), {'id': 0, 'linked_ids': 1}),\n",
" Row((2, {'list': [{'element': 3}, {'element': 4}]}), {'id': 0, 'linked_ids': 1}),\n",
" Row((3, {'list': [{'element': 5}, {'element': 6}]}), {'id': 0, 'linked_ids': 1})]\n"
]
}
],
"source": [
"import pprint\n",
"\n",
"from google.cloud import bigquery\n",
"\n",
"# NOTE(review): project/dataset/table are hardcoded to the author's\n",
"# environment; change these to run the notebook yourself.\n",
"destination = \"swast-scratch.my_dataset.google_cloud_bigquery_issue2008_polars\"\n",
"bqclient = bigquery.Client(project=\"swast-scratch\")\n",
"\n",
"# First attempt: plain parquet load with no ParquetOptions. The output\n",
"# below shows the list column lands as a nested RECORD (list.element)\n",
"# rather than a REPEATED INTEGER field.\n",
"job_config = bigquery.LoadJobConfig(\n",
" source_format=bigquery.SourceFormat.PARQUET,\n",
" write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
")\n",
"\n",
"with open(\"linked_ids-polars.parquet\", \"rb\") as stream:\n",
" job = bqclient.load_table_from_file(\n",
" stream,\n",
" destination=destination,\n",
" rewind=True,\n",
" job_config=job_config,\n",
" )\n",
"\n",
" # Wait for the load job to complete.\n",
" job.result()\n",
"\n",
"table = bqclient.get_table(destination)\n",
"pprint.pprint(table.schema)\n",
"\n",
"rows = bqclient.list_rows(destination)\n",
"pprint.pprint(list(rows))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[SchemaField('id', 'INTEGER', 'NULLABLE', None, None, (), None),\n",
" SchemaField('linked_ids', 'INTEGER', 'REPEATED', None, None, (), None)]\n",
"[Row((1, [1, 2]), {'id': 0, 'linked_ids': 1}),\n",
" Row((2, [3, 4]), {'id': 0, 'linked_ids': 1}),\n",
" Row((3, [5, 6]), {'id': 0, 'linked_ids': 1})]\n"
]
}
],
"source": [
"# Same load, but with parquet list inference enabled so the LIST column\n",
"# maps to a REPEATED INTEGER field instead of a nested RECORD (compare\n",
"# the schema in this cell's output with the previous cell's).\n",
"list_opts = bigquery.ParquetOptions()\n",
"list_opts.enable_list_inference = True\n",
"truncate_config = bigquery.LoadJobConfig(\n",
" source_format=bigquery.SourceFormat.PARQUET,\n",
" write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
" parquet_options=list_opts,\n",
")\n",
"\n",
"with open(\"linked_ids-polars.parquet\", \"rb\") as parquet_file:\n",
" load_job = bqclient.load_table_from_file(\n",
" parquet_file,\n",
" destination=destination,\n",
" rewind=True,\n",
" job_config=truncate_config,\n",
" )\n",
" # Wait for the load job to complete.\n",
" load_job.result()\n",
"\n",
"# Show the resulting schema and the loaded rows.\n",
"pprint.pprint(bqclient.get_table(destination).schema)\n",
"pprint.pprint(list(bqclient.list_rows(destination)))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[SchemaField('id', 'INTEGER', 'NULLABLE', None, None, (), None),\n",
" SchemaField('linked_ids', 'INTEGER', 'REPEATED', None, None, (), None)]\n",
"[Row((1, [1, 2]), {'id': 0, 'linked_ids': 1}),\n",
" Row((2, [3, 4]), {'id': 0, 'linked_ids': 1}),\n",
" Row((3, [5, 6]), {'id': 0, 'linked_ids': 1}),\n",
" Row((1, [1, 2]), {'id': 0, 'linked_ids': 1}),\n",
" Row((2, [3, 4]), {'id': 0, 'linked_ids': 1}),\n",
" Row((3, [5, 6]), {'id': 0, 'linked_ids': 1})]\n"
]
}
],
"source": [
"# Try with append now that we have the correct schema.\n",
"# Same list-inference settings, but WRITE_APPEND: the output shows the\n",
"# three rows loaded a second time alongside the existing three.\n",
"parquet_options = bigquery.ParquetOptions()\n",
"parquet_options.enable_list_inference = True\n",
"job_config = bigquery.LoadJobConfig(\n",
" source_format=bigquery.SourceFormat.PARQUET,\n",
" write_disposition=bigquery.WriteDisposition.WRITE_APPEND,\n",
" parquet_options=parquet_options,\n",
")\n",
"\n",
"with open(\"linked_ids-polars.parquet\", \"rb\") as stream:\n",
" job = bqclient.load_table_from_file(\n",
" stream,\n",
" destination=destination,\n",
" rewind=True,\n",
" job_config=job_config,\n",
" )\n",
"\n",
" # Wait for the load job to complete.\n",
" job.result()\n",
"\n",
"table = bqclient.get_table(destination)\n",
"pprint.pprint(table.schema)\n",
"\n",
"rows = bqclient.list_rows(destination)\n",
"pprint.pprint(list(rows))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'pyarrow'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pyarrow'"
]
}
],
"source": [
"# Expected to fail with ModuleNotFoundError: confirms pyarrow is still\n",
"# absent even though all of the loads above succeeded.\n",
"import pyarrow"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "scratch",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment