Skip to content

Instantly share code, notes, and snippets.

@tswast
Created December 13, 2024 17:19
Show Gist options
  • Save tswast/4e2fb2cca1c1fecf8fb697e94102358f to your computer and use it in GitHub Desktop.
Notebooks demonstrating BigQuery and polars integration without pyarrow.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Copyright 2024 Google LLC\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# https://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook demonstrates writing a polars DataFrame to a BigQuery table without the pyarrow package."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mWARNING: Skipping pyarrow as it is not installed.\u001b[0m\u001b[33m\n",
"\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Remove pyarrow so the rest of this notebook demonstrably runs without it\n",
"# (the warning in the output confirms it was already absent in this run).\n",
"%pip uninstall pyarrow -y"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'1.16.0'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Record the polars version this reproduction was run against.\n",
"import polars\n",
"\n",
"polars.__version__"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape: (3, 2)\n",
"┌─────┬────────────┐\n",
"│ id ┆ linked_ids │\n",
"│ --- ┆ --- │\n",
"│ i64 ┆ list[i64] │\n",
"╞═════╪════════════╡\n",
"│ 1 ┆ [1, 2] │\n",
"│ 2 ┆ [3, 4] │\n",
"│ 3 ┆ [5, 6] │\n",
"└─────┴────────────┘\n"
]
}
],
"source": [
"import polars as pl\n",
"\n",
"# A small frame with a list-typed column (list[i64]); the list column is\n",
"# what exercises BigQuery's parquet LIST handling in the load cells below.\n",
"df = pl.DataFrame(\n",
" {\n",
" \"id\": [1, 2, 3],\n",
" \"linked_ids\": [[1, 2], [3, 4], [5, 6]],\n",
" }\n",
")\n",
"\n",
"print(df)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Serialize with polars' native parquet writer; pyarrow was uninstalled\n",
"# above, so this proves no pyarrow is needed for the write.\n",
"df.write_parquet(\"linked_ids-polars.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[SchemaField('id', 'INTEGER', 'NULLABLE', None, None, (), None),\n",
" SchemaField('linked_ids', 'RECORD', 'NULLABLE', None, None, (SchemaField('list', 'RECORD', 'REPEATED', None, None, (SchemaField('element', 'INTEGER', 'NULLABLE', None, None, (), None),), None),), None)]\n",
"[Row((1, {'list': [{'element': 1}, {'element': 2}]}), {'id': 0, 'linked_ids': 1}),\n",
" Row((2, {'list': [{'element': 3}, {'element': 4}]}), {'id': 0, 'linked_ids': 1}),\n",
" Row((3, {'list': [{'element': 5}, {'element': 6}]}), {'id': 0, 'linked_ids': 1})]\n"
]
}
],
"source": [
"import pprint\n",
"\n",
"from google.cloud import bigquery\n",
"\n",
"# NOTE(review): project/dataset/table are hardcoded to the author's\n",
"# environment; change these to run the notebook yourself.\n",
"destination = \"swast-scratch.my_dataset.google_cloud_bigquery_issue2008_polars\"\n",
"bqclient = bigquery.Client(project=\"swast-scratch\")\n",
"\n",
"# First attempt: plain parquet load with no ParquetOptions. The output\n",
"# below shows the list column lands as a nested RECORD (list.element)\n",
"# rather than a REPEATED INTEGER field.\n",
"job_config = bigquery.LoadJobConfig(\n",
" source_format=bigquery.SourceFormat.PARQUET,\n",
" write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
")\n",
"\n",
"with open(\"linked_ids-polars.parquet\", \"rb\") as stream:\n",
" job = bqclient.load_table_from_file(\n",
" stream,\n",
" destination=destination,\n",
" rewind=True,\n",
" job_config=job_config,\n",
" )\n",
"\n",
" # Wait for the load job to complete.\n",
" job.result()\n",
"\n",
"table = bqclient.get_table(destination)\n",
"pprint.pprint(table.schema)\n",
"\n",
"rows = bqclient.list_rows(destination)\n",
"pprint.pprint(list(rows))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[SchemaField('id', 'INTEGER', 'NULLABLE', None, None, (), None),\n",
" SchemaField('linked_ids', 'INTEGER', 'REPEATED', None, None, (), None)]\n",
"[Row((1, [1, 2]), {'id': 0, 'linked_ids': 1}),\n",
" Row((2, [3, 4]), {'id': 0, 'linked_ids': 1}),\n",
" Row((3, [5, 6]), {'id': 0, 'linked_ids': 1})]\n"
]
}
],
"source": [
"# Same load, but with parquet list inference enabled so the LIST column\n",
"# maps to a REPEATED INTEGER field instead of a nested RECORD (compare\n",
"# the schema in this cell's output with the previous cell's).\n",
"list_opts = bigquery.ParquetOptions()\n",
"list_opts.enable_list_inference = True\n",
"truncate_config = bigquery.LoadJobConfig(\n",
" source_format=bigquery.SourceFormat.PARQUET,\n",
" write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,\n",
" parquet_options=list_opts,\n",
")\n",
"\n",
"with open(\"linked_ids-polars.parquet\", \"rb\") as parquet_file:\n",
" load_job = bqclient.load_table_from_file(\n",
" parquet_file,\n",
" destination=destination,\n",
" rewind=True,\n",
" job_config=truncate_config,\n",
" )\n",
" # Wait for the load job to complete.\n",
" load_job.result()\n",
"\n",
"# Show the resulting schema and the loaded rows.\n",
"pprint.pprint(bqclient.get_table(destination).schema)\n",
"pprint.pprint(list(bqclient.list_rows(destination)))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[SchemaField('id', 'INTEGER', 'NULLABLE', None, None, (), None),\n",
" SchemaField('linked_ids', 'INTEGER', 'REPEATED', None, None, (), None)]\n",
"[Row((1, [1, 2]), {'id': 0, 'linked_ids': 1}),\n",
" Row((2, [3, 4]), {'id': 0, 'linked_ids': 1}),\n",
" Row((3, [5, 6]), {'id': 0, 'linked_ids': 1}),\n",
" Row((1, [1, 2]), {'id': 0, 'linked_ids': 1}),\n",
" Row((2, [3, 4]), {'id': 0, 'linked_ids': 1}),\n",
" Row((3, [5, 6]), {'id': 0, 'linked_ids': 1})]\n"
]
}
],
"source": [
"# Try with append now that we have the correct schema.\n",
"# Same list-inference settings, but WRITE_APPEND: the output shows the\n",
"# three rows loaded a second time alongside the existing three.\n",
"parquet_options = bigquery.ParquetOptions()\n",
"parquet_options.enable_list_inference = True\n",
"job_config = bigquery.LoadJobConfig(\n",
" source_format=bigquery.SourceFormat.PARQUET,\n",
" write_disposition=bigquery.WriteDisposition.WRITE_APPEND,\n",
" parquet_options=parquet_options,\n",
")\n",
"\n",
"with open(\"linked_ids-polars.parquet\", \"rb\") as stream:\n",
" job = bqclient.load_table_from_file(\n",
" stream,\n",
" destination=destination,\n",
" rewind=True,\n",
" job_config=job_config,\n",
" )\n",
"\n",
" # Wait for the load job to complete.\n",
" job.result()\n",
"\n",
"table = bqclient.get_table(destination)\n",
"pprint.pprint(table.schema)\n",
"\n",
"rows = bqclient.list_rows(destination)\n",
"pprint.pprint(list(rows))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'pyarrow'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pyarrow'"
]
}
],
"source": [
"# Expected to fail with ModuleNotFoundError: confirms pyarrow is still\n",
"# absent even though all of the loads above succeeded.\n",
"import pyarrow"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "scratch",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment