Last active
January 31, 2025 21:46
-
-
Save cicorias/98e51d9f1da6d74202c390a8ade5e60d to your computer and use it in GitHub Desktop.
Delta Tables in Python with delta-io delta-rs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "id": "c75ab3ab-ab78-4849-a020-b2ca67f85a56", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "3.9.2 (default, Feb 28 2021, 17:03:44) \n", | |
| "[GCC 10.2.1 20210110]\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'3.3.2'" | |
| ] | |
| }, | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "import sys\n", | |
| "print (sys.version)\n", | |
| "spark.version" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "2f0e8d9f-0461-43ba-a19b-a6de5632e739", | |
| "metadata": {}, | |
| "source": [ | |
| "# https://delta-io.github.io/delta-rs/\n", | |
| "https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants\n", | |
| "https://github.com/Azure/Azurite?tab=readme-ov-file#supported-command-line-options\n", | |
| "https://docs.delta.io/latest/index.html\n", | |
| "https://docs.rs/deltalake/latest/deltalake/\n", | |
| "https://delta-io.github.io/delta-rs/usage/installation/\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "id": "39f3d160-db5f-4ae5-a116-d52c18ada3e9", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import numpy as np\n", | |
| "from deltalake import write_deltalake, DeltaTable\n", | |
| "\n", | |
| "df = pd.DataFrame({\"foo\": np.random.randint(1, 10, 5), \"bar\": np.random.randint(1, 10, 5)})\n", | |
| "\n", | |
| "\n", | |
| "# define container name\n", | |
| "container = \"data\"\n", | |
| "\n", | |
| "# define credentials\n", | |
| "storage_options = {\n", | |
| " # \"ACCOUNT_NAME\": \"devstoreaccount1\",\n", | |
| " # \"ACCESS_KEY\": \"Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==\",\n", | |
| " \"azure_storage_use_emulator\" : \"true\",\n", | |
| "}\n", | |
| "\n", | |
| "# storage_options = {\n", | |
| "# \"azure_storage_account_name\": \"devstoreaccount1\",\n", | |
| "# \"azure_storage_account_key\": \"Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==\",\n", | |
| "# \"azure_container_name\" : \"data\",\n", | |
| "# # \"azure_storage_use_emulator\" : \"true\",\n", | |
| "# # \"azure_skip_signature\" : \"true\",\n", | |
| "# # \"azure_storage_endpoint\" : \"http://azureite:10000/devstoreaccount1\",\n", | |
| "# }\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "write_deltalake(\n", | |
| " f\"abfs://{container}/delta_table_pandas\",\n", | |
| " df,\n", | |
| " # partition_by=['foo'],\n", | |
| " storage_options=storage_options,\n", | |
| " # schema_mode=\"overwrite\",\n", | |
| " # overwrite_schema=True,\n", | |
| " mode=\"overwrite\",\n", | |
| " overwrite_schema=True,\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "id": "ad12ac57-43a9-4de5-ac6c-4bc257c39a99", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "-----version--------\n", | |
| "6\n", | |
| "----metadata-----\n", | |
| "Metadata(id: 26b16d13-2cb8-4c47-9544-6cd309f03cb1, name: None, description: None, partition_columns: [], created_time: 1738356229969, configuration: {})\n", | |
| "---schema---\n", | |
| "Schema([Field(foo, PrimitiveType(\"long\"), nullable=True), Field(bar, PrimitiveType(\"long\"), nullable=True)])\n", | |
| "-----history-----\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "application/json": [ | |
| { | |
| "clientVersion": "delta-rs.0.10.0", | |
| "operation": "CREATE TABLE", | |
| "operationParameters": { | |
| "location": "abfs://data/delta_table_pandas", | |
| "metadata": "{\"configuration\":{},\"created_time\":1738356229969,\"description\":null,\"format\":{\"options\":{},\"provider\":\"parquet\"},\"id\":\"26b16d13-2cb8-4c47-9544-6cd309f03cb1\",\"name\":null,\"partition_columns\":[],\"schema\":{\"fields\":[{\"metadata\":{},\"name\":\"foo\",\"nullable\":true,\"type\":\"long\"}],\"type\":\"struct\"}}", | |
| "mode": "ErrorIfExists", | |
| "protocol": "{\"minReaderVersion\":1,\"minWriterVersion\":1}" | |
| }, | |
| "timestamp": 1738356229980 | |
| }, | |
| { | |
| "clientVersion": "delta-rs.0.10.0", | |
| "operation": "WRITE", | |
| "operationParameters": { | |
| "mode": "Overwrite", | |
| "partitionBy": "[]" | |
| }, | |
| "timestamp": 1738356363664 | |
| }, | |
| { | |
| "clientVersion": "delta-rs.0.10.0", | |
| "operation": "WRITE", | |
| "operationParameters": { | |
| "mode": "Overwrite", | |
| "partitionBy": "[]" | |
| }, | |
| "timestamp": 1738356686275 | |
| }, | |
| { | |
| "clientVersion": "delta-rs.0.10.0", | |
| "operation": "WRITE", | |
| "operationParameters": { | |
| "mode": "Overwrite", | |
| "partitionBy": "[]" | |
| }, | |
| "timestamp": 1738357075080 | |
| }, | |
| { | |
| "clientVersion": "delta-rs.0.10.0", | |
| "operation": "WRITE", | |
| "operationParameters": { | |
| "mode": "Overwrite", | |
| "partitionBy": "[]" | |
| }, | |
| "timestamp": 1738357220986 | |
| }, | |
| { | |
| "clientVersion": "delta-rs.0.10.0", | |
| "operation": "WRITE", | |
| "operationParameters": { | |
| "mode": "Overwrite", | |
| "partitionBy": "[]" | |
| }, | |
| "timestamp": 1738359571117 | |
| }, | |
| { | |
| "clientVersion": "delta-rs.0.10.0", | |
| "operation": "WRITE", | |
| "operationParameters": { | |
| "mode": "Overwrite", | |
| "partitionBy": "[]" | |
| }, | |
| "timestamp": 1738359696984 | |
| } | |
| ], | |
| "text/plain": [ | |
| "<IPython.core.display.JSON object>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/json": { | |
| "expanded": false, | |
| "root": "root" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "[\n", | |
| " {\n", | |
| " \"timestamp\": 1738356229980,\n", | |
| " \"operation\": \"CREATE TABLE\",\n", | |
| " \"operationParameters\": {\n", | |
| " \"mode\": \"ErrorIfExists\",\n", | |
| " \"protocol\": \"{\\\"minReaderVersion\\\":1,\\\"minWriterVersion\\\":1}\",\n", | |
| " \"metadata\": \"{\\\"configuration\\\":{},\\\"created_time\\\":1738356229969,\\\"description\\\":null,\\\"format\\\":{\\\"options\\\":{},\\\"provider\\\":\\\"parquet\\\"},\\\"id\\\":\\\"26b16d13-2cb8-4c47-9544-6cd309f03cb1\\\",\\\"name\\\":null,\\\"partition_columns\\\":[],\\\"schema\\\":{\\\"fields\\\":[{\\\"metadata\\\":{},\\\"name\\\":\\\"foo\\\",\\\"nullable\\\":true,\\\"type\\\":\\\"long\\\"}],\\\"type\\\":\\\"struct\\\"}}\",\n", | |
| " \"location\": \"abfs://data/delta_table_pandas\"\n", | |
| " },\n", | |
| " \"clientVersion\": \"delta-rs.0.10.0\"\n", | |
| " },\n", | |
| " {\n", | |
| " \"timestamp\": 1738356363664,\n", | |
| " \"operation\": \"WRITE\",\n", | |
| " \"operationParameters\": {\n", | |
| " \"partitionBy\": \"[]\",\n", | |
| " \"mode\": \"Overwrite\"\n", | |
| " },\n", | |
| " \"clientVersion\": \"delta-rs.0.10.0\"\n", | |
| " },\n", | |
| " {\n", | |
| " \"timestamp\": 1738356686275,\n", | |
| " \"operation\": \"WRITE\",\n", | |
| " \"operationParameters\": {\n", | |
| " \"mode\": \"Overwrite\",\n", | |
| " \"partitionBy\": \"[]\"\n", | |
| " },\n", | |
| " \"clientVersion\": \"delta-rs.0.10.0\"\n", | |
| " },\n", | |
| " {\n", | |
| " \"timestamp\": 1738357075080,\n", | |
| " \"operation\": \"WRITE\",\n", | |
| " \"operationParameters\": {\n", | |
| " \"mode\": \"Overwrite\",\n", | |
| " \"partitionBy\": \"[]\"\n", | |
| " },\n", | |
| " \"clientVersion\": \"delta-rs.0.10.0\"\n", | |
| " },\n", | |
| " {\n", | |
| " \"timestamp\": 1738357220986,\n", | |
| " \"operation\": \"WRITE\",\n", | |
| " \"operationParameters\": {\n", | |
| " \"partitionBy\": \"[]\",\n", | |
| " \"mode\": \"Overwrite\"\n", | |
| " },\n", | |
| " \"clientVersion\": \"delta-rs.0.10.0\"\n", | |
| " },\n", | |
| " {\n", | |
| " \"timestamp\": 1738359571117,\n", | |
| " \"operation\": \"WRITE\",\n", | |
| " \"operationParameters\": {\n", | |
| " \"partitionBy\": \"[]\",\n", | |
| " \"mode\": \"Overwrite\"\n", | |
| " },\n", | |
| " \"clientVersion\": \"delta-rs.0.10.0\"\n", | |
| " },\n", | |
| " {\n", | |
| " \"timestamp\": 1738359696984,\n", | |
| " \"operation\": \"WRITE\",\n", | |
| " \"operationParameters\": {\n", | |
| " \"partitionBy\": \"[]\",\n", | |
| " \"mode\": \"Overwrite\"\n", | |
| " },\n", | |
| " \"clientVersion\": \"delta-rs.0.10.0\"\n", | |
| " }\n", | |
| "]\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>foo</th>\n", | |
| " <th>bar</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>4</td>\n", | |
| " <td>8</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>6</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>6</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>5</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>9</td>\n", | |
| " <td>6</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " foo bar\n", | |
| "0 4 8\n", | |
| "1 6 2\n", | |
| "2 6 2\n", | |
| "3 1 5\n", | |
| "4 9 6" | |
| ] | |
| }, | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "import json\n", | |
| "from IPython.display import display, JSON\n", | |
| "from deltalake import DeltaTable\n", | |
| "\n", | |
| "\n", | |
| "dt = DeltaTable(f\"abfs://{container}/delta_table_pandas\",\n", | |
| " storage_options=storage_options)\n", | |
| "\n", | |
| "pd = dt.to_pandas()\n", | |
| "\n", | |
| "schema_json = dt.schema()\n", | |
| "\n", | |
| "print(\"-----version--------\")\n", | |
| "print(dt.version())\n", | |
| "print(\"----metadata-----\")\n", | |
| "print(dt.metadata())\n", | |
| "print(\"---schema---\")\n", | |
| "# print(json.dumps(dt.schema(), indent=4))\n", | |
| "# display(pd(schema_json[\"fields\"]))\n", | |
| "print(schema_json)\n", | |
| "print(\"-----history-----\")\n", | |
| "history_list = dt.history() # Get the history list\n", | |
| "display(JSON(history_list)) \n", | |
| "pretty_json = json.dumps(history_list, indent=4) # Convert to pretty JSON\n", | |
| "print(pretty_json) # Print nicely formatted JSON\n", | |
| "# print(json.dumps(dt.history()))\n", | |
| "\n", | |
| "\n", | |
| "pd" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "7695ba21-92fa-4bff-a7e8-54e31a951e91", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "dt.delete (predicate = None) #(storage_options=storage_options)\n", | |
| "dt.vacuum()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "370ba660-9204-46cb-b4ff-9b62a7f2ed91", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df = dt.to_pandas()\n", | |
| "df_filtered = df[df[\"foo\"] != 3]\n", | |
| "from deltalake.writer import write_deltalake\n", | |
| "write_deltalake(\n", | |
| " f\"abfs://{container}/delta_table_pandas\",\n", | |
| " df_filtered,\n", | |
| " mode=\"overwrite\",\n", | |
| " storage_options=storage_options)\n", | |
| "dt.vacuum()\n", | |
| "\n", | |
| "\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "426a3b49-d2f7-473c-9676-5a1d8f92da4d", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.9.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment