Skip to content

Instantly share code, notes, and snippets.

@emcake
Created November 28, 2023 22:12
Show Gist options
  • Save emcake/acc1aa233339a5b3534e2f54702dd46e to your computer and use it in GitHub Desktop.
Save emcake/acc1aa233339a5b3534e2f54702dd46e to your computer and use it in GitHub Desktop.
Merging DeltaTables
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import deltalake as dl\n",
"import tempfile\n",
"import pyarrow as pa\n",
"from typing import List, Dict"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"\n",
"def get_some_data(keys: List[int], partitions:List[int], value: int) -> List[Dict[str, int]]:\n",
"    data: List[Dict[str, int]] = []\n",
"\n",
"    for p in partitions:\n",
"        for k in keys:\n",
"            data.append({'partition' : p, 'key' : k, 'value': value})\n",
"\n",
"    return data\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"initial_keys = [1,2,3,5,6,7,9,10]\n",
"\n",
"initial_partitions = [1,2,3,4]\n",
"\n",
"data = get_some_data(initial_keys, initial_partitions, 1)\n",
"\n",
"table_location = tempfile.mkdtemp()\n",
"\n",
"dl.write_deltalake(table_location, pa.Table.from_pylist(data), partition_by=['partition'])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"tbl = dl.DeltaTable(table_location)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['partition=3/0-ca806a47-b2b9-43f7-ae31-dc40d42a1bdf-0.parquet',\n",
" 'partition=4/0-ca806a47-b2b9-43f7-ae31-dc40d42a1bdf-0.parquet',\n",
" 'partition=2/0-ca806a47-b2b9-43f7-ae31-dc40d42a1bdf-0.parquet',\n",
" 'partition=1/0-ca806a47-b2b9-43f7-ae31-dc40d42a1bdf-0.parquet']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files_before = tbl.files()\n",
"files_before"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>partition</th>\n",
" <th>key</th>\n",
" <th>value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>2</td>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>3</td>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>4</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>4</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>4</td>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" partition key value\n",
"24 1 1 1\n",
"25 1 2 1\n",
"26 1 3 1\n",
"27 1 5 1\n",
"28 1 6 1\n",
"29 1 7 1\n",
"30 1 9 1\n",
"31 1 10 1\n",
"16 2 1 1\n",
"17 2 2 1\n",
"18 2 3 1\n",
"19 2 5 1\n",
"20 2 6 1\n",
"21 2 7 1\n",
"22 2 9 1\n",
"23 2 10 1\n",
"0 3 1 1\n",
"1 3 2 1\n",
"2 3 3 1\n",
"3 3 5 1\n",
"4 3 6 1\n",
"5 3 7 1\n",
"6 3 9 1\n",
"7 3 10 1\n",
"8 4 1 1\n",
"9 4 2 1\n",
"10 4 3 1\n",
"11 4 5 1\n",
"12 4 6 1\n",
"13 4 7 1\n",
"14 4 9 1\n",
"15 4 10 1"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tbl.to_pandas().sort_values(['partition', 'key'])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'num_source_rows': 30,\n",
" 'num_target_rows_inserted': 14,\n",
" 'num_target_rows_updated': 16,\n",
" 'num_target_rows_deleted': 0,\n",
" 'num_target_rows_copied': 16,\n",
" 'num_output_rows': 46,\n",
" 'num_target_files_added': 5,\n",
" 'num_target_files_removed': 4,\n",
" 'execution_time_ms': 19,\n",
" 'scan_time_ms': 0,\n",
" 'rewrite_time_ms': 17}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"more_data = get_some_data(list(range(1,11)), list(range(3,6)), 2)\n",
"\n",
"tbl.merge(pa.Table.from_pylist(more_data), \"source.key = target.key and source.partition = target.partition\", source_alias=\"source\", target_alias=\"target\").when_matched_update_all().when_not_matched_insert_all().execute()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"tbl.update_incremental()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>partition</th>\n",
" <th>key</th>\n",
" <th>value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>2</td>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44</th>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45</th>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td>3</td>\n",
" <td>9</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43</th>\n",
" <td>3</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>4</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>4</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>4</td>\n",
" <td>9</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>4</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>5</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>5</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>5</td>\n",
" <td>9</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>5</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" partition key value\n",
"8 1 1 1\n",
"9 1 2 1\n",
"10 1 3 1\n",
"11 1 5 1\n",
"12 1 6 1\n",
"13 1 7 1\n",
"14 1 9 1\n",
"15 1 10 1\n",
"0 2 1 1\n",
"1 2 2 1\n",
"2 2 3 1\n",
"3 2 5 1\n",
"4 2 6 1\n",
"5 2 7 1\n",
"6 2 9 1\n",
"7 2 10 1\n",
"36 3 1 2\n",
"37 3 2 2\n",
"38 3 3 2\n",
"44 3 4 2\n",
"39 3 5 2\n",
"40 3 6 2\n",
"41 3 7 2\n",
"45 3 8 2\n",
"42 3 9 2\n",
"43 3 10 2\n",
"26 4 1 2\n",
"27 4 2 2\n",
"28 4 3 2\n",
"34 4 4 2\n",
"29 4 5 2\n",
"30 4 6 2\n",
"31 4 7 2\n",
"35 4 8 2\n",
"32 4 9 2\n",
"33 4 10 2\n",
"16 5 1 2\n",
"17 5 2 2\n",
"18 5 3 2\n",
"19 5 4 2\n",
"20 5 5 2\n",
"21 5 6 2\n",
"22 5 7 2\n",
"23 5 8 2\n",
"24 5 9 2\n",
"25 5 10 2"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tbl.to_pandas().sort_values(['partition', 'key'])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>path</th>\n",
" <th>size_bytes</th>\n",
" <th>modification_time</th>\n",
" <th>data_change</th>\n",
" <th>partition_values</th>\n",
" <th>num_records</th>\n",
" <th>null_count</th>\n",
" <th>min</th>\n",
" <th>max</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>partition=2/part-00001-45a744d1-5f10-4fcd-a91b...</td>\n",
" <td>945</td>\n",
" <td>2023-11-28 22:08:15.911</td>\n",
" <td>True</td>\n",
" <td>{'partition': 2}</td>\n",
" <td>8</td>\n",
" <td>{'partition': None, 'key': 0, 'value': 0}</td>\n",
" <td>{'partition': None, 'key': 1, 'value': 1}</td>\n",
" <td>{'partition': None, 'key': 10, 'value': 1}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>partition=1/part-00001-bcbc0521-6330-4c0d-a6ba...</td>\n",
" <td>945</td>\n",
" <td>2023-11-28 22:08:15.911</td>\n",
" <td>True</td>\n",
" <td>{'partition': 1}</td>\n",
" <td>8</td>\n",
" <td>{'partition': None, 'key': 0, 'value': 0}</td>\n",
" <td>{'partition': None, 'key': 1, 'value': 1}</td>\n",
" <td>{'partition': None, 'key': 10, 'value': 1}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>partition=5/part-00001-540383e6-716b-44ca-b7e1...</td>\n",
" <td>961</td>\n",
" <td>2023-11-28 22:08:15.912</td>\n",
" <td>True</td>\n",
" <td>{'partition': 5}</td>\n",
" <td>10</td>\n",
" <td>{'partition': None, 'key': 0, 'value': 0}</td>\n",
" <td>{'partition': None, 'key': 1, 'value': 2}</td>\n",
" <td>{'partition': None, 'key': 10, 'value': 2}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>partition=4/part-00001-0e554628-ebf8-4fea-9563...</td>\n",
" <td>961</td>\n",
" <td>2023-11-28 22:08:15.912</td>\n",
" <td>True</td>\n",
" <td>{'partition': 4}</td>\n",
" <td>10</td>\n",
" <td>{'partition': None, 'key': 0, 'value': 0}</td>\n",
" <td>{'partition': None, 'key': 1, 'value': 2}</td>\n",
" <td>{'partition': None, 'key': 10, 'value': 2}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>partition=3/part-00001-dcc36881-4a3f-46f7-8cee...</td>\n",
" <td>961</td>\n",
" <td>2023-11-28 22:08:15.912</td>\n",
" <td>True</td>\n",
" <td>{'partition': 3}</td>\n",
" <td>10</td>\n",
" <td>{'partition': None, 'key': 0, 'value': 0}</td>\n",
" <td>{'partition': None, 'key': 1, 'value': 2}</td>\n",
" <td>{'partition': None, 'key': 10, 'value': 2}</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" path size_bytes \\\n",
"0 partition=2/part-00001-45a744d1-5f10-4fcd-a91b... 945 \n",
"1 partition=1/part-00001-bcbc0521-6330-4c0d-a6ba... 945 \n",
"2 partition=5/part-00001-540383e6-716b-44ca-b7e1... 961 \n",
"3 partition=4/part-00001-0e554628-ebf8-4fea-9563... 961 \n",
"4 partition=3/part-00001-dcc36881-4a3f-46f7-8cee... 961 \n",
"\n",
" modification_time data_change partition_values num_records \\\n",
"0 2023-11-28 22:08:15.911 True {'partition': 2} 8 \n",
"1 2023-11-28 22:08:15.911 True {'partition': 1} 8 \n",
"2 2023-11-28 22:08:15.912 True {'partition': 5} 10 \n",
"3 2023-11-28 22:08:15.912 True {'partition': 4} 10 \n",
"4 2023-11-28 22:08:15.912 True {'partition': 3} 10 \n",
"\n",
" null_count \\\n",
"0 {'partition': None, 'key': 0, 'value': 0} \n",
"1 {'partition': None, 'key': 0, 'value': 0} \n",
"2 {'partition': None, 'key': 0, 'value': 0} \n",
"3 {'partition': None, 'key': 0, 'value': 0} \n",
"4 {'partition': None, 'key': 0, 'value': 0} \n",
"\n",
" min \\\n",
"0 {'partition': None, 'key': 1, 'value': 1} \n",
"1 {'partition': None, 'key': 1, 'value': 1} \n",
"2 {'partition': None, 'key': 1, 'value': 2} \n",
"3 {'partition': None, 'key': 1, 'value': 2} \n",
"4 {'partition': None, 'key': 1, 'value': 2} \n",
"\n",
" max \n",
"0 {'partition': None, 'key': 10, 'value': 1} \n",
"1 {'partition': None, 'key': 10, 'value': 1} \n",
"2 {'partition': None, 'key': 10, 'value': 2} \n",
"3 {'partition': None, 'key': 10, 'value': 2} \n",
"4 {'partition': None, 'key': 10, 'value': 2} "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pa.Table.from_batches([tbl.get_add_actions()]).to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['partition=2/part-00001-45a744d1-5f10-4fcd-a91b-3e60a2f1ddec-c000.snappy.parquet',\n",
" 'partition=1/part-00001-bcbc0521-6330-4c0d-a6ba-93492ffc99d6-c000.snappy.parquet',\n",
" 'partition=5/part-00001-540383e6-716b-44ca-b7e1-2d0cd0f1cb5b-c000.snappy.parquet',\n",
" 'partition=4/part-00001-0e554628-ebf8-4fea-9563-78f67b189302-c000.snappy.parquet',\n",
" 'partition=3/part-00001-dcc36881-4a3f-46f7-8cee-324ab6b00eda-c000.snappy.parquet']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files_after = tbl.files()\n",
"files_after"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"any([f in files_after for f in files_before])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "systools39",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment