Last active
March 30, 2022 16:22
-
-
Save walkerh/a697f0d530a8836c2daa8d28809c7892 to your computer and use it in GitHub Desktop.
Combining Aggregation With Column Update
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "357ab29a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "49cc1fbf", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>A</th>\n", | |
" <th>B</th>\n", | |
" <th>C</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>m</th>\n", | |
" <td>a</td>\n", | |
" <td>1</td>\n", | |
" <td>a</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>n</th>\n", | |
" <td>a</td>\n", | |
" <td>2</td>\n", | |
" <td>b</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>o</th>\n", | |
" <td>a</td>\n", | |
" <td>3</td>\n", | |
" <td>c</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>p</th>\n", | |
" <td>a</td>\n", | |
" <td>4</td>\n", | |
" <td>d</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>q</th>\n", | |
" <td>b</td>\n", | |
" <td>1</td>\n", | |
" <td>e</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>r</th>\n", | |
" <td>b</td>\n", | |
" <td>2</td>\n", | |
" <td>f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>s</th>\n", | |
" <td>b</td>\n", | |
" <td>3</td>\n", | |
" <td>g</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" A B C\n", | |
"m a 1 a\n", | |
"n a 2 b\n", | |
"o a 3 c\n", | |
"p a 4 d\n", | |
"q b 1 e\n", | |
"r b 2 f\n", | |
"s b 3 g" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df0 = pd.DataFrame(\n", | |
" dict(\n", | |
" A=iter(\"aaaabbb\"),\n", | |
" B=[1, 2, 3, 4, 1, 2, 3],\n", | |
" C=iter(\"abcdefg\"),\n", | |
" ),\n", | |
" index=iter(\"mnopqrs\"),\n", | |
")\n", | |
"# Note:\n", | |
"# - the custom index\n", | |
"# - C has a unique value for every row\n", | |
"df0" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "b1212a70", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"g = df0.groupby(\"A\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "49a8f037", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"A\n", | |
"a 4\n", | |
"b 3\n", | |
"Name: B, dtype: int64" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"m = g[\"B\"].max()\n", | |
"m" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "471f717c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"pandas.core.series.Series" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"type(m)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "fd879d6a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>A</th>\n", | |
" <th>B</th>\n", | |
" <th>C</th>\n", | |
" <th>B_max</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>m</th>\n", | |
" <td>a</td>\n", | |
" <td>1</td>\n", | |
" <td>a</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>n</th>\n", | |
" <td>a</td>\n", | |
" <td>2</td>\n", | |
" <td>b</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>o</th>\n", | |
" <td>a</td>\n", | |
" <td>3</td>\n", | |
" <td>c</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>p</th>\n", | |
" <td>a</td>\n", | |
" <td>4</td>\n", | |
" <td>d</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>q</th>\n", | |
" <td>b</td>\n", | |
" <td>1</td>\n", | |
" <td>e</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>r</th>\n", | |
" <td>b</td>\n", | |
" <td>2</td>\n", | |
" <td>f</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>s</th>\n", | |
" <td>b</td>\n", | |
" <td>3</td>\n", | |
" <td>g</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" A B C B_max\n", | |
"m a 1 a 4\n", | |
"n a 2 b 4\n", | |
"o a 3 c 4\n", | |
"p a 4 d 4\n", | |
"q b 1 e 3\n", | |
"r b 2 f 3\n", | |
"s b 3 g 3" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df1 = df0.join(m, on=\"A\", rsuffix=\"_max\")\n", | |
"df1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "eb00c973", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>A</th>\n", | |
" <th>B</th>\n", | |
" <th>C</th>\n", | |
" <th>B_max</th>\n", | |
" <th>must_change</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>m</th>\n", | |
" <td>a</td>\n", | |
" <td>1</td>\n", | |
" <td>a</td>\n", | |
" <td>4</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>n</th>\n", | |
" <td>a</td>\n", | |
" <td>2</td>\n", | |
" <td>b</td>\n", | |
" <td>4</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>o</th>\n", | |
" <td>a</td>\n", | |
" <td>3</td>\n", | |
" <td>c</td>\n", | |
" <td>4</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>p</th>\n", | |
" <td>a</td>\n", | |
" <td>4</td>\n", | |
" <td>d</td>\n", | |
" <td>4</td>\n", | |
" <td>True</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>q</th>\n", | |
" <td>b</td>\n", | |
" <td>1</td>\n", | |
" <td>e</td>\n", | |
" <td>3</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>r</th>\n", | |
" <td>b</td>\n", | |
" <td>2</td>\n", | |
" <td>f</td>\n", | |
" <td>3</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>s</th>\n", | |
" <td>b</td>\n", | |
" <td>3</td>\n", | |
" <td>g</td>\n", | |
" <td>3</td>\n", | |
" <td>True</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" A B C B_max must_change\n", | |
"m a 1 a 4 False\n", | |
"n a 2 b 4 False\n", | |
"o a 3 c 4 False\n", | |
"p a 4 d 4 True\n", | |
"q b 1 e 3 False\n", | |
"r b 2 f 3 False\n", | |
"s b 3 g 3 True" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df1[\"must_change\"] = df1.B == df1.B_max\n", | |
"df1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "e6c78a64", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>C</th>\n", | |
" <th>must_change</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>m</th>\n", | |
" <td>a</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>n</th>\n", | |
" <td>b</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>o</th>\n", | |
" <td>c</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>p</th>\n", | |
" <td>d</td>\n", | |
" <td>True</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>q</th>\n", | |
" <td>e</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>r</th>\n", | |
" <td>f</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>s</th>\n", | |
" <td>g</td>\n", | |
" <td>True</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" C must_change\n", | |
"m a False\n", | |
"n b False\n", | |
"o c False\n", | |
"p d True\n", | |
"q e False\n", | |
"r f False\n", | |
"s g True" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sub = df1[[\"C\", \"must_change\"]]\n", | |
"sub" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "45659bd6", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('m', 'a', False),\n", | |
" ('n', 'b', False),\n", | |
" ('o', 'c', False),\n", | |
" ('p', 'd', True),\n", | |
" ('q', 'e', False),\n", | |
" ('r', 'f', False),\n", | |
" ('s', 'g', True)]" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"[(i, value, must_change) for i, value, must_change in sub.to_records()]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "3b462905", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['a', 'b', 'c', 'y', 'e', 'f', 'y']" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"[\"y\" if must_change else value for i, value, must_change in sub.to_records()]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "dc40cf69", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>A</th>\n", | |
" <th>B</th>\n", | |
" <th>C</th>\n", | |
" <th>B_max</th>\n", | |
" <th>must_change</th>\n", | |
" <th>C_new</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>m</th>\n", | |
" <td>a</td>\n", | |
" <td>1</td>\n", | |
" <td>a</td>\n", | |
" <td>4</td>\n", | |
" <td>False</td>\n", | |
" <td>a</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>n</th>\n", | |
" <td>a</td>\n", | |
" <td>2</td>\n", | |
" <td>b</td>\n", | |
" <td>4</td>\n", | |
" <td>False</td>\n", | |
" <td>b</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>o</th>\n", | |
" <td>a</td>\n", | |
" <td>3</td>\n", | |
" <td>c</td>\n", | |
" <td>4</td>\n", | |
" <td>False</td>\n", | |
" <td>c</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>p</th>\n", | |
" <td>a</td>\n", | |
" <td>4</td>\n", | |
" <td>d</td>\n", | |
" <td>4</td>\n", | |
" <td>True</td>\n", | |
" <td>y</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>q</th>\n", | |
" <td>b</td>\n", | |
" <td>1</td>\n", | |
" <td>e</td>\n", | |
" <td>3</td>\n", | |
" <td>False</td>\n", | |
" <td>e</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>r</th>\n", | |
" <td>b</td>\n", | |
" <td>2</td>\n", | |
" <td>f</td>\n", | |
" <td>3</td>\n", | |
" <td>False</td>\n", | |
" <td>f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>s</th>\n", | |
" <td>b</td>\n", | |
" <td>3</td>\n", | |
" <td>g</td>\n", | |
" <td>3</td>\n", | |
" <td>True</td>\n", | |
" <td>y</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" A B C B_max must_change C_new\n", | |
"m a 1 a 4 False a\n", | |
"n a 2 b 4 False b\n", | |
"o a 3 c 4 False c\n", | |
"p a 4 d 4 True y\n", | |
"q b 1 e 3 False e\n", | |
"r b 2 f 3 False f\n", | |
"s b 3 g 3 True y" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Use a Series to collect the results of a generator expression.\n", | |
"# That way we don't waste memory constructing a list object.\n", | |
"# Using a Series requires us to handle the possibility of a non-trivial index in the original DataFrame.\n", | |
"df1[\"C_new\"] = pd.Series(\n", | |
" (\"y\" if must_change else value for _, value, must_change in sub.to_records()),\n", | |
" index=sub.index,\n", | |
")\n", | |
"df1" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment