Last active
September 14, 2024 13:24
-
-
Save rabbl/e03cbd310706952bb61b7ee9221b0261 to your computer and use it in GitHub Desktop.
Pandas CrossTab function
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": [ | |
"# Understanding the Pandas Crosstab function\n", | |
"\n", | |
"In pandas, a crosstab is a type of table that displays the frequency distribution of two or more categorical variables. \n", | |
"It is essentially a way to summarize and compare the relationship between these variables.\n", | |
"\n", | |
"The pandas.crosstab() function creates this table, allowing you to cross-tabulate data and view the counts (or other aggregate functions) of different combinations of the variables. \n", | |
"\n", | |
"This is particularly useful for exploratory data analysis and understanding the relationships between categorical data.\n" | |
], | |
"id": "2a3b7b369b0cf684" | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-09-14T13:16:41.184033Z", | |
"start_time": "2024-09-14T13:16:40.919980Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"import pandas as pd\n", | |
"\n", | |
"# Sample data\n", | |
"data = {'A': ['foo', 'foo', 'foo', 'bar', 'bar', 'foo'],\n", | |
" 'B': ['one', 'one', 'two', 'two', 'one', 'one']}\n", | |
"\n", | |
"df = pd.DataFrame(data)\n", | |
"df" | |
], | |
"id": "4e2802bb2311bb3e", | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
" A B\n", | |
"0 foo one\n", | |
"1 foo one\n", | |
"2 foo two\n", | |
"3 bar two\n", | |
"4 bar one\n", | |
"5 foo one" | |
], | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>A</th>\n", | |
" <th>B</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>foo</td>\n", | |
" <td>one</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>foo</td>\n", | |
" <td>one</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>foo</td>\n", | |
" <td>two</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>bar</td>\n", | |
" <td>two</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>bar</td>\n", | |
" <td>one</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>foo</td>\n", | |
" <td>one</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"execution_count": 1 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-09-14T13:16:41.221688Z", | |
"start_time": "2024-09-14T13:16:41.207416Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Crosstab without normalizing and margins\n", | |
"pd.crosstab(df['A'], df['B'], normalize=False, margins=True)" | |
], | |
"id": "103e126ac4582906", | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"B one two All\n", | |
"A \n", | |
"bar 1 1 2\n", | |
"foo 3 1 4\n", | |
"All 4 2 6" | |
], | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>B</th>\n", | |
" <th>one</th>\n", | |
" <th>two</th>\n", | |
" <th>All</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>A</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>bar</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>foo</th>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>All</th>\n", | |
" <td>4</td>\n", | |
" <td>2</td>\n", | |
" <td>6</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"execution_count": 2 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-09-14T13:16:41.247265Z", | |
"start_time": "2024-09-14T13:16:41.239711Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Crosstab without normalizing and without margins\n", | |
"pd.crosstab(df['A'], df['B'], normalize=False, margins=False)" | |
], | |
"id": "5e85abc924ac916e", | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"B one two\n", | |
"A \n", | |
"bar 1 1\n", | |
"foo 3 1" | |
], | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>B</th>\n", | |
" <th>one</th>\n", | |
" <th>two</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>A</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>bar</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>foo</th>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"execution_count": 3 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-09-14T13:16:41.324696Z", | |
"start_time": "2024-09-14T13:16:41.317936Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Crosstab with normalize = 0\n", | |
"pd.crosstab(df['A'], df['B'], normalize=0, margins=False)" | |
], | |
"id": "a60cf8865627371b", | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"B one two\n", | |
"A \n", | |
"bar 0.50 0.50\n", | |
"foo 0.75 0.25" | |
], | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>B</th>\n", | |
" <th>one</th>\n", | |
" <th>two</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>A</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>bar</th>\n", | |
" <td>0.50</td>\n", | |
" <td>0.50</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>foo</th>\n", | |
" <td>0.75</td>\n", | |
" <td>0.25</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"execution_count": 4 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-09-14T13:16:41.366558Z", | |
"start_time": "2024-09-14T13:16:41.359479Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Crosstab with normalize = 'index' \n", | |
"pd.crosstab(df['A'], df['B'], normalize='index', margins=False)" | |
], | |
"id": "96d313940c54f141", | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"B one two\n", | |
"A \n", | |
"bar 0.50 0.50\n", | |
"foo 0.75 0.25" | |
], | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>B</th>\n", | |
" <th>one</th>\n", | |
" <th>two</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>A</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>bar</th>\n", | |
" <td>0.50</td>\n", | |
" <td>0.50</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>foo</th>\n", | |
" <td>0.75</td>\n", | |
" <td>0.25</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"execution_count": 5 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-09-14T13:16:41.471924Z", | |
"start_time": "2024-09-14T13:16:41.465515Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Crosstab with normalize = 1\n", | |
"pd.crosstab(df['A'], df['B'], normalize=1, margins=False)" | |
], | |
"id": "12f42e3363033d98", | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"B one two\n", | |
"A \n", | |
"bar 0.25 0.5\n", | |
"foo 0.75 0.5" | |
], | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>B</th>\n", | |
" <th>one</th>\n", | |
" <th>two</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>A</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>bar</th>\n", | |
" <td>0.25</td>\n", | |
" <td>0.5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>foo</th>\n", | |
" <td>0.75</td>\n", | |
" <td>0.5</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"execution_count": 6 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-09-14T13:16:41.522138Z", | |
"start_time": "2024-09-14T13:16:41.516239Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Crosstab with normalize = 'column' \n", | |
"pd.crosstab(df['A'], df['B'], normalize='columns', margins=False)" | |
], | |
"id": "20765830722c72d5", | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"B one two\n", | |
"A \n", | |
"bar 0.25 0.5\n", | |
"foo 0.75 0.5" | |
], | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>B</th>\n", | |
" <th>one</th>\n", | |
" <th>two</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>A</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>bar</th>\n", | |
" <td>0.25</td>\n", | |
" <td>0.5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>foo</th>\n", | |
" <td>0.75</td>\n", | |
" <td>0.5</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"execution_count": 7 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-09-14T13:16:41.632009Z", | |
"start_time": "2024-09-14T13:16:41.624016Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Crosstab with normalize = 'all' \n", | |
"pd.crosstab(df['A'], df['B'], normalize='all', margins=False)" | |
], | |
"id": "854764ba51960134", | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"B one two\n", | |
"A \n", | |
"bar 0.166667 0.166667\n", | |
"foo 0.500000 0.166667" | |
], | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>B</th>\n", | |
" <th>one</th>\n", | |
" <th>two</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>A</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>bar</th>\n", | |
" <td>0.166667</td>\n", | |
" <td>0.166667</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>foo</th>\n", | |
" <td>0.500000</td>\n", | |
" <td>0.166667</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"execution_count": 8 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2024-09-14T13:16:41.676628Z", | |
"start_time": "2024-09-14T13:16:41.670031Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Crosstab with normalize = true\n", | |
"pd.crosstab(df['A'], df['B'], normalize=True, margins=False)" | |
], | |
"id": "723d15edd9afab0", | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"B one two\n", | |
"A \n", | |
"bar 0.166667 0.166667\n", | |
"foo 0.500000 0.166667" | |
], | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>B</th>\n", | |
" <th>one</th>\n", | |
" <th>two</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>A</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>bar</th>\n", | |
" <td>0.166667</td>\n", | |
" <td>0.166667</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>foo</th>\n", | |
" <td>0.500000</td>\n", | |
" <td>0.166667</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"execution_count": 9 | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment