Skip to content

Instantly share code, notes, and snippets.

@vaclavdekanovsky
Last active October 5, 2020 22:09
Show Gist options
  • Select an option

  • Save vaclavdekanovsky/82003df3a39b76f77b8c1c40c1f54016 to your computer and use it in GitHub Desktop.

Select an option

Save vaclavdekanovsky/82003df3a39b76f77b8c1c40c1f54016 to your computer and use it in GitHub Desktop.
How to create a histogram using Plotly.Express Bar chart
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Customize histogram with own calculation and px.bar()"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import plotly.express as px\n",
"import pandas as pd\n",
"import numpy as np\n",
"# pd.options.mode.chained_assignment = None"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Tourism data\n",
"long_df = pd.read_pickle(\"../Preprocess/long.plk\")\n",
"yr2018 = long_df[long_df[\"years\"]==\"2018\"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# create 19 bins starting with 0 up to 90M\n",
"bins = np.linspace(0, 90_000_000, 19)\n",
"\n",
"# use pd.cut to create the bins. In order to include zero, `include_lowest` is set to True\n",
"yr2018[\"hist\"] = pd.cut(yr2018[\"visitors\"], bins, include_lowest=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"IntervalIndex([(-0.001, 5000000.0], (5000000.0, 10000000.0], (10000000.0, 15000000.0], (15000000.0, 20000000.0], (20000000.0, 25000000.0] ... (65000000.0, 70000000.0], (70000000.0, 75000000.0], (75000000.0, 80000000.0], (80000000.0, 85000000.0], (85000000.0, 90000000.0]],\n",
" closed='right',\n",
" dtype='interval[float64]')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pd.cut creates an interval category which is sorted from lowest bin to the greatest bin\n",
"yr2018[\"hist\"].cat.categories"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>bins</th>\n",
" <th>hist</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>(-0.001, 5000000.0]</td>\n",
" <td>157</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>(5000000.0, 10000000.0]</td>\n",
" <td>17</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>(10000000.0, 15000000.0]</td>\n",
" <td>12</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" bins hist\n",
"0 (-0.001, 5000000.0] 157\n",
"1 (5000000.0, 10000000.0] 17\n",
"2 (10000000.0, 15000000.0] 12"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# count the values in each bin. Bins are sorted based on the occurance (from most populated to the least one)\n",
"agg = yr2018[\"hist\"].value_counts()\n",
"\n",
"# sort the values according to the bins (`sort_index`), turn into data frame (`to_frame`) and reset index\n",
"agg = agg.sort_index().to_frame().reset_index()\n",
"\n",
"# rename index (containing the bin range e.g. \"(5000000.0, 10000000.0]\" to bins)\n",
"agg.rename(columns={\"index\":\"bins\"}, inplace=True)\n",
"\n",
"# Plotly cannot work with categories index, so we need to turn it into string\n",
"agg[\"bins\"] = agg[\"bins\"].astype(\"str\")\n",
"\n",
"agg.head(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# now we can use the aggregated values in the plotly bar chart\n",
"fig = px.bar(agg, x=\"bins\", y=\"hist\", text=\"hist\",\n",
" title=\"Histogram using pd.cut and px.bar\", \n",
" labels={\"hist\":\"count\"})\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to display the just the bin-border numbers and not the bin ranges, let's get the border values into a separate column using `pd.cut(df, bin, labels=bins[1:])`. If `bins` variable is [0,1,2] then `bins[1:]` is [1,2]. This way `plotly` assigns the counts to the higher boundary of the bin, but the bar chart will display this number in the middle of the bar, which is exactly the same way, how `px.histogram()` is doing it. "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# bin under the bins higher boundary using labels argument\n",
"yr2018[\"hist_border\"] = pd.cut(yr2018[\"visitors\"], bins=bins, labels=bins[1:], include_lowest=True)\n",
"# bins containing both lower and higher boundary\n",
"yr2018[\"bins\"] = pd.cut(yr2018[\"visitors\"], bins=bins, include_lowest=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>bins</th>\n",
" <th>hist_border</th>\n",
" <th>values</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>(-0.001, 5000000.0]</td>\n",
" <td>5000000.0</td>\n",
" <td>157.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>(-0.001, 5000000.0]</td>\n",
" <td>10000000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>(-0.001, 5000000.0]</td>\n",
" <td>15000000.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" bins hist_border values\n",
"0 (-0.001, 5000000.0] 5000000.0 157.0\n",
"1 (-0.001, 5000000.0] 10000000.0 NaN\n",
"2 (-0.001, 5000000.0] 15000000.0 NaN"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# aggregate through boths the bins and the higher_boundary\n",
"agg = yr2018.groupby([\"bins\",\"hist_border\"]).count()[\"Country Name\"]\n",
"agg = agg.sort_index().to_frame().reset_index()\n",
"agg[\"bins\"] = agg[\"bins\"].astype(\"str\")\n",
"agg.rename(columns={\"Country Name\":\"values\"}, inplace=True)\n",
"agg.head(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# plot the bar charts as usuall via px.bar\n",
"fig = px.bar(agg, x=\"hist_border\", y=\"values\", text=\"values\",\n",
" title=\"Histogram using pd.cut with labels and px.bar\", \n",
" hover_data={\"bins\":True})\n",
"\n",
"# remove the gaps between the bars\n",
"fig.update_layout(bargap=0)\n",
"\n",
"# show the image\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Split the bars based on a category (e.g. Region)\n",
"Achieve this by grouping the data, by both the bins and the categorical value `Region`"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>bins</th>\n",
" <th>hist_border</th>\n",
" <th>Region</th>\n",
" <th>visitors</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>(-0.001, 5000000.0]</td>\n",
" <td>5000000.0</td>\n",
" <td>East Asia &amp; Pacific</td>\n",
" <td>24.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>(-0.001, 5000000.0]</td>\n",
" <td>5000000.0</td>\n",
" <td>Europe &amp; Central Asia</td>\n",
" <td>28.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>(-0.001, 5000000.0]</td>\n",
" <td>5000000.0</td>\n",
" <td>Latin America &amp; Caribbean</td>\n",
" <td>37.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>(-0.001, 5000000.0]</td>\n",
" <td>5000000.0</td>\n",
" <td>Middle East &amp; North Africa</td>\n",
" <td>13.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>(-0.001, 5000000.0]</td>\n",
" <td>5000000.0</td>\n",
" <td>North America</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2263</th>\n",
" <td>(85000000.0, 90000000.0]</td>\n",
" <td>90000000.0</td>\n",
" <td>Latin America &amp; Caribbean</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2264</th>\n",
" <td>(85000000.0, 90000000.0]</td>\n",
" <td>90000000.0</td>\n",
" <td>Middle East &amp; North Africa</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2265</th>\n",
" <td>(85000000.0, 90000000.0]</td>\n",
" <td>90000000.0</td>\n",
" <td>North America</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2266</th>\n",
" <td>(85000000.0, 90000000.0]</td>\n",
" <td>90000000.0</td>\n",
" <td>South Asia</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2267</th>\n",
" <td>(85000000.0, 90000000.0]</td>\n",
" <td>90000000.0</td>\n",
" <td>Sub-Saharan Africa</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2268 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" bins hist_border Region \\\n",
"0 (-0.001, 5000000.0] 5000000.0 East Asia & Pacific \n",
"1 (-0.001, 5000000.0] 5000000.0 Europe & Central Asia \n",
"2 (-0.001, 5000000.0] 5000000.0 Latin America & Caribbean \n",
"3 (-0.001, 5000000.0] 5000000.0 Middle East & North Africa \n",
"4 (-0.001, 5000000.0] 5000000.0 North America \n",
"... ... ... ... \n",
"2263 (85000000.0, 90000000.0] 90000000.0 Latin America & Caribbean \n",
"2264 (85000000.0, 90000000.0] 90000000.0 Middle East & North Africa \n",
"2265 (85000000.0, 90000000.0] 90000000.0 North America \n",
"2266 (85000000.0, 90000000.0] 90000000.0 South Asia \n",
"2267 (85000000.0, 90000000.0] 90000000.0 Sub-Saharan Africa \n",
"\n",
" visitors \n",
"0 24.0 \n",
"1 28.0 \n",
"2 37.0 \n",
"3 13.0 \n",
"4 1.0 \n",
"... ... \n",
"2263 0.0 \n",
"2264 0.0 \n",
"2265 0.0 \n",
"2266 0.0 \n",
"2267 0.0 \n",
"\n",
"[2268 rows x 4 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agg = yr2018.groupby([\"hist\",\"hist_border\",\"Region\"]).count()[\"visitors\"].to_frame().reset_index()\n",
"agg.rename(columns={\"hist\":\"bins\"}, inplace=True)\n",
"agg[\"bins\"] = agg[\"bins\"].astype(\"str\")\n",
"agg[\"visitors\"] = agg[\"visitors\"].fillna(0)\n",
"agg"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig = px.bar(agg, x=\"bins\", y=\"visitors\", color=\"Region\", text=\"visitors\", \n",
" title=\"Histogram using pd.cut and px.bar\", )\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig = px.bar(agg, x=\"hist_border\", y=\"visitors\", color=\"Region\", text=\"visitors\", \n",
" title=\"Histogram using pd.cut and px.bar\", )\n",
"fig.update_layout(bargap=0)\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Simulate the histogram with numpy.histogram\n",
"This is possibly the most stright forward method, because `np.histogram` counts the values in each bin, while keeping the order of the bins. When the results are turned into a data frame, you can easily feed Plotly with it."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"counts, bins = np.histogram(yr2018[\"visitors\"], bins=bins)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>bins</th>\n",
" <th>counts</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5000000.0</td>\n",
" <td>157</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10000000.0</td>\n",
" <td>17</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>15000000.0</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>80000000.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>85000000.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>90000000.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" bins counts\n",
"0 5000000.0 157\n",
"1 10000000.0 17\n",
"2 15000000.0 12\n",
"15 80000000.0 1\n",
"16 85000000.0 1\n",
"17 90000000.0 1"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# explude the first values from the bins (it's the starting point)\n",
"df = pd.DataFrame({\"bins\":bins[1:], \"counts\":counts})\n",
"pd.concat([df.head(3),df.tail(3)])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig = px.bar(df, x=\"bins\", y=\"counts\", text=\"counts\", title=\"Histogram simulation via px.bar\")\n",
"fig.update_layout(bargap=0)\n",
"fig.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment