Last active
June 30, 2018 03:49
-
-
Save ipeirotis/10e6b7b42f1d17030fb5aaa15a904312 to your computer and use it in GitHub Desktop.
Plot the distribution of salaries in baseball leagues over time
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Long version with full comments
# We want to plot the distribution of salaries in baseball leagues over time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Download a dataset from Lahman's database of baseball statistics
# with the salaries of baseball players, over the years from 1985 until 2016,
# together with the team of the player, and the league in which the team is playing.
df = pd.read_csv("https://raw.githubusercontent.com/chadwickbureau/baseballdatabank/master/core/Salaries.csv")

# Just making the headers friendlier/readable
rename_dict = {"yearID": "Year", "teamID": "Team", "lgID": "League", "playerID": "Player", "salary": "Salary"}
df.rename(rename_dict, axis='columns', inplace=True)

# Create one figure, with just one subplot
# (figure is the overall thing, ax corresponds to the individual plot[s])
fig, ax = plt.subplots()

# Increase the size of the plot (width, height in inches)
fig.set_size_inches(20, 4)

# Add faint gray grid lines at the major ticks
ax.grid(which='major', color='gray', linewidth=0.25, alpha=0.5)

# Remove salaries of zero (so that we can take the log), and a couple of
# outliers (that is just for beautification). The comparison is strict, so
# rows with a salary of exactly 50000 are dropped as well.
# Specifically, we remove the following entries:
#   6179   1993  NYA  AL  jamesdi01       0
#   6194   1993  NYA  AL  silveda01   10900
#   6659   1994  CHA  AL  carych01    50000
#   9679   1997  FLO  NL  penaal01    50000
#   12007  1999  PIT  NL  martija02       0
# The explicit .copy() makes the filtered frame independent of the original,
# so adding the logSalary column below does not raise pandas'
# SettingWithCopyWarning.
df = df[df.Salary > 50000].copy()

# Take the log (base 10) of the salary so that we can plot the distribution
# of the logSalary over time
# (Alternatively, we can make the y-axis below to be log-scaled)
df['logSalary'] = np.log10(df.Salary)

# Plot the distribution of salaries over time using Violin plots.
# We put the year on the x-axis, and we plot the distribution across the y-axis
# We also use a "split" violin plot: one half of each violin shows the
# distribution for one league and the other half shows the other league
sns.violinplot(
    data=df,
    x='Year',
    y='logSalary',
    color='orange',
    hue='League',
    split=True,
    ax=ax)

# Write the figure to disk, trimming the surrounding whitespace
fig.savefig('baseball.png', bbox_inches='tight')
Author
ipeirotis
commented
Jun 30, 2018
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment