Last active
March 1, 2019 15:36
-
-
Save MaxHalford/142a724679226ca8504726c9be081a17 to your computer and use it in GitHub Desktop.
NCAA rating formulas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Rating percentage index (RPI). | |
Example taken from https://www.wikiwand.com/en/Rating_percentage_index#/Basketball_formula | |
""" | |
# Toy schedule used to exercise the RPI code: one row per game, described from
# the side of the team listed first (T1).
X = pd.DataFrame(
    [
        (2010, 'UConn', 64, 'Kansas', 57),
        (2010, 'UConn', 82, 'Duke', 68),
        (2010, 'Wisconsin', 71, 'UConn', 72),
        (2010, 'Kansas', 69, 'UConn', 62),
        (2010, 'Duke', 81, 'Wisconsin', 70),
        (2010, 'Wisconsin', 52, 'Kansas', 62),
    ],
    columns=['Season', 'T1', 'T1_Score', 'T2', 'T2_Score'],
)

# Append a mirrored copy of every game (T1 and T2 swapped) so each game shows
# up once from each team's point of view. Row labels are not reset, so every
# label of the original frame occurs twice in the result.
X = pd.concat([
    X,
    X.rename(columns={
        'T1': 'T2',
        'T2': 'T1',
        'T1_Score': 'T2_Score',
        'T2_Score': 'T1_Score',
    })[X.columns.tolist()],
])
def victory(x):
    """Flag, row by row, the games in which T1 strictly outscored T2."""
    return x['T1_Score'] > x['T2_Score']


# Lookup tables derived from X. Each one is a plain dict so the calc_*
# functions can index it directly with tuple keys.

# {'mean': {(Season, T1): win rate}, 'count': {(Season, T1): games played}}
win_rates = (
    X.assign(victory=victory)
    .groupby(['Season', 'T1'])['victory']
    .agg(['mean', 'count'])
    .to_dict()
)

# {'sum': {(Season, T1, T2): wins of T1 over T2},
#  'count': {(Season, T1, T2): games between T1 and T2}}
matchups = (
    X.assign(victory=victory)
    .groupby(['Season', 'T1', 'T2'])['victory']
    .agg(['sum', 'count'])
    .to_dict()
)

# {(Season, T1): number of rows in X with that team as T1}
n_matches = (
    X.groupby(['Season', 'T1'])
    .size()
    .to_dict()
)

# {(Season, T1): array of the distinct T2 values that team was paired with}
opponents = (
    X.groupby(['Season', 'T1'])['T2']
    .unique()
    .to_dict()
)
def update_mean(mean, count, removed_sum, removed_count):
    """Return the mean of a sample after taking some observations out of it.

    The sample is described by its ``mean`` and ``count``; the observations
    being removed are described by their total ``removed_sum`` and their
    number ``removed_count``.

    Raises ZeroDivisionError (for plain Python numbers) when
    ``removed_count`` equals ``count``, i.e. when nothing is left.
    """
    remaining_sum = mean * count - removed_sum
    remaining_count = count - removed_count
    return remaining_sum / remaining_count
def calc_wp(season, team):
    """Return the winning percentage (WP) of ``team`` in ``season``.

    This is a straight lookup in the module-level ``win_rates`` table and
    raises KeyError when the (season, team) pair is not in it.
    """
    key = (season, team)
    return win_rates['mean'][key]
def calc_owp(season, team):
    """Return the opponents' winning percentage (OWP) of ``team`` in ``season``.

    For every opponent, the opponent's win rate is recomputed with its games
    against ``team`` taken out (via ``update_mean``), then weighted by the
    number of games the two played. The weighted total is divided by the
    number of games ``team`` played.

    Reads the module-level ``win_rates``, ``matchups``, ``n_matches`` and
    ``opponents`` tables.
    """
    scale = 1 / n_matches[(season, team)]

    weighted_rates = []
    for rival in opponents[(season, team)]:
        rival_key = (season, rival)
        # Keyed from the rival's side: its wins over / games against `team`.
        head_to_head = (season, rival, team)
        rate_without_team = update_mean(
            mean=win_rates['mean'][rival_key],
            count=win_rates['count'][rival_key],
            removed_sum=matchups['sum'][head_to_head],
            removed_count=matchups['count'][head_to_head],
        )
        weighted_rates.append(rate_without_team * matchups['count'][head_to_head])

    return scale * sum(weighted_rates)
def calc_oowp(season, team, owps):
    """Return the opponents' opponents' winning percentage (OOWP) of ``team``.

    ``owps`` maps each team of ``season`` to its OWP. Every opponent's OWP is
    weighted by the number of games it played against ``team``, and the
    weighted total is divided by the number of games ``team`` played.

    Reads the module-level ``matchups``, ``n_matches`` and ``opponents``
    tables.
    """
    scale = 1 / n_matches[(season, team)]
    weighted_owps = [
        owps[rival] * matchups['count'][(season, rival, team)]
        for rival in opponents[(season, team)]
    ]
    return scale * sum(weighted_owps)
def calc_rpi(wp, owp, oowp, wp_weight=.25, owp_weight=.5, oowp_weight=.25):
    """Combine WP, OWP and OOWP into a rating percentage index.

    Args:
        wp: Winning percentage of the team.
        owp: Opponents' winning percentage.
        oowp: Opponents' opponents' winning percentage.
        wp_weight: Weight applied to ``wp``.
        owp_weight: Weight applied to ``owp``.
        oowp_weight: Weight applied to ``oowp``.

    Returns:
        The weighted sum of the three components. With the default weights
        this is ``0.25 * wp + 0.5 * owp + 0.25 * oowp``; pass other weights
        to use a different weighting scheme. The weights are used as given
        and are not normalised.
    """
    return wp * wp_weight + owp * owp_weight + oowp * oowp_weight
# Seasons and per-season team lists are read from X, the same frame the
# lookup tables used by calc_wp / calc_owp / calc_oowp are keyed from, so
# every (season, team) pair requested below exists in those tables.
seasons = X['Season'].unique()

# Compute each season's team list once instead of re-filtering the frame for
# every one of the four passes below.
season_teams = {
    season: X.loc[X['Season'] == season, 'T1'].unique()
    for season in seasons
}

# {season: {team: winning percentage}}
wps = {
    season: {team: calc_wp(season, team) for team in season_teams[season]}
    for season in seasons
}

# {season: {team: opponents' winning percentage}}
owps = {
    season: {team: calc_owp(season, team) for team in season_teams[season]}
    for season in seasons
}

# {season: {team: opponents' opponents' winning percentage}}; needs the OWPs
# of the whole season, hence the separate pass.
oowps = {
    season: {
        team: calc_oowp(season, team, owps[season])
        for team in season_teams[season]
    }
    for season in seasons
}

# {season: {team: RPI}}
rpis = {
    season: {
        team: calc_rpi(wps[season][team], owps[season][team], oowps[season][team])
        for team in season_teams[season]
    }
    for season in seasons
}

# One column per season, one row per team, then stacked into a Series indexed
# by (team, season).
rpis = pd.DataFrame.from_dict(rpis, orient='columns').stack()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Simple rating system (SRS). | |
""" | |
from scipy import optimize | |
# Toy schedule used to exercise the SRS code: one row per game, described
# from the side of the team listed first (T1).
X = pd.DataFrame(
    data=[
        (2010, 'UConn', 64, 'Kansas', 57),
        (2010, 'UConn', 82, 'Duke', 68),
        (2010, 'Wisconsin', 71, 'UConn', 72),
        (2010, 'Kansas', 69, 'UConn', 62),
        (2010, 'Duke', 81, 'Wisconsin', 70),
        (2010, 'Wisconsin', 52, 'Kansas', 62),
    ],
    columns=['Season', 'T1', 'T1_Score', 'T2', 'T2_Score'],
)

# {season: Series of ratings indexed by team}
srss = {}

for season in X['Season'].unique():
    # Filter the same frame the seasons were read from.
    season_results = X[X['Season'] == season]

    # Every team that played this season, whether it is listed as T1 or T2.
    # T1 names come first, so the order is unchanged when every team already
    # appears as T1; teams that only ever appear as T2 are appended.
    teams = pd.concat([season_results['T1'], season_results['T2']]).unique()

    # Design matrix: one row per game, one column per team, holding +1 for the
    # game's T1, -1 for its T2 and 0 for everyone else.
    G = pd.concat(
        [
            (
                (season_results['T1'] == team).astype(int)
                - (season_results['T2'] == team).astype(int)
            ).rename(team)
            for team in teams
        ],
        axis='columns',
    )

    # Score margin of each game, from T1's side.
    S = season_results['T1_Score'] - season_results['T2_Score']

    # Least-squares ratings such that rating(T1) - rating(T2) approximates the
    # margin. Each row of G sums to zero, so the ratings are only determined
    # up to a common additive constant; which representative comes back is
    # left to the solver.
    R = optimize.lsq_linear(G, S).x
    srss[season] = pd.Series(R, index=teams)

# One column per season, one row per team, then stacked into a Series indexed
# by (team, season).
srss = pd.DataFrame.from_dict(srss).stack()
# Attach the rating of each side of the game: `srss` is indexed by
# (team, season), so it is matched on the (T1, Season) columns and then on
# the (T2, Season) columns of df.
df = (
    df
    .join(srss.rename('T1_srs'), on=['T1', 'Season'])
    .join(srss.rename('T2_srs'), on=['T2', 'Season'])
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For the 2019 NCAA Kaggle women's competition, I found that the RPI calculated this way had a 0.91 correlation with the team seeds. I didn't take home and away weighting into account.