Created
July 21, 2017 19:02
-
-
Save magnusnissel/b486201f7613d03b7b0fd823fb36febd to your computer and use it in GitHub Desktop.
Initial basic attempt at lineup parsing (from NBA play-by-play logs)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import glob | |
import pandas as pd | |
def check_lineup(r):
    """Flag a row whose away or home lineup has an empty slot.

    Args:
        r: A row-like mapping (for example a pandas ``Series`` passed by
            ``DataFrame.apply(..., axis=1)``) holding the lineup columns
            ``A1``..``A5`` and ``H1``..``H5``. An empty string marks an
            unfilled slot.

    Returns:
        The same ``r`` object. When any of the ten slots is empty,
        ``LINEUP_ERROR`` is set to ``True`` and ``LINEUP_STATUS`` to
        ``"Not enough players"``; otherwise ``r`` is returned untouched.
    """
    a_cols = ["A1", "A2", "A3", "A4", "A5"]
    h_cols = ["H1", "H2", "H3", "H4", "H5"]
    # Test every slot directly. Filtering the empty slots out first (as the
    # previous version did) leaves only truthy entries, so the check could
    # never fail.
    away_full = all(r[c] != "" for c in a_cols)
    home_full = all(r[c] != "" for c in h_cols)
    if not (away_full and home_full):
        r["LINEUP_ERROR"] = True
        r["LINEUP_STATUS"] = "Not enough players"
    return r
def identify_first_open(slots):
    """Return the 1-based position of the first empty (``""``) slot.

    Args:
        slots: Iterable of slot values in slot order; ``""`` marks an open
            slot.

    Returns:
        The 1-based position of the first open slot. When no slot is open,
        the position of the last slot is returned, and ``0`` is returned for
        an empty iterable (previously that case raised
        ``UnboundLocalError``).

    Note:
        A completely full lineup and a lineup whose only open slot is the
        last one both yield the same number. Callers that need to tell the
        two apart must check for ``""`` themselves.
    """
    position = 0  # stays 0 when ``slots`` yields nothing
    for position, occupant in enumerate(slots, start=1):
        if occupant == "":
            return position
    return position
def get_player_slot(p, slots):
    """Find which slot currently holds player ``p``.

    Args:
        p: The player value to look for.
        slots: Iterable of slot values in slot order.

    Returns:
        The 1-based position of the first slot equal to ``p``, or ``None``
        when no slot matches.
    """
    matches = (
        number
        for number, occupant in enumerate(slots, start=1)
        if occupant == p
    )
    return next(matches, None)
def parse_lineups(df, away, home, box_df=None):
    """Annotate every play-by-play row with the lineups on the floor.

    Ten columns are added: ``A1``..``A5`` for the away team and
    ``H1``..``H5`` for the home team. The lineup is carried forward row by
    row and updated from each event:

    * rows whose description contains ``"SUB"`` replace the leaving player
      (``PLAYER1``) with the entering player (``PLAYER2``);
    * on every other row, any of ``PLAYER1``..``PLAYER3`` not yet in their
      team's lineup is put into the first open slot.

    Lineup cells hold player *names*: starters come from ``PLAYER_NAME`` and
    event players from ``PLAYERn_NAME``. Substitutions are therefore matched
    by name as well; matching by ``PLAYERn_ID`` could never find a player
    that was stored by name.

    Args:
        df: Play-by-play frame. Must provide ``HOMEDESCRIPTION``,
            ``VISITORDESCRIPTION`` and, for n in 1..3, ``PLAYERn_NAME`` and
            ``PLAYERn_TEAM_ABBREVIATION``. The caller's frame is not
            modified; a copy is annotated and returned.
        away: Team abbreviation of the away team.
        home: Team abbreviation of the home team.
        box_df: Optional box-score frame with ``START_POSITION``,
            ``TEAM_ABBREVIATION`` and ``PLAYER_NAME``. Rows with a non-null
            ``START_POSITION`` seed the lineups on the first row.

    Returns:
        The annotated frame, with the lineup columns, a ``DESC`` column
        (home plus visitor description) and ``LINEUP_ERROR`` /
        ``LINEUP_STATUS`` columns. Rows flagged with an error are also
        printed.
    """
    a_cols = ["A1", "A2", "A3", "A4", "A5"]
    h_cols = ["H1", "H2", "H3", "H4", "H5"]
    ln_cols = a_cols + h_cols

    def side_for(team):
        """Map a team abbreviation to its slot prefix and slot columns."""
        if team == away:
            return "A", a_cols
        if team == home:
            return "H", h_cols
        return "", []

    # Work on a copy so the caller's frame is not left half-annotated.
    df = df.copy()
    for c in ln_cols:
        df[c] = ""

    if box_df is not None and len(df) > 0:
        # Starters go on the first row; every later row inherits from it.
        first_label = df.index[0]
        starters = box_df[box_df["START_POSITION"].notnull()]
        filled = {"A": 0, "H": 0}
        for _, starter in starters.iterrows():
            prefix, cols = side_for(starter["TEAM_ABBREVIATION"])
            # Ignore other teams, and surplus starters that would otherwise
            # create a sixth slot column.
            if not cols or filled[prefix] >= len(cols):
                continue
            df.loc[first_label, cols[filled[prefix]]] = starter["PLAYER_NAME"]
            filled[prefix] += 1

    df = df.fillna("")
    df["DESC"] = df["HOMEDESCRIPTION"] + df["VISITORDESCRIPTION"]
    if "LINEUP_ERROR" not in df.columns:
        df["LINEUP_ERROR"] = False
    if "LINEUP_STATUS" not in df.columns:
        df["LINEUP_STATUS"] = ""

    # The running lineup is kept in a dict instead of being re-read from the
    # previous row by label, so the index need not be consecutive integers.
    lineup = None
    for idx, row in df.iterrows():
        if lineup is None:
            lineup = {c: row[c] for c in ln_cols}

        if "SUB" in row["DESC"]:
            # P2 enters, P1 leaves
            prefix, cols = side_for(row["PLAYER1_TEAM_ABBREVIATION"])
            leaving = row["PLAYER1_NAME"]
            entering = row["PLAYER2_NAME"]
            if cols and leaving != "":
                pos = get_player_slot(leaving, [lineup[c] for c in cols])
                if pos is not None:
                    lineup["{}{}".format(prefix, pos)] = entering
        else:
            for p in ["PLAYER1", "PLAYER2", "PLAYER3"]:
                name = row["{}_NAME".format(p)]
                prefix, cols = side_for(row["{}_TEAM_ABBREVIATION".format(p)])
                if not cols or name == "":
                    continue
                on_court = [lineup[c] for c in cols]
                if name in on_court:
                    continue
                if "" not in on_court:
                    # trying to fit too many players: flag the row and keep
                    # the current five rather than overwrite one of them
                    df.loc[idx, "LINEUP_ERROR"] = True
                    df.loc[idx, "LINEUP_STATUS"] = "Too many players"
                    continue
                first_open = identify_first_open(on_court)
                lineup["{}{}".format(prefix, first_open)] = name

        df.loc[idx, ln_cols] = [lineup[c] for c in ln_cols]

    df = df.apply(check_lineup, axis=1)
    for _, row in df.iterrows():
        if bool(row["LINEUP_ERROR"]):
            print(row["LINEUP_STATUS"])
            print(row[ln_cols])
    return df
# The DataFrame-level parser defined earlier in this file is also called
# ``parse_lineups``. Keep a private handle on it before the name is rebound
# below; otherwise the call inside this function would recurse into itself.
_parse_lineups_frame = parse_lineups


def parse_lineups(pbp_path, box_path=None, away=None, home=None):
    """Load play-by-play (and optional box-score) CSVs and parse lineups.

    Args:
        pbp_path: Path of the play-by-play CSV; its first column is used as
            the index.
        box_path: Optional path of the box-score CSV (first column used as
            the index). When ``None`` no starters are seeded.
        away: Away team abbreviation. Required; it only has a default
            because it follows the optional ``box_path`` positionally.
        home: Home team abbreviation. Required, same reason as ``away``.

    Returns:
        The frame produced by the DataFrame-level lineup parser.

    Raises:
        ValueError: If ``away`` or ``home`` is not supplied.
    """
    if away is None or home is None:
        raise ValueError("away and home team abbreviations are required")
    pdf = pd.read_csv(pbp_path, index_col=0)
    bdf = pd.read_csv(box_path, index_col=0) if box_path is not None else None
    return _parse_lineups_frame(pdf, away, home, bdf)
if __name__ == "__main__":
    # Script entry point. The previous version referenced an undefined
    # ``csv_path`` and passed no team abbreviations, so the inputs are now
    # taken from the command line.
    import argparse

    parser = argparse.ArgumentParser(
        description="Parse on-court lineups from an NBA play-by-play CSV."
    )
    parser.add_argument("pbp_path", help="path of the play-by-play CSV")
    parser.add_argument("away", help="away team abbreviation")
    parser.add_argument("home", help="home team abbreviation")
    parser.add_argument(
        "--box-path",
        default=None,
        help="optional path of the box-score CSV used to seed the starters",
    )
    args = parser.parse_args()
    parse_lineups(args.pbp_path, args.box_path, args.away, args.home)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment