Skip to content

Instantly share code, notes, and snippets.

@databyjp
Created February 25, 2022 03:42
Show Gist options
  • Save databyjp/c7ac6a67d2f30bc048fff50b28b73878 to your computer and use it in GitHub Desktop.
Save databyjp/c7ac6a67d2f30bc048fff50b28b73878 to your computer and use it in GitHub Desktop.
Add on-court players columns
def add_pbp_oncourt_columns(df):
"""
Add on-court player columns to the play-by-play dataframe.
Players based on substitution data and box-score data (for starters)
:param df: PBP dataframe
:return:
"""
df = df.sort_values(["GAME_ID", "actionNumber"])
df = df.reset_index(drop=True)
box_df = load_box_scores(data="player")
gm_dfs = list()
for gm_id in df["GAME_ID"].unique():
logger.info(f"Processing game {gm_id}")
gm_df = df[df["GAME_ID"] == gm_id]
tm_ids = [i for i in gm_df["teamId"].unique() if not np.isnan(i)]
for tm_i in range(2):
tm_id = tm_ids[tm_i]
tm_df = gm_df[gm_df["teamId"] == tm_id]
starter_list = box_df[
(box_df["TEAM_ID"] == tm_id) & (box_df["GAME_ID"] == gm_id) & (box_df["START_POSITION"] != "")
]["PLAYER_ID"].unique().tolist()
for i in range(5):
tm_df.loc[:, "player" + str(i + 1)] = starter_list[i]
subout_buffer = list()
subin_buffer = list()
for row in tm_df.itertuples():
if row.actionType == "substitution":
if row.subType == "out":
subout_buffer.append(row.personId)
else:
subin_buffer.append(row.personId)
if len(subin_buffer) > 0 and len(subout_buffer) > 0:
subout = subout_buffer.pop(0)
subin = subin_buffer.pop(0)
for j in range(5):
tmpcol = "player" + str(j + 1)
if getattr(row, tmpcol) == subout:
tm_df.loc[tm_df["actionNumber"] >= row.actionNumber, tmpcol] = subin
if len(subout_buffer) != 0 or len(subin_buffer) != 0:
logger.warning(
f"Something went wrong parsing {gm_id} for {tm_id}! subin_buffer: {subin_buffer}, subout_buffer: {subout_buffer}")
tm_df.rename({"player" + str(j + 1): f"tm_{tm_i}_player" + str(j + 1) for j in range(5)}, axis=1,
inplace=True)
gm_df = pd.merge(
gm_df,
tm_df[["actionNumber"] + [f"tm_{tm_i}_player" + str(j + 1) for j in range(5)]],
left_on="actionNumber",
right_on="actionNumber",
how="left",
)
for tm_i in range(2):
for j in range(5):
gm_df[f"tm_{tm_i}_player{j + 1}"] = gm_df[f"tm_{tm_i}_player{j + 1}"].ffill().bfill()
gm_dfs.append(gm_df)
proc_df = pd.concat(gm_dfs)
for pl_c in [c for c in proc_df.columns if "_player" in c]:
proc_df[pl_c] = proc_df[pl_c].astype(int)
return proc_df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment