Last active
December 18, 2023 19:21
-
-
Save 903124/3c6f0dc0a100d78b8622573ef4c504f5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
full_name | abbreviation | |
---|---|---|
Abilene Christian | AblCh | |
Air Force | AFA | |
Akron | Akron | |
Alabama | Alab | |
Alabama A&M | AlaAM | |
Albany | Alban | |
Alcorn State | AlcSt | |
Appalachian State | AppSt | |
Arizona | Ariz | |
Arizona State | ArzSt | |
Arkansas | Ark | |
Arkansas State | ArkSt | |
Arkansas-Pine Bluff | ArkPB | |
Army | Army | |
Auburn | Aub | |
Austin Peay | APeay | |
BYU | BYU | |
Ball State | BalSt | |
Baylor | Bayl | |
Bethune-Cookman | BthCk | |
Boise State | BoiSt | |
Boston College | BC | |
Bowling Green | BwGrn | |
Buffalo | Buff | |
California | Cal | |
Campbell | Camp | |
Central Arkansas | CArk | |
Central Connecticut | CConn | |
Central Michigan | CMich | |
Charleston Southern | ChaSo | |
Charlotte | Charl | |
Cincinnati | Cincy | |
Clemson | Clem | |
Coastal Carolina | CCaro | |
Colorado | Colo | |
Colorado State | ColSt | |
Connecticut | UConn | |
Delaware State | DelSt | |
Drake | Drake | |
Duke | Duke | |
Duquesne | Duqsn | |
East Carolina | ECaro | |
Eastern Illinois | EIlln | |
Eastern Kentucky | EKent | |
Eastern Michigan | EMich | |
Eastern Washington | EWash | |
Elon | ElonU | |
Florida | Fla | |
Florida Atlantic | FlAtl | |
Florida International | FlaIn | |
Florida State | FlaSt | |
Fordham | Fordh | |
Fresno State | FrsSt | |
Gardner-Webb | GrdWb | |
Georgia | Geo | |
Georgia Southern | GeoSo | |
Georgia State | GeoSt | |
Georgia Tech | GTech | |
Grambling | Gramb | |
Hawai'i | Hawa | |
Houston | Houst | |
Houston Baptist | HstnB | |
Howard | Howrd | |
Idaho | Idaho | |
Idaho State | IdaSt | |
Illinois | Illin | |
Illinois State | IllSt | |
Indiana | Ind | |
Iowa | Iowa | |
Iowa State | IowSt | |
Jackson State | JckSt | |
James Madison | JMads | |
Kansas | Kans | |
Kansas State | KanSt | |
Kennesaw State | Ksaw | |
Kent State | KntSt | |
Kentucky | Kent | |
LSU | LSU | |
Lafayette | Lafay | |
Lehigh | Lehgh | |
Liberty | Liber | |
Louisiana | LaLaf | |
Louisiana Monroe | LaMon | |
Louisiana Tech | LaTch | |
Louisville | Lvile | |
Maine | Maine | |
Marshall | Marsh | |
Maryland | Mary | |
McNeese | McNSt | |
Memphis | Memph | |
Mercer | Mercr | |
Miami | MiaFl | |
Miami (OH) | MiaOh | |
Michigan | Mich | |
Michigan State | MchSt | |
Middle Tennessee | MTnSt | |
Minnesota | Minn | |
Mississippi State | MisSt | |
Missouri | Misso | |
Missouri State | MoSt | |
Monmouth | MonNJ | |
Murray State | MurrS | |
NC State | NCSt | |
Navy | Navy | |
Nebraska | Neb | |
Nevada | Nevad | |
New Hampshire | NHamp | |
New Mexico | NMex | |
New Mexico State | NMxSt | |
Nicholls | NicSt | |
Norfolk State | NflkS | |
North Carolina | NCaro | |
North Carolina A&T | NCAT | |
North Carolina Central | NCCtl | |
North Texas | NoTex | |
Northern Arizona | NoArz | |
Northern Illinois | NoIll | |
Northern Iowa | NIowa | |
Northwestern | Nwest | |
Northwestern State | NWSt | |
Notre Dame | NDame | |
Ohio | Ohio | |
Ohio State | OhSt | |
Oklahoma | Okla | |
Oklahoma State | OKSt | |
Old Dominion | ODU | |
Ole Miss | Miss | |
Oregon | Oregn | |
Oregon State | OrgSt | |
Penn State | PnSt | |
Pittsburgh | Pitt | |
Portland State | PrtSt | |
Prairie View | PraVw | |
Purdue | Prdue | |
Rhode Island | RIsld | |
Rice | Rice | |
Richmond | Richm | |
Rutgers | Rutgr | |
SMU | SMU | |
San Diego State | SDSt | |
San José State | SJSt | |
Savannah State | SavSt | |
South Alabama | SAlab | |
South Carolina | SCaro | |
South Carolina State | SCSt | |
South Dakota | SDako | |
South Florida | SFla | |
Southeast Missouri State | SEMo | |
Southeastern Louisiana | SELa | |
Southern | Sthrn | |
Southern Mississippi | SoMis | |
Southern Utah | SoUth | |
Stanford | Stanf | |
Stephen F. Austin | SFAus | |
Stony Brook | StBrk | |
Syracuse | Syrac | |
TCU | TCU | |
Temple | Temp | |
Tennessee | Tenn | |
Tennessee State | TenSt | |
Tennessee Tech | TnTch | |
Texas | Texas | |
Texas A&M | TexAM | |
Texas Southern | TexSo | |
Texas State | TexSt | |
Texas Tech | TexTc | |
Toledo | Toled | |
Troy | Troy | |
Tulane | Tulan | |
Tulsa | Tulsa | |
UAB | UAB | |
UC Davis | UCDav | |
UCF | UCF | |
UCLA | UCLA | |
UMass | UMass | |
UNLV | UNLV | |
USC | USC | |
UT Martin | TnMar | |
UT San Antonio | TexSA | |
UTEP | UTEP | |
Utah | Utah | |
Utah State | UthSt | |
VMI | VMI | |
Vanderbilt | Vandy | |
Villanova | Villa | |
Virginia | Virg | |
Virginia Tech | VTech | |
Wake Forest | WFrst | |
Washington | Wash | |
Washington State | WshSt | |
Weber State | WebSt | |
West Virginia | WVirg | |
Western Kentucky | WKent | |
Western Michigan | WMich | |
Wisconsin | Wisc | |
Wofford | Woffd | |
Wyoming | Wyom |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"import re\n", | |
"import requests\n", | |
"import time\n", | |
"from sklearn.ensemble import GradientBoostingClassifier\n", | |
"import joblib\n", | |
"pd.options.display.max_columns = 999" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"This notebook walks through the calculation of expected points added (EPA) for college football in the 2018 season. \n", | |
"Data is collected from https://collegefootballdata.com/" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"drive_data = pd.DataFrame(requests.get('https://api.collegefootballdata.com/drives?seasonType=regular&year=2018').json())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"game_data = pd.DataFrame(requests.get('https://api.collegefootballdata.com/games?year=2018&seasonType=regular').json())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"game_data['game_id'] = game_data['id']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data = pd.merge(drive_data,game_data,on='game_id')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data['drive_id'] = data['id_x']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"play_data = pd.DataFrame()\n", | |
"for i in range(15):\n", | |
" request_df = requests.get('https://api.collegefootballdata.com/plays?seasonType=regular&year=2018&week=' + str(i+1)).json()\n", | |
" time.sleep(2)\n", | |
" play_data = play_data.append(pd.DataFrame(request_df))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pbp_data = pd.merge(play_data,data[['home_team','drive_id']],how='left',on='drive_id')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pbp_data['coef'] = (pbp_data['home_team'] == pbp_data['defense']).astype(int)\n", | |
"pbp_data['adjusted_yardline'] = 100*(1-pbp_data['coef']) + (2*pbp_data['coef']-1)*pbp_data['yard_line'] #yard_line is defined by home team in API" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"We first calculate the expected points of each play using a classifier. \n", | |
"\n", | |
"The target variable is the points scored on a scoring drive (e.g. Touchdown, Field Goal, Safety, Defensive TD) and, for a non-scoring drive (e.g. Punt, Missed FG), the points scored on the opponent's next drive." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data['drive_point'] = data.drive_result.apply(lambda x: 7 if (x == 'TD' or x == 'PUNT TD' or x == 'RUSHING TD' or x == 'PASSING TD') else (3 if (x == 'FG' or x == 'FG GOOD') else (-2 if x == 'SF' else -7 if ( x == 'PUNT RETURN TD' or x == 'MISSED FG TD' or x == 'INT TD' or x == 'FUMBLE RETURN TD' or x == 'DOWNS TD' or x == 'INT RETURN TOUCH' or x == 'FG MISSED TD' or x =='PUNT TD' or x == 'TURNOVER ON DOWNS TD') else 0 )))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data['next_drive_point'] = -data['drive_point'].shift(-1).clip_lower(-2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data.loc[data.drive_point == 0, 'drive_point'] = data['next_drive_point']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pbp_data = pbp_data.merge(data[['drive_id','drive_point','drive_result']])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"exclude_playtype = ['Kickoff', 'End Period',\n", | |
" 'Kickoff Return (Offense)',\n", | |
" 'Kickoff Return Touchdown', 'End of Half', 'Defensive 2pt Conversion','Uncategorized', 'End of Game']\n", | |
"\n", | |
"game_end_drive = ['END OF HALF', 'END OF GAME', 'Uncategorized','END OF 4TH QUARTER', 'DOWNS TD','POSSESSION (FOR OT DRIVES)']\n", | |
"\n", | |
"regression_df = pbp_data[~(pbp_data.play_type.isin(exclude_playtype)) & (pbp_data.adjusted_yardline > 0)& (pbp_data.adjusted_yardline < 100) & ~(pbp_data.drive_result.isin(game_end_drive))].dropna()\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"A gradient boosting classifier from sklearn is used here for the expected points calculation." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"GradientBoostingClassifier(criterion='friedman_mse', init=None,\n", | |
" learning_rate=0.1, loss='deviance', max_depth=3,\n", | |
" max_features=None, max_leaf_nodes=None,\n", | |
" min_impurity_decrease=0.0, min_impurity_split=None,\n", | |
" min_samples_leaf=1, min_samples_split=2,\n", | |
" min_weight_fraction_leaf=0.0, n_estimators=200,\n", | |
" n_iter_no_change=None, presort='auto',\n", | |
" random_state=None, subsample=1.0, tol=0.0001,\n", | |
" validation_fraction=0.1, verbose=0,\n", | |
" warm_start=False)" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"clf = GradientBoostingClassifier(n_estimators = 200)\n", | |
"clf.fit(regression_df[['down','distance','adjusted_yardline']], regression_df.drive_point)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The EPA calculation below covers plays from scrimmage only." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"special_team_play_type = ['Kickoff','Punt','Kickoff Return (Offense)', 'Kickoff Return Touchdown','Field Goal Good', 'Field Goal Missed', 'Blocked Field Goal', 'Blocked Punt','Punt Return Touchdown','Blocked Punt Touchdown','Missed Field Goal Return','Uncategorized', 'Missed Field Goal Return Touchdown','Defensive 2pt Conversion']\n", | |
"timing_play_type = ['End Period','End of Game','Timeout','End of Half']\n", | |
"turnover_play_type = ['Fumble Recovery (Opponent)','Pass Interception Return','Interception Return Touchdown','Fumble Return Touchdown','Safety','Interception','Pass Interception']\n", | |
"regular_play_type = [ 'Rush', 'Sack', 'Pass Reception', 'Passing Touchdown','Pass Incompletion', 'Fumble Recovery (Own)','Rushing Touchdown','Pass Interception','Pass Completion']\n", | |
"off_TD = ['Passing Touchdown','Rushing Touchdown']\n", | |
"def_TD = ['Interception Return Touchdown','Fumble Return Touchdown']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regular_play = pbp_data[pbp_data.play_type.isin(regular_play_type) | pbp_data.play_type.isin(turnover_play_type) ]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The team abbreviations used in play_text were obtained by regex matching on the data. Here we just read the cleaned-up CSV and match the abbreviations to the offense and defense." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"CFB_teams_list = pd.read_csv('cfb_teams_list.csv',encoding='utf-8') " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regular_play = pd.merge(regular_play,CFB_teams_list,left_on=['offense'],right_on=['full_name'])\n", | |
"regular_play.rename(columns={'abbreviation':'off_abbr', 'full_name': 'off_full_name'}, inplace=True)\n", | |
"regular_play = pd.merge(regular_play,CFB_teams_list,left_on=['defense'],right_on=['full_name'])\n", | |
"regular_play.rename(columns={'abbreviation':'def_abbr', 'full_name': 'def_full_name'}, inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Expected point at the start of the play:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"EP_predict = clf.predict_proba(regular_play[['down','distance','adjusted_yardline']])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"EP = EP_predict[:,0]* -7 + EP_predict[:,1] * -3 + EP_predict[:,2] * -2 + EP_predict[:,4] * 2 + EP_predict[:,5] * 3 + EP_predict[:,6] * 7\n", | |
"regular_play['EP_start'] = EP" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Cleaning the data for expected point at the end of the play" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regular_play['new_yardline']= 0\n", | |
"regular_play['new_down']= 0\n", | |
"regular_play['new_distance']= 0\n", | |
"regular_play['turnover'] = 0" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Drop missing data and erroneous play type\n", | |
"regular_play = regular_play[~pd.isna(regular_play.play_text) & (regular_play.play_type != 'Interception')] " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regular_play.loc[regular_play.play_type.isin(turnover_play_type),'new_down'] = 1\n", | |
"regular_play.loc[regular_play.play_type.isin(turnover_play_type),'new_distance'] = 10\n", | |
"\n", | |
"regular_play.loc[regular_play.play_text.str.contains('1ST'), 'new_down'] = 1\n", | |
"regular_play.loc[regular_play.play_text.str.contains('1ST'), 'new_distance'] = 10\n", | |
"\n", | |
"regular_play.loc[~regular_play.play_type.isin(turnover_play_type) & ~regular_play.play_text.str.contains('1ST'), 'new_down'] = regular_play.down + 1\n", | |
"regular_play.loc[~regular_play.play_type.isin(turnover_play_type) & ~regular_play.play_text.str.contains('1ST'), 'new_distance'] = regular_play.distance - regular_play.yards_gained\n", | |
"\n", | |
"regular_play.loc[regular_play.play_text.str.contains('50 yard line'), 'new_yardline'] = 50\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regular_play.loc[regular_play.play_type == 'Fumble Recovery (Opponent)', 'new_yardline'] = 100- (regular_play.yard_line + regular_play.yards_gained) \n", | |
"regular_play.loc[regular_play.play_type == 'Fumble Recovery (Opponent)', 'new_down'] = 1\n", | |
"regular_play.loc[regular_play.play_type == 'Fumble Recovery (Opponent)', 'new_distance'] = 10\n", | |
"\n", | |
"regular_play.loc[regular_play.play_type == 'Sack', 'new_yardline'] = 100- (regular_play.yard_line - regular_play.yards_gained)\n", | |
"regular_play.loc[regular_play.play_type == 'Sack', 'new_down'] = regular_play.down + 1\n", | |
"regular_play.loc[regular_play.play_type == 'Sack', 'new_distance'] = regular_play.distance - regular_play.yards_gained" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"C:\\Users\\kayiu\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | |
"Try using .loc[row_indexer,col_indexer] = value instead\n", | |
"\n", | |
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", | |
" after removing the cwd from sys.path.\n", | |
"C:\\Users\\kayiu\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:8: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | |
"Try using .loc[row_indexer,col_indexer] = value instead\n", | |
"\n", | |
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", | |
" \n" | |
] | |
} | |
], | |
"source": [ | |
"#Collect end of play yardline information (e.g. Alab 38 = Alabama own 38) from play_text and match the team abbreviation\n", | |
"\n", | |
"temp_df = regular_play.iloc[np.char.find(regular_play.play_text.values.astype(str), regular_play.off_abbr.values.astype(str)) >= 0] \n", | |
"temp_df['split_string'] = [x[1] for x in list(np.char.split(temp_df.play_text.values.astype(str),sep =temp_df.off_abbr.values.astype(str)))]\n", | |
"regular_play.loc[temp_df[temp_df.play_text.str.contains('\\d+', regex=True)].index, 'new_yardline'] = 100-np.array(temp_df[temp_df.play_text.str.contains('\\d+', regex=True)].split_string.str.extract(r'(\\d+)').astype(float)).ravel()\n", | |
"\n", | |
"temp_df = regular_play.iloc[np.char.find(regular_play.play_text.values.astype(str), regular_play.def_abbr.values.astype(str)) >= 0]\n", | |
"temp_df['split_string'] = [x[1] for x in list(np.char.split(temp_df.play_text.values.astype(str),sep =temp_df.def_abbr.values.astype(str)))]\n", | |
"regular_play.loc[temp_df[temp_df.play_text.str.contains('\\d+', regex=True)].index, 'new_yardline'] = np.array(temp_df[temp_df.play_text.str.contains('\\d+', regex=True)].split_string.str.extract(r'(\\d+)').astype(float)).ravel()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regular_play.loc[pd.isna(regular_play.new_yardline),'new_distance'] = regular_play.distance - regular_play.yards_gained \n", | |
"regular_play.loc[pd.isna(regular_play.new_yardline),'new_yardline'] = regular_play.adjusted_yardline - regular_play.yards_gained\n", | |
"\n", | |
"regular_play.loc[regular_play.play_type == 'Pass Incompletion', 'new_yardline'] = regular_play.adjusted_yardline\n", | |
"\n", | |
"regular_play.loc[regular_play.play_text.str.contains('touchback'), 'new_yardline'] = 80\n", | |
"regular_play.loc[regular_play.play_text.str.contains('touchback'), 'new_down'] = 1\n", | |
"\n", | |
"#Fake data for model prediction, EP will be changed after processing the data\n", | |
"\n", | |
"regular_play.loc[regular_play.play_type == 'Safety', 'new_yardline'] = 99 #Fake yardline for Safety\n", | |
"\n", | |
"regular_play.loc[regular_play.play_type.isin(off_TD),'new_down'] = 1 #Fake new down for Offensive tocuhdown play\n", | |
"regular_play.loc[regular_play.play_type.isin(off_TD),'new_distance'] = 10 #Fake new yards to go for Offensive tocuhdown play\n", | |
"\n", | |
"regular_play.loc[(regular_play.play_type.isin(off_TD) | regular_play.play_type.isin(def_TD)),'new_yardline'] = 99 #Fake yardline for Offensive tocuhdown play\n", | |
"\n", | |
"regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'turnover'] = 1 #Turnover on down\n", | |
"regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'new_down'] = 1 \n", | |
"regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'new_distance'] = 10\n", | |
"regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'new_yardline'] = 100-regular_play.new_yardline\n", | |
"\n", | |
"\n", | |
"regular_play.loc[((regular_play.new_yardline <= 0) |(regular_play.new_distance <= 0)) & (regular_play.play_type == 'Sack') & (regular_play.play_text.str.contains('return')), 'new_down' ] = 1 #Strip sack\n", | |
"regular_play.loc[((regular_play.new_yardline <= 0) |(regular_play.new_distance <= 0)) & (regular_play.play_type == 'Sack') & (regular_play.play_text.str.contains('return')), 'new_distance' ] = 10 \n", | |
"regular_play.loc[((regular_play.new_yardline <= 0) |(regular_play.new_distance <= 0)) & (regular_play.play_text.str.contains('return')), 'new_yardline' ] = 100-(regular_play.adjusted_yardline - regular_play.yards_gained)\n", | |
"regular_play.loc[ regular_play.play_text.str.contains('return'), 'turnover' ] = 1 \n", | |
"\n", | |
"regular_play.loc[regular_play.new_distance <= 0, 'new_down'] = 1 #First down not in API\n", | |
"regular_play.loc[regular_play.new_distance <= 0, 'new_distance'] = 10" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regular_play.loc[regular_play.new_yardline <= 0 ,'new_yardline'] = regular_play.adjusted_yardline - regular_play.yards_gained" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regular_play.loc[regular_play.play_text.str.contains('TOUCHDOWN'),'new_yardline'] = 99" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Throw away some plays with error" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regular_play = regular_play[(regular_play.new_yardline > 0) & (regular_play.new_yardline < 100) & (regular_play.adjusted_yardline > 0) & (regular_play.adjusted_yardline < 100)]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Calculate the expected points at the end of each play. Since the classifier was fit on columns named down, distance and adjusted_yardline, we first extract the end-of-play features and rename them to match." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"out_df = pd.DataFrame({'down':regular_play['new_down'],'distance':regular_play['new_distance'],'adjusted_yardline':regular_play['new_yardline']})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"EP_predict = clf.predict_proba(out_df[['down','distance','adjusted_yardline']])\n", | |
"EP = EP_predict[:,0]* -7 + EP_predict[:,1] * -3 + EP_predict[:,2] * -2 + EP_predict[:,4] * 2 + EP_predict[:,5] * 3 + EP_predict[:,6] * 7\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regular_play['EP_end'] = EP" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Finally, set the expected points at the end of touchdown and safety plays to 7 and -2 respectively, and flip the sign for turnover plays." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regular_play.loc[(regular_play.play_type.isin(off_TD) | regular_play.play_type.isin(def_TD) | regular_play.play_text.str.contains('TOUCHDOWN')),'EP_end'] = 7" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regular_play.loc[(regular_play.play_type.isin(turnover_play_type)| regular_play.turnover == 1),'EP_end'] *= -1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regular_play.loc[regular_play.play_type == 'Safety','EP_end'] = -2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regular_play['EPA'] = regular_play['EP_end'] - regular_play['EP_start']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pass_play_type = ['Sack','Pass Incompletion','Pass Interception Return','Pass Reception','Interception Return Touchdown','Passing Touchdown','Pass Completion','Pass Interception']\n", | |
"rush_play_type = ['Fumble Recovery (Opponent)','Fumble Recovery (Own)','Fumble Return Touchdown','Rush','Rushing Touchdown']" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Check EPA by play type" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.03542604861003451" | |
] | |
}, | |
"execution_count": 39, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"regular_play[regular_play.play_type.isin(pass_play_type)]['EPA'].mean()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"-0.02507176193772802" | |
] | |
}, | |
"execution_count": 40, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"regular_play[regular_play.play_type.isin(rush_play_type)]['EPA'].mean()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"play_type\n", | |
"Fumble Recovery (Opponent) -4.600877\n", | |
"Fumble Recovery (Own) -0.935820\n", | |
"Fumble Return Touchdown -7.319282\n", | |
"Interception Return Touchdown -7.766629\n", | |
"Pass Incompletion -1.104547\n", | |
"Pass Interception Return -3.544499\n", | |
"Pass Reception 0.912590\n", | |
"Passing Touchdown 3.536264\n", | |
"Rush -0.071102\n", | |
"Rushing Touchdown 2.217327\n", | |
"Sack -1.874424\n", | |
"Safety -0.429088\n", | |
"Name: EPA, dtype: float64" | |
] | |
}, | |
"execution_count": 41, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"regular_play.groupby('play_type')['EPA'].mean()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"regular_play.to_csv('CFB_regular_play_18.csv')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment