903124 · December 18, 2023 19:21
diff --git a/cfb_teams_list.csv b/cfb_teams_list.csv
diff --git a/EPA_CFB.ipynb b/EPA_CFB.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import re\n",
    "import requests\n",
    "import time\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "import joblib\n",
    "pd.options.display.max_columns = 999"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook walks through the calculation of expected points added (EPA) for the 2018 college football season. \n",
    "Data is collected from https://collegefootballdata.com/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "drive_data = pd.DataFrame(requests.get('https://api.collegefootballdata.com/drives?seasonType=regular&year=2018').json())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "game_data = pd.DataFrame(requests.get('https://api.collegefootballdata.com/games?year=2018&seasonType=regular').json())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "game_data['game_id'] = game_data['id']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.merge(drive_data,game_data,on='game_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "data['drive_id'] = data['id_x']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download play-by-play data one week at a time (weeks 1-15), pausing 2 seconds\n",
    "# between requests, then concatenate once. DataFrame.append was removed in\n",
    "# pandas 2.0 and re-copied the accumulated frame on every iteration.\n",
    "weekly_plays = []\n",
    "for week in range(1, 16):\n",
    "    response = requests.get('https://api.collegefootballdata.com/plays?seasonType=regular&year=2018&week=' + str(week))\n",
    "    response.raise_for_status()  # stop on an HTTP error instead of parsing an error payload\n",
    "    weekly_plays.append(pd.DataFrame(response.json()))\n",
    "    time.sleep(2)\n",
    "play_data = pd.concat(weekly_plays)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "pbp_data = pd.merge(play_data,data[['home_team','drive_id']],how='left',on='drive_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# yard_line is defined by home team in API: keep it when the home team is on\n",
    "# defense (coef == 1) and mirror it to 100 - yard_line otherwise (coef == 0).\n",
    "home_on_defense = pbp_data['home_team'] == pbp_data['defense']\n",
    "pbp_data['coef'] = home_on_defense.astype(int)\n",
    "pbp_data['adjusted_yardline'] = np.where(home_on_defense, pbp_data['yard_line'], 100 - pbp_data['yard_line'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We first estimate the expected points of a play with a classification model (the gradient boosting classifier fitted below). \n",
    "\n",
    "The target variable is the points scored on the drive for scoring drives (e.g. touchdown, field goal, safety, defensive TD); for non-scoring drives (e.g. punt, missed FG) it is the points scored on the opponent's next drive, with the sign reversed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Points credited to the offense for each drive result. Any result not listed\n",
    "# (and a missing result) scores 0 here.\n",
    "# NOTE(review): 'PUNT TD' is scored +7 because the offensive-touchdown check ran\n",
    "# first in the original conditional; its second listing among the -7 results was\n",
    "# unreachable. Confirm which sign the API intends for that result.\n",
    "DRIVE_RESULT_POINTS = {\n",
    "    'TD': 7, 'PUNT TD': 7, 'RUSHING TD': 7, 'PASSING TD': 7,\n",
    "    'FG': 3, 'FG GOOD': 3,\n",
    "    'SF': -2,\n",
    "    'PUNT RETURN TD': -7, 'MISSED FG TD': -7, 'INT TD': -7, 'FUMBLE RETURN TD': -7,\n",
    "    'DOWNS TD': -7, 'INT RETURN TOUCH': -7, 'FG MISSED TD': -7, 'TURNOVER ON DOWNS TD': -7,\n",
    "}\n",
    "\n",
    "data['drive_point'] = data.drive_result.map(DRIVE_RESULT_POINTS).fillna(0).astype('int64')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Value of the following drive seen from the current offense: take the next row's\n",
    "# drive_point, floor it at -2, then flip the sign. Series.clip_lower was removed in\n",
    "# pandas 1.0; clip(lower=...) is the equivalent call.\n",
    "# NOTE(review): shift(-1) runs over the whole frame, so the last drive of one game\n",
    "# is paired with the first drive of the next game in row order.\n",
    "data['next_drive_point'] = -data['drive_point'].shift(-1).clip(lower=-2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[data.drive_point == 0, 'drive_point'] = data['next_drive_point']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "pbp_data = pbp_data.merge(data[['drive_id','drive_point','drive_result']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "exclude_playtype = ['Kickoff',  'End Period',\n",
    "        'Kickoff Return (Offense)',\n",
    "       'Kickoff Return Touchdown', 'End of Half', 'Defensive 2pt Conversion','Uncategorized', 'End of Game']\n",
    "\n",
    "game_end_drive = ['END OF HALF', 'END OF GAME', 'Uncategorized','END OF 4TH QUARTER', 'DOWNS TD','POSSESSION (FOR OT DRIVES)']\n",
    "\n",
    "regression_df = pbp_data[~(pbp_data.play_type.isin(exclude_playtype)) & (pbp_data.adjusted_yardline > 0)& (pbp_data.adjusted_yardline < 100) & ~(pbp_data.drive_result.isin(game_end_drive))].dropna()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Gradient boosting classifier from sklearn is used here for expected point calculation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
       "                           learning_rate=0.1, loss='deviance', max_depth=3,\n",
       "                           max_features=None, max_leaf_nodes=None,\n",
       "                           min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                           min_samples_leaf=1, min_samples_split=2,\n",
       "                           min_weight_fraction_leaf=0.0, n_estimators=200,\n",
       "                           n_iter_no_change=None, presort='auto',\n",
       "                           random_state=None, subsample=1.0, tol=0.0001,\n",
       "                           validation_fraction=0.1, verbose=0,\n",
       "                           warm_start=False)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf = GradientBoostingClassifier(n_estimators = 200)\n",
    "clf.fit(regression_df[['down','distance','adjusted_yardline']], regression_df.drive_point)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The EPA calculation below covers plays from scrimmage only."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "special_team_play_type = ['Kickoff','Punt','Kickoff Return (Offense)', 'Kickoff Return Touchdown','Field Goal Good', 'Field Goal Missed', 'Blocked Field Goal', 'Blocked Punt','Punt Return Touchdown','Blocked Punt Touchdown','Missed Field Goal Return','Uncategorized', 'Missed Field Goal Return Touchdown','Defensive 2pt Conversion']\n",
    "timing_play_type = ['End Period','End of Game','Timeout','End of Half']\n",
    "turnover_play_type = ['Fumble Recovery (Opponent)','Pass Interception Return','Interception Return Touchdown','Fumble Return Touchdown','Safety','Interception','Pass Interception']\n",
    "regular_play_type = [ 'Rush', 'Sack', 'Pass Reception', 'Passing Touchdown','Pass Incompletion', 'Fumble Recovery (Own)','Rushing Touchdown','Pass Interception','Pass Completion']\n",
    "off_TD = ['Passing Touchdown','Rushing Touchdown']\n",
    "def_TD = ['Interception Return Touchdown','Fumble Return Touchdown']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play = pbp_data[pbp_data.play_type.isin(regular_play_type) | pbp_data.play_type.isin(turnover_play_type) ]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The team abbreviations that appear in play_text were obtained by regex matching on the data. Here we simply read the cleaned-up CSV and match the abbreviations to the offense and defense."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "CFB_teams_list = pd.read_csv('cfb_teams_list.csv',encoding='utf-8') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play = pd.merge(regular_play,CFB_teams_list,left_on=['offense'],right_on=['full_name'])\n",
    "regular_play.rename(columns={'abbreviation':'off_abbr', 'full_name': 'off_full_name'}, inplace=True)\n",
    "regular_play = pd.merge(regular_play,CFB_teams_list,left_on=['defense'],right_on=['full_name'])\n",
    "regular_play.rename(columns={'abbreviation':'def_abbr', 'full_name': 'def_full_name'}, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Expected point at the start of the play:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "EP_predict = clf.predict_proba(regular_play[['down','distance','adjusted_yardline']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Expected points = sum over outcome classes of P(class) * class value.\n",
    "# predict_proba orders its columns like clf.classes_, so weighting by classes_\n",
    "# avoids hard-coded column positions, which are only right when all seven outcome\n",
    "# values (-7, -3, -2, 0, 2, 3, 7) occur in the training labels.\n",
    "EP = EP_predict.dot(clf.classes_)\n",
    "regular_play['EP_start'] = EP"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Cleaning the data for expected point at the end of the play"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play['new_yardline']= 0\n",
    "regular_play['new_down']= 0\n",
    "regular_play['new_distance']= 0\n",
    "regular_play['turnover'] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Drop missing data and erroneous play type\n",
    "regular_play = regular_play[~pd.isna(regular_play.play_text) & (regular_play.play_type != 'Interception')] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.loc[regular_play.play_type.isin(turnover_play_type),'new_down'] = 1\n",
    "regular_play.loc[regular_play.play_type.isin(turnover_play_type),'new_distance'] = 10\n",
    "\n",
    "regular_play.loc[regular_play.play_text.str.contains('1ST'), 'new_down'] = 1\n",
    "regular_play.loc[regular_play.play_text.str.contains('1ST'), 'new_distance'] = 10\n",
    "\n",
    "regular_play.loc[~regular_play.play_type.isin(turnover_play_type) & ~regular_play.play_text.str.contains('1ST'), 'new_down'] = regular_play.down + 1\n",
    "regular_play.loc[~regular_play.play_type.isin(turnover_play_type) & ~regular_play.play_text.str.contains('1ST'), 'new_distance'] = regular_play.distance - regular_play.yards_gained\n",
    "\n",
    "regular_play.loc[regular_play.play_text.str.contains('50 yard line'), 'new_yardline'] = 50\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.loc[regular_play.play_type == 'Fumble Recovery (Opponent)', 'new_yardline'] = 100- (regular_play.yard_line + regular_play.yards_gained) \n",
    "regular_play.loc[regular_play.play_type == 'Fumble Recovery (Opponent)', 'new_down'] = 1\n",
    "regular_play.loc[regular_play.play_type == 'Fumble Recovery (Opponent)', 'new_distance'] = 10\n",
    "\n",
    "regular_play.loc[regular_play.play_type == 'Sack', 'new_yardline'] = 100- (regular_play.yard_line - regular_play.yards_gained)\n",
    "regular_play.loc[regular_play.play_type == 'Sack', 'new_down'] = regular_play.down + 1\n",
    "regular_play.loc[regular_play.play_type == 'Sack', 'new_distance'] = regular_play.distance - regular_play.yards_gained"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Collect end of play yardline information (e.g. Alab 38 = Alabama own 38) from play_text and match the team abbreviation\n",
    "\n",
    "# Rows whose play_text contains the offense abbreviation: the first number N found after it is stored as 100 - N.\n",
    "# .copy() makes temp_df an independent frame, so adding split_string no longer raises SettingWithCopyWarning.\n",
    "temp_df = regular_play.iloc[np.char.find(regular_play.play_text.values.astype(str), regular_play.off_abbr.values.astype(str)) >= 0].copy()\n",
    "temp_df['split_string'] =  [x[1] for x in list(np.char.split(temp_df.play_text.values.astype(str),sep =temp_df.off_abbr.values.astype(str)))]\n",
    "temp_df = temp_df[temp_df.play_text.str.contains(r'\\d+', regex=True)]\n",
    "regular_play.loc[temp_df.index, 'new_yardline'] = 100-np.array(temp_df.split_string.str.extract(r'(\\d+)').astype(float)).ravel()\n",
    "\n",
    "# Rows whose play_text contains the defense abbreviation: the first number found after it is stored unchanged.\n",
    "temp_df = regular_play.iloc[np.char.find(regular_play.play_text.values.astype(str), regular_play.def_abbr.values.astype(str)) >= 0].copy()\n",
    "temp_df['split_string'] =  [x[1] for x in list(np.char.split(temp_df.play_text.values.astype(str),sep =temp_df.def_abbr.values.astype(str)))]\n",
    "temp_df = temp_df[temp_df.play_text.str.contains(r'\\d+', regex=True)]\n",
    "regular_play.loc[temp_df.index, 'new_yardline'] = np.array(temp_df.split_string.str.extract(r'(\\d+)').astype(float)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.loc[pd.isna(regular_play.new_yardline),'new_distance'] = regular_play.distance - regular_play.yards_gained \n",
    "regular_play.loc[pd.isna(regular_play.new_yardline),'new_yardline'] = regular_play.adjusted_yardline - regular_play.yards_gained\n",
    "\n",
    "regular_play.loc[regular_play.play_type == 'Pass Incompletion', 'new_yardline'] = regular_play.adjusted_yardline\n",
    "\n",
    "regular_play.loc[regular_play.play_text.str.contains('touchback'), 'new_yardline'] = 80\n",
    "regular_play.loc[regular_play.play_text.str.contains('touchback'), 'new_down'] = 1\n",
    "\n",
    "#Fake data for model prediction, EP will be changed after processing the data\n",
    "\n",
    "regular_play.loc[regular_play.play_type == 'Safety', 'new_yardline'] = 99 #Fake yardline for Safety\n",
    "\n",
    "regular_play.loc[regular_play.play_type.isin(off_TD),'new_down'] = 1 #Fake new down for Offensive tocuhdown play\n",
    "regular_play.loc[regular_play.play_type.isin(off_TD),'new_distance']  = 10 #Fake new yards to go for Offensive tocuhdown play\n",
    "\n",
    "regular_play.loc[(regular_play.play_type.isin(off_TD) | regular_play.play_type.isin(def_TD)),'new_yardline'] = 99  #Fake yardline for Offensive tocuhdown play\n",
    "\n",
    "regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'turnover'] = 1 #Turnover on down\n",
    "regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'new_down'] = 1 \n",
    "regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'new_distance'] = 10\n",
    "regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'new_yardline'] = 100-regular_play.new_yardline\n",
    "\n",
    "\n",
    "regular_play.loc[((regular_play.new_yardline <= 0) |(regular_play.new_distance <= 0))  & (regular_play.play_type == 'Sack') & (regular_play.play_text.str.contains('return')), 'new_down' ] = 1 #Strip sack\n",
    "regular_play.loc[((regular_play.new_yardline <= 0) |(regular_play.new_distance <= 0)) & (regular_play.play_type == 'Sack') & (regular_play.play_text.str.contains('return')), 'new_distance' ] = 10 \n",
    "regular_play.loc[((regular_play.new_yardline <= 0) |(regular_play.new_distance <= 0)) & (regular_play.play_text.str.contains('return')), 'new_yardline' ] = 100-(regular_play.adjusted_yardline - regular_play.yards_gained)\n",
    "regular_play.loc[ regular_play.play_text.str.contains('return'), 'turnover' ] = 1 \n",
    "\n",
    "regular_play.loc[regular_play.new_distance <= 0, 'new_down'] = 1 #First down not in API\n",
    "regular_play.loc[regular_play.new_distance <= 0, 'new_distance'] = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.loc[regular_play.new_yardline <= 0 ,'new_yardline'] = regular_play.adjusted_yardline - regular_play.yards_gained"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.loc[regular_play.play_text.str.contains('TOUCHDOWN'),'new_yardline'] = 99"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Throw away plays whose start or end yardline is not strictly between 0 and 100, which indicates an error in the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play = regular_play[(regular_play.new_yardline > 0) & (regular_play.new_yardline < 100) & (regular_play.adjusted_yardline > 0) & (regular_play.adjusted_yardline < 100)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calculate the expected points at the end of the play. The end-of-play columns are first extracted and renamed to the feature names the classifier was fitted on (down, distance, adjusted_yardline)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "out_df = pd.DataFrame({'down':regular_play['new_down'],'distance':regular_play['new_distance'],'adjusted_yardline':regular_play['new_yardline']})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Expected points at the end of the play = sum over outcome classes of P(class) * class value.\n",
    "# predict_proba orders its columns like clf.classes_, so weighting by classes_\n",
    "# avoids hard-coded column positions, which are only right when all seven outcome\n",
    "# values (-7, -3, -2, 0, 2, 3, 7) occur in the training labels.\n",
    "EP_predict = clf.predict_proba(out_df[['down','distance','adjusted_yardline']])\n",
    "EP = EP_predict.dot(clf.classes_)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play['EP_end'] = EP"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, set the expected points at the end of touchdown and safety plays to 7 and -2 respectively, and reverse the sign for turnover plays"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.loc[(regular_play.play_type.isin(off_TD) | regular_play.play_type.isin(def_TD) | regular_play.play_text.str.contains('TOUCHDOWN')),'EP_end'] = 7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flip the sign of EP_end for turnover plays (by play type or by the turnover flag).\n",
    "# The comparison is parenthesised because | binds tighter than ==: the original\n",
    "# expression was evaluated as (isin(...) | turnover) == 1, which only gave the\n",
    "# intended rows because turnover holds 0/1.\n",
    "is_turnover = regular_play.play_type.isin(turnover_play_type) | (regular_play.turnover == 1)\n",
    "regular_play.loc[is_turnover, 'EP_end'] *= -1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.loc[regular_play.play_type == 'Safety','EP_end'] = -2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play['EPA'] = regular_play['EP_end'] - regular_play['EP_start']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "pass_play_type = ['Sack','Pass Incompletion','Pass Interception Return','Pass Reception','Interception Return Touchdown','Passing Touchdown','Pass Completion','Pass Interception']\n",
    "rush_play_type = ['Fumble Recovery (Opponent)','Fumble Recovery (Own)','Fumble Return Touchdown','Rush','Rushing Touchdown']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check EPA by play type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.03542604861003451"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regular_play[regular_play.play_type.isin(pass_play_type)]['EPA'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-0.02507176193772802"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regular_play[regular_play.play_type.isin(rush_play_type)]['EPA'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "play_type\n",
       "Fumble Recovery (Opponent)      -4.600877\n",
       "Fumble Recovery (Own)           -0.935820\n",
       "Fumble Return Touchdown         -7.319282\n",
       "Interception Return Touchdown   -7.766629\n",
       "Pass Incompletion               -1.104547\n",
       "Pass Interception Return        -3.544499\n",
       "Pass Reception                   0.912590\n",
       "Passing Touchdown                3.536264\n",
       "Rush                            -0.071102\n",
       "Rushing Touchdown                2.217327\n",
       "Sack                            -1.874424\n",
       "Safety                          -0.429088\n",
       "Name: EPA, dtype: float64"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regular_play.groupby('play_type')['EPA'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.to_csv('CFB_regular_play_18.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	full_name	abbreviation
	Abilene Christian	AblCh
	Air Force	AFA
	Akron	Akron
	Alabama	Alab
	Alabama A&M	AlaAM
	Albany	Alban
	Alcorn State	AlcSt
	Appalachian State	AppSt
	Arizona	Ariz
	Arizona State	ArzSt
	Arkansas	Ark
	Arkansas State	ArkSt
	Arkansas-Pine Bluff	ArkPB
	Army	Army
	Auburn	Aub
	Austin Peay	APeay
	BYU	BYU
	Ball State	BalSt
	Baylor	Bayl
	Bethune-Cookman	BthCk
	Boise State	BoiSt
	Boston College	BC
	Bowling Green	BwGrn
	Buffalo	Buff
	California	Cal
	Campbell	Camp
	Central Arkansas	CArk
	Central Connecticut	CConn
	Central Michigan	CMich
	Charleston Southern	ChaSo
	Charlotte	Charl
	Cincinnati	Cincy
	Clemson	Clem
	Coastal Carolina	CCaro
	Colorado	Colo
	Colorado State	ColSt
	Connecticut	UConn
	Delaware State	DelSt
	Drake	Drake
	Duke	Duke
	Duquesne	Duqsn
	East Carolina	ECaro
	Eastern Illinois	EIlln
	Eastern Kentucky	EKent
	Eastern Michigan	EMich
	Eastern Washington	EWash
	Elon	ElonU
	Florida	Fla
	Florida Atlantic	FlAtl
	Florida International	FlaIn
	Florida State	FlaSt
	Fordham	Fordh
	Fresno State	FrsSt
	Gardner-Webb	GrdWb
	Georgia	Geo
	Georgia Southern	GeoSo
	Georgia State	GeoSt
	Georgia Tech	GTech
	Grambling	Gramb
	Hawai'i	Hawa
	Houston	Houst
	Houston Baptist	HstnB
	Howard	Howrd
	Idaho	Idaho
	Idaho State	IdaSt
	Illinois	Illin
	Illinois State	IllSt
	Indiana	Ind
	Iowa	Iowa
	Iowa State	IowSt
	Jackson State	JckSt
	James Madison	JMads
	Kansas	Kans
	Kansas State	KanSt
	Kennesaw State	Ksaw
	Kent State	KntSt
	Kentucky	Kent
	LSU	LSU
	Lafayette	Lafay
	Lehigh	Lehgh
	Liberty	Liber
	Louisiana	LaLaf
	Louisiana Monroe	LaMon
	Louisiana Tech	LaTch
	Louisville	Lvile
	Maine	Maine
	Marshall	Marsh
	Maryland	Mary
	McNeese	McNSt
	Memphis	Memph
	Mercer	Mercr
	Miami	MiaFl
	Miami (OH)	MiaOh
	Michigan	Mich
	Michigan State	MchSt
	Middle Tennessee	MTnSt
	Minnesota	Minn
	Mississippi State	MisSt
	Missouri	Misso
	Missouri State	MoSt
	Monmouth	MonNJ
	Murray State	MurrS
	NC State	NCSt
	Navy	Navy
	Nebraska	Neb
	Nevada	Nevad
	New Hampshire	NHamp
	New Mexico	NMex
	New Mexico State	NMxSt
	Nicholls	NicSt
	Norfolk State	NflkS
	North Carolina	NCaro
	North Carolina A&T	NCAT
	North Carolina Central	NCCtl
	North Texas	NoTex
	Northern Arizona	NoArz
	Northern Illinois	NoIll
	Northern Iowa	NIowa
	Northwestern	Nwest
	Northwestern State	NWSt
	Notre Dame	NDame
	Ohio	Ohio
	Ohio State	OhSt
	Oklahoma	Okla
	Oklahoma State	OKSt
	Old Dominion	ODU
	Ole Miss	Miss
	Oregon	Oregn
	Oregon State	OrgSt
	Penn State	PnSt
	Pittsburgh	Pitt
	Portland State	PrtSt
	Prairie View	PraVw
	Purdue	Prdue
	Rhode Island	RIsld
	Rice	Rice
	Richmond	Richm
	Rutgers	Rutgr
	SMU	SMU
	San Diego State	SDSt
	San José State	SJSt
	Savannah State	SavSt
	South Alabama	SAlab
	South Carolina	SCaro
	South Carolina State	SCSt
	South Dakota	SDako
	South Florida	SFla
	Southeast Missouri State	SEMo
	Southeastern Louisiana	SELa
	Southern	Sthrn
	Southern Mississippi	SoMis
	Southern Utah	SoUth
	Stanford	Stanf
	Stephen F. Austin	SFAus
	Stony Brook	StBrk
	Syracuse	Syrac
	TCU	TCU
	Temple	Temp
	Tennessee	Tenn
	Tennessee State	TenSt
	Tennessee Tech	TnTch
	Texas	Texas
	Texas A&M	TexAM
	Texas Southern	TexSo
	Texas State	TexSt
	Texas Tech	TexTc
	Toledo	Toled
	Troy	Troy
	Tulane	Tulan
	Tulsa	Tulsa
	UAB	UAB
	UC Davis	UCDav
	UCF	UCF
	UCLA	UCLA
	UMass	UMass
	UNLV	UNLV
	USC	USC
	UT Martin	TnMar
	UT San Antonio	TexSA
	UTEP	UTEP
	Utah	Utah
	Utah State	UthSt
	VMI	VMI
	Vanderbilt	Vandy
	Villanova	Villa
	Virginia	Virg
	Virginia Tech	VTech
	Wake Forest	WFrst
	Washington	Wash
	Washington State	WshSt
	Weber State	WebSt
	West Virginia	WVirg
	Western Kentucky	WKent
	Western Michigan	WMich
	Wisconsin	Wisc
	Wofford	Woffd
	Wyoming	Wyom
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"import matplotlib.pyplot as plt\n",
	"import re\n",
	"import requests\n",
	"import time\n",
	"from sklearn.ensemble import GradientBoostingClassifier\n",
	"import joblib\n",
	"pd.options.display.max_columns = 999"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Here is the process of calculation expected points added (EPA) of College football in 2018 season. \n",
	"Data is collected from https://collegefootballdata.com/"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"drive_data = pd.DataFrame(requests.get('https://api.collegefootballdata.com/drives?seasonType=regular&year=2018').json())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"game_data = pd.DataFrame(requests.get('https://api.collegefootballdata.com/games?year=2018&seasonType=regular').json())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"game_data['game_id'] = game_data['id']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"data = pd.merge(drive_data,game_data,on='game_id')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"data['drive_id'] = data['id_x']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Download play-by-play data one week at a time (weeks 1-15), pausing 2 seconds\n",
	"# between requests, then concatenate once. DataFrame.append was removed in\n",
	"# pandas 2.0 and re-copied the accumulated frame on every iteration.\n",
	"weekly_plays = []\n",
	"for week in range(1, 16):\n",
	"    response = requests.get('https://api.collegefootballdata.com/plays?seasonType=regular&year=2018&week=' + str(week))\n",
	"    response.raise_for_status()  # stop on an HTTP error instead of parsing an error payload\n",
	"    weekly_plays.append(pd.DataFrame(response.json()))\n",
	"    time.sleep(2)\n",
	"play_data = pd.concat(weekly_plays)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"pbp_data = pd.merge(play_data,data[['home_team','drive_id']],how='left',on='drive_id')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"# coef is 1 when the home team is on defense and 0 otherwise; the formula keeps\n",
	"# yard_line when coef == 1 and gives 100 - yard_line when coef == 0.\n",
	"# The multiplication operators were missing here (100(...) and 2pbp_data[...]).\n",
	"pbp_data['coef'] = (pbp_data['home_team'] == pbp_data['defense']).astype(int)\n",
	"pbp_data['adjusted_yardline'] = 100*(1-pbp_data['coef']) + (2*pbp_data['coef']-1)*pbp_data['yard_line'] #yard_line is defined by home team in API"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"We first estimate the expected points of a play with a classification model (the gradient boosting classifier fitted below). \n",
	"\n",
	"The target variable is the points scored on the drive for scoring drives (e.g. touchdown, field goal, safety, defensive TD); for non-scoring drives (e.g. punt, missed FG) it is the points scored on the opponent's next drive, with the sign reversed."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [],
	"source": [
	"data['drive_point'] = data.drive_result.apply(lambda x: 7 if (x == 'TD' or x == 'PUNT TD' or x == 'RUSHING TD' or x == 'PASSING TD') else (3 if (x == 'FG' or x == 'FG GOOD') else (-2 if x == 'SF' else -7 if ( x == 'PUNT RETURN TD' or x == 'MISSED FG TD' or x == 'INT TD' or x == 'FUMBLE RETURN TD' or x == 'DOWNS TD' or x == 'INT RETURN TOUCH' or x == 'FG MISSED TD' or x =='PUNT TD' or x == 'TURNOVER ON DOWNS TD') else 0 )))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Value of the following drive seen from the current offense: take the next row's\n",
	"# drive_point, floor it at -2, then flip the sign. Series.clip_lower was removed in\n",
	"# pandas 1.0; clip(lower=...) is the equivalent call.\n",
	"# NOTE(review): shift(-1) runs over the whole frame, so the last drive of one game\n",
	"# is paired with the first drive of the next game in row order.\n",
	"data['next_drive_point'] = -data['drive_point'].shift(-1).clip(lower=-2)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [],
	"source": [
	"data.loc[data.drive_point == 0, 'drive_point'] = data['next_drive_point']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [],
	"source": [
	"pbp_data = pbp_data.merge(data[['drive_id','drive_point','drive_result']])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [],
	"source": [
	"exclude_playtype = ['Kickoff', 'End Period',\n",
	" 'Kickoff Return (Offense)',\n",
	" 'Kickoff Return Touchdown', 'End of Half', 'Defensive 2pt Conversion','Uncategorized', 'End of Game']\n",
	"\n",
	"game_end_drive = ['END OF HALF', 'END OF GAME', 'Uncategorized','END OF 4TH QUARTER', 'DOWNS TD','POSSESSION (FOR OT DRIVES)']\n",
	"\n",
	"regression_df = pbp_data[~(pbp_data.play_type.isin(exclude_playtype)) & (pbp_data.adjusted_yardline > 0)& (pbp_data.adjusted_yardline < 100) & ~(pbp_data.drive_result.isin(game_end_drive))].dropna()\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Gradient boosting classifier from sklearn is used here for expected point calculation"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
	" learning_rate=0.1, loss='deviance', max_depth=3,\n",
	" max_features=None, max_leaf_nodes=None,\n",
	" min_impurity_decrease=0.0, min_impurity_split=None,\n",
	" min_samples_leaf=1, min_samples_split=2,\n",
	" min_weight_fraction_leaf=0.0, n_estimators=200,\n",
	" n_iter_no_change=None, presort='auto',\n",
	" random_state=None, subsample=1.0, tol=0.0001,\n",
	" validation_fraction=0.1, verbose=0,\n",
	" warm_start=False)"
	]
	},
	"execution_count": 15,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"clf = GradientBoostingClassifier(n_estimators = 200)\n",
	"clf.fit(regression_df[['down','distance','adjusted_yardline']], regression_df.drive_point)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"The EPA calculation below covers plays from scrimmage only."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Play-type groups used by the EPA cells below.\n",
	"special_team_play_type = [\n",
	"    'Kickoff', 'Punt', 'Kickoff Return (Offense)', 'Kickoff Return Touchdown',\n",
	"    'Field Goal Good', 'Field Goal Missed', 'Blocked Field Goal', 'Blocked Punt',\n",
	"    'Punt Return Touchdown', 'Blocked Punt Touchdown', 'Missed Field Goal Return',\n",
	"    'Uncategorized', 'Missed Field Goal Return Touchdown', 'Defensive 2pt Conversion',\n",
	"]\n",
	"timing_play_type = ['End Period', 'End of Game', 'Timeout', 'End of Half']\n",
	"turnover_play_type = [\n",
	"    'Fumble Recovery (Opponent)', 'Pass Interception Return', 'Interception Return Touchdown',\n",
	"    'Fumble Return Touchdown', 'Safety', 'Interception', 'Pass Interception',\n",
	"]\n",
	"regular_play_type = [\n",
	"    'Rush', 'Sack', 'Pass Reception', 'Passing Touchdown', 'Pass Incompletion',\n",
	"    'Fumble Recovery (Own)', 'Rushing Touchdown', 'Pass Interception', 'Pass Completion',\n",
	"]\n",
	"# Touchdown play types, split by which side scored.\n",
	"off_TD = ['Passing Touchdown', 'Rushing Touchdown']\n",
	"def_TD = ['Interception Return Touchdown', 'Fumble Return Touchdown']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Keep the plays whose type is in either the regular or the turnover group.\n",
	"scrimmage_play_type = regular_play_type + turnover_play_type\n",
	"regular_play = pbp_data[pbp_data.play_type.isin(scrimmage_play_type)]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Team abbreviations in play_text were obtained by regex matching on the data. Here we just read the cleaned-up CSV and match the abbreviations to the offense and defense."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Team lookup table; the next cell uses its 'full_name' and 'abbreviation' columns.\n",
	"teams_csv = 'cfb_teams_list.csv'\n",
	"CFB_teams_list = pd.read_csv(teams_csv, encoding='utf-8')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Attach the abbreviation of the offense, then of the defense, by matching team full names.\n",
	"# Both merges are inner joins, so plays whose team is missing from the lookup are dropped.\n",
	"regular_play = (\n",
	"    pd.merge(regular_play, CFB_teams_list, left_on=['offense'], right_on=['full_name'])\n",
	"    .rename(columns={'abbreviation':'off_abbr', 'full_name': 'off_full_name'})\n",
	"    .merge(CFB_teams_list, left_on=['defense'], right_on=['full_name'])\n",
	"    .rename(columns={'abbreviation':'def_abbr', 'full_name': 'def_full_name'})\n",
	")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Expected points at the start of the play:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Class probabilities for the state before each play (columns follow clf.classes_).\n",
	"start_features = regular_play[['down','distance','adjusted_yardline']]\n",
	"EP_predict = clf.predict_proba(start_features)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Expected points = sum of (class probability * point value).\n",
	"# NOTE(review): the weights assume the probability columns are ordered\n",
	"# -7, -3, -2, 0, 2, 3, 7 - confirm against clf.classes_. Column 3 is left out of the sum.\n",
	"EP = (\n",
	"    EP_predict[:,0] * -7\n",
	"    + EP_predict[:,1] * -3\n",
	"    + EP_predict[:,2] * -2\n",
	"    + EP_predict[:,4] * 2\n",
	"    + EP_predict[:,5] * 3\n",
	"    + EP_predict[:,6] * 7\n",
	")\n",
	"regular_play['EP_start'] = EP"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Cleaning the data for expected point at the end of the play"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Placeholder columns for the state after the play; the following cells fill them in.\n",
	"for new_col in ['new_yardline', 'new_down', 'new_distance', 'turnover']:\n",
	"    regular_play[new_col] = 0"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [],
	"source": [
	"#Drop missing data and erroneous play type\n",
	"has_play_text = regular_play.play_text.notnull()\n",
	"not_bare_interception = regular_play.play_type != 'Interception'\n",
	"regular_play = regular_play[has_play_text & not_bare_interception]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Down and distance after the play.\n",
	"is_turnover = regular_play.play_type.isin(turnover_play_type)\n",
	"says_first_down = regular_play.play_text.str.contains('1ST')\n",
	"new_series = is_turnover | says_first_down\n",
	"\n",
	"# Turnovers and plays whose text contains '1ST' restart at 1st and 10.\n",
	"regular_play.loc[new_series, 'new_down'] = 1\n",
	"regular_play.loc[new_series, 'new_distance'] = 10\n",
	"\n",
	"# Every other play moves to the next down with the remaining distance.\n",
	"regular_play.loc[~new_series, 'new_down'] = regular_play.down + 1\n",
	"regular_play.loc[~new_series, 'new_distance'] = regular_play.distance - regular_play.yards_gained\n",
	"\n",
	"# The 50 yard line is written without a team abbreviation, so it is set directly.\n",
	"at_midfield = regular_play.play_text.str.contains('50 yard line')\n",
	"regular_play.loc[at_midfield, 'new_yardline'] = 50\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Fumble recovered by the opponent: position computed from yard_line, 1st and 10.\n",
	"lost_fumble = regular_play.play_type == 'Fumble Recovery (Opponent)'\n",
	"regular_play.loc[lost_fumble, 'new_yardline'] = 100- (regular_play.yard_line + regular_play.yards_gained)\n",
	"regular_play.loc[lost_fumble, 'new_down'] = 1\n",
	"regular_play.loc[lost_fumble, 'new_distance'] = 10\n",
	"\n",
	"# Sack: next down, with the distance adjusted by yards_gained.\n",
	"is_sack = regular_play.play_type == 'Sack'\n",
	"regular_play.loc[is_sack, 'new_yardline'] = 100- (regular_play.yard_line - regular_play.yards_gained)\n",
	"regular_play.loc[is_sack, 'new_down'] = regular_play.down + 1\n",
	"regular_play.loc[is_sack, 'new_distance'] = regular_play.distance - regular_play.yards_gained"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {},
	"outputs": [],
	"source": [
	"#Collect end of play yardline information (e.g. Alab 38 = Alabama own 38) from play_text and match the team abbreviation\n",
	"#For each side: keep plays whose text contains that team's abbreviation, take the text after its first occurrence,\n",
	"#and read the first number found there. temp_df is an explicit copy so adding split_string does not write into a slice of regular_play.\n",
	"\n",
	"#Offense abbreviation: new_yardline = 100 - number\n",
	"temp_df = regular_play.iloc[np.char.find(regular_play.play_text.values.astype(str), regular_play.off_abbr.values.astype(str)) >= 0].copy()\n",
	"temp_df['split_string'] = [x[1] for x in list(np.char.split(temp_df.play_text.values.astype(str),sep =temp_df.off_abbr.values.astype(str)))]\n",
	"numbered_df = temp_df[temp_df.play_text.str.contains(r'\\d+', regex=True)]\n",
	"regular_play.loc[numbered_df.index, 'new_yardline'] = 100-np.array(numbered_df.split_string.str.extract(r'(\\d+)').astype(float)).ravel()\n",
	"\n",
	"#Defense abbreviation: new_yardline = number\n",
	"temp_df = regular_play.iloc[np.char.find(regular_play.play_text.values.astype(str), regular_play.def_abbr.values.astype(str)) >= 0].copy()\n",
	"temp_df['split_string'] = [x[1] for x in list(np.char.split(temp_df.play_text.values.astype(str),sep =temp_df.def_abbr.values.astype(str)))]\n",
	"numbered_df = temp_df[temp_df.play_text.str.contains(r'\\d+', regex=True)]\n",
	"regular_play.loc[numbered_df.index, 'new_yardline'] = np.array(numbered_df.split_string.str.extract(r'(\\d+)').astype(float)).ravel()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {},
	"outputs": [],
	"source": [
	"#Where no yardline could be read from the text, fall back to start position minus yards gained\n",
	"regular_play.loc[pd.isna(regular_play.new_yardline),'new_distance'] = regular_play.distance - regular_play.yards_gained \n",
	"regular_play.loc[pd.isna(regular_play.new_yardline),'new_yardline'] = regular_play.adjusted_yardline - regular_play.yards_gained\n",
	"\n",
	"regular_play.loc[regular_play.play_type == 'Pass Incompletion', 'new_yardline'] = regular_play.adjusted_yardline\n",
	"\n",
	"regular_play.loc[regular_play.play_text.str.contains('touchback'), 'new_yardline'] = 80\n",
	"regular_play.loc[regular_play.play_text.str.contains('touchback'), 'new_down'] = 1\n",
	"\n",
	"#Fake data for model prediction, EP will be changed after processing the data\n",
	"\n",
	"regular_play.loc[regular_play.play_type == 'Safety', 'new_yardline'] = 99 #Fake yardline for Safety\n",
	"\n",
	"regular_play.loc[regular_play.play_type.isin(off_TD),'new_down'] = 1 #Fake new down for Offensive touchdown play\n",
	"regular_play.loc[regular_play.play_type.isin(off_TD),'new_distance'] = 10 #Fake new yards to go for Offensive touchdown play\n",
	"\n",
	"regular_play.loc[(regular_play.play_type.isin(off_TD) | regular_play.play_type.isin(def_TD)),'new_yardline'] = 99 #Fake yardline for Offensive and Defensive touchdown play\n",
	"\n",
	"#Turnover on down. The mask is computed once: it depends on new_down, which is reset to 1 below,\n",
	"#so re-testing it on every line would leave the distance and yardline updates with no rows to act on.\n",
	"turnover_on_downs = (regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD))\n",
	"regular_play.loc[turnover_on_downs,'turnover'] = 1\n",
	"regular_play.loc[turnover_on_downs,'new_down'] = 1 \n",
	"regular_play.loc[turnover_on_downs,'new_distance'] = 10\n",
	"regular_play.loc[turnover_on_downs,'new_yardline'] = 100-regular_play.new_yardline #Yardline from the other side\n",
	"\n",
	"\n",
	"regular_play.loc[((regular_play.new_yardline <= 0) |(regular_play.new_distance <= 0)) & (regular_play.play_type == 'Sack') & (regular_play.play_text.str.contains('return')), 'new_down' ] = 1 #Strip sack\n",
	"regular_play.loc[((regular_play.new_yardline <= 0) |(regular_play.new_distance <= 0)) & (regular_play.play_type == 'Sack') & (regular_play.play_text.str.contains('return')), 'new_distance' ] = 10 \n",
	"regular_play.loc[((regular_play.new_yardline <= 0) |(regular_play.new_distance <= 0)) & (regular_play.play_text.str.contains('return')), 'new_yardline' ] = 100-(regular_play.adjusted_yardline - regular_play.yards_gained)\n",
	"regular_play.loc[ regular_play.play_text.str.contains('return'), 'turnover' ] = 1 \n",
	"\n",
	"regular_play.loc[regular_play.new_distance <= 0, 'new_down'] = 1 #First down not in API\n",
	"regular_play.loc[regular_play.new_distance <= 0, 'new_distance'] = 10"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Plays whose new yardline is still zero or negative get start position minus yards gained.\n",
	"no_valid_yardline = regular_play.new_yardline <= 0\n",
	"regular_play.loc[no_valid_yardline, 'new_yardline'] = regular_play.adjusted_yardline - regular_play.yards_gained"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Any play whose text contains 'TOUCHDOWN' gets the placeholder yardline 99.\n",
	"text_has_touchdown = regular_play.play_text.str.contains('TOUCHDOWN')\n",
	"regular_play.loc[text_has_touchdown, 'new_yardline'] = 99"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Discard plays with erroneous data: keep only plays whose start and end yardlines are strictly between 0 and 100"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Keep plays whose yardline before and after the play is strictly between 0 and 100.\n",
	"end_in_range = (regular_play.new_yardline > 0) & (regular_play.new_yardline < 100)\n",
	"start_in_range = (regular_play.adjusted_yardline > 0) & (regular_play.adjusted_yardline < 100)\n",
	"regular_play = regular_play[end_in_range & start_in_range]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Calculate the expected points at the end of the play. Since the model is given its inputs by column name, we first extract the end-of-play columns and rename them to the feature names used in training"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"metadata": {},
	"outputs": [],
	"source": [
	"# End-of-play state, stored under the feature names the classifier was fitted on.\n",
	"out_df = pd.DataFrame({\n",
	"    'down': regular_play['new_down'],\n",
	"    'distance': regular_play['new_distance'],\n",
	"    'adjusted_yardline': regular_play['new_yardline'],\n",
	"})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Class probabilities for the state after each play, weighted into expected points.\n",
	"# NOTE(review): the weights assume the probability columns are ordered\n",
	"# -7, -3, -2, 0, 2, 3, 7 - confirm against clf.classes_. Column 3 is left out of the sum.\n",
	"end_features = out_df[['down','distance','adjusted_yardline']]\n",
	"EP_predict = clf.predict_proba(end_features)\n",
	"EP = (\n",
	"    EP_predict[:,0] * -7\n",
	"    + EP_predict[:,1] * -3\n",
	"    + EP_predict[:,2] * -2\n",
	"    + EP_predict[:,4] * 2\n",
	"    + EP_predict[:,5] * 3\n",
	"    + EP_predict[:,6] * 7\n",
	")\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Store the end-of-play expected points next to each play.\n",
	"regular_play = regular_play.assign(EP_end=EP)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Finally, set the expected points at the end of touchdown and safety plays to 7 and -2, and reverse the sign for turnover plays"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 34,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Set EP_end to 7 for touchdowns, identified by play type or by 'TOUCHDOWN' in the play text.\n",
	"is_touchdown = (\n",
	"    regular_play.play_type.isin(off_TD + def_TD)\n",
	"    | regular_play.play_text.str.contains('TOUCHDOWN')\n",
	")\n",
	"regular_play.loc[is_touchdown, 'EP_end'] = 7"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 35,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Flip the sign of EP_end for turnover play types and for plays flagged turnover == 1.\n",
	"# The comparison is parenthesised because | binds tighter than ==.\n",
	"changed_possession = regular_play.play_type.isin(turnover_play_type) | (regular_play.turnover == 1)\n",
	"regular_play.loc[changed_possession, 'EP_end'] *= -1"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 36,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Safeties get a fixed EP_end of -2, replacing the model value.\n",
	"is_safety = regular_play.play_type == 'Safety'\n",
	"regular_play.loc[is_safety, 'EP_end'] = -2"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 37,
	"metadata": {},
	"outputs": [],
	"source": [
	"# EPA = expected points after the play minus expected points before it.\n",
	"regular_play['EPA'] = regular_play.EP_end - regular_play.EP_start"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 38,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Play types counted as passing plays and as rushing plays in the summaries below.\n",
	"pass_play_type = [\n",
	"    'Sack',\n",
	"    'Pass Incompletion',\n",
	"    'Pass Interception Return',\n",
	"    'Pass Reception',\n",
	"    'Interception Return Touchdown',\n",
	"    'Passing Touchdown',\n",
	"    'Pass Completion',\n",
	"    'Pass Interception',\n",
	"]\n",
	"rush_play_type = [\n",
	"    'Fumble Recovery (Opponent)',\n",
	"    'Fumble Recovery (Own)',\n",
	"    'Fumble Return Touchdown',\n",
	"    'Rush',\n",
	"    'Rushing Touchdown',\n",
	"]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Check EPA by play type"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 39,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.03542604861003451"
	]
	},
	"execution_count": 39,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Mean EPA over passing plays.\n",
	"regular_play.loc[regular_play.play_type.isin(pass_play_type), 'EPA'].mean()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 40,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"-0.02507176193772802"
	]
	},
	"execution_count": 40,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Mean EPA over rushing plays.\n",
	"regular_play.loc[regular_play.play_type.isin(rush_play_type), 'EPA'].mean()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 41,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"play_type\n",
	"Fumble Recovery (Opponent) -4.600877\n",
	"Fumble Recovery (Own) -0.935820\n",
	"Fumble Return Touchdown -7.319282\n",
	"Interception Return Touchdown -7.766629\n",
	"Pass Incompletion -1.104547\n",
	"Pass Interception Return -3.544499\n",
	"Pass Reception 0.912590\n",
	"Passing Touchdown 3.536264\n",
	"Rush -0.071102\n",
	"Rushing Touchdown 2.217327\n",
	"Sack -1.874424\n",
	"Safety -0.429088\n",
	"Name: EPA, dtype: float64"
	]
	},
	"execution_count": 41,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Mean EPA for each play type.\n",
	"regular_play.groupby('play_type').EPA.mean()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 42,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Write the play-level table, including the row index, to a CSV in the working directory.\n",
	"output_csv = 'CFB_regular_play_18.csv'\n",
	"regular_play.to_csv(output_csv, index=True)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}