Last active
November 3, 2024 18:42
-
-
Save 903124/0cf68236b200c0f9af6518cfd744386d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pybaseball\n", | |
"import requests\n", | |
"import requests_cache\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import math\n", | |
"from sklearn.externals import joblib\n", | |
"from scipy import stats\n", | |
"from sklearn.model_selection import train_test_split\n", | |
"from sklearn.ensemble import RandomForestClassifier\n", | |
"from datetime import datetime\n", | |
"import json\n", | |
"import psycopg2\n", | |
"import sqlalchemy\n", | |
"from bs4 import BeautifulSoup\n", | |
"import urllib.request\n", | |
"import csv\n", | |
"import heapq\n", | |
"np.set_printoptions(suppress=True)\n", | |
"np.set_printoptions(precision=2)\n", | |
"\n", | |
"\n", | |
"\n", | |
"team_array = ['LAA','ARI','ATL','BAL','BOS','CWS','CHC','CIN','CLE','COL','DET','MIA','HOU','KC','LAD','MIL','MIN','NYY','NYM','OAK','PHI','PIT','SD','SEA','SF','STL','TB','TEX','TOR','WSH']\n", | |
"altitude_array = [151,1086,930,33,21,595,595,535,653,5190,600,10,45,865,515,597,815,55,10,3,20,780,23,10,0,460,15,545,270,35]\n", | |
"\n", | |
"\n", | |
"\n", | |
"def train_data(statcast_data):\n", | |
" \n", | |
" pos_id_range = []\n", | |
" pos_id_dp = []\n", | |
" pos_id_dp_pivot = []\n", | |
" pos_id_arm = []\n", | |
" \n", | |
" catch_result = []\n", | |
" train_parameter_range = [] #exit speed, exit angle, theta, altitude, handness, shift, date, 1b,2b,3b ,out\n", | |
"\n", | |
" dp_result = []\n", | |
" train_parameter_dp = [] #exit speed, exit angle, hc_x, hc_y, altitude, handness, shift, date, on_2b, on_3b\n", | |
"\n", | |
" arm_result = []\n", | |
" train_parameter_arm = [] #exit speed, exit angle, hc_x, hc_y, park, handness, shift, date,condition\n", | |
" extra_base_case_count = []\n", | |
" kill_id = []\n", | |
"\n", | |
" for count,item in enumerate(statcast_data['des']):\n", | |
" if(count % 10000 == 0):\n", | |
" print(count)\n", | |
"\n", | |
" if item is None:\n", | |
" continue\n", | |
"\n", | |
" # error = item.find('error')\n", | |
" # if(error != -1):\n", | |
" # continue\n", | |
" # bunt = item.find('bunt')\n", | |
" # if(bunt != -1):\n", | |
" # bunt_indicator = 1\n", | |
" # if(len(pos_catch_array) == 0 ):\n", | |
" # continue\n", | |
" # if(statcast_2015_data['bb_type'][count] == 'fly_ball' or statcast_2015_data['bb_type'][count] == 'popup' ):\n", | |
" # continue\n", | |
" # if(min(pos_catch_array) == pos_p or min(pos_catch_array) == pos_c):\n", | |
" # continue \n", | |
" launch_speed_data = statcast_data['launch_speed'][count] \n", | |
" launch_angle_data = statcast_data['launch_angle'][count] \n", | |
"\n", | |
" on_1b = 1\n", | |
" on_2b = 1\n", | |
" on_3b = 1\n", | |
"\n", | |
" if(pd.isna(statcast_data['on_1b'][count])):\n", | |
" on_1b = 0\n", | |
" if(pd.isna(statcast_data['on_2b'][count])):\n", | |
" on_2b = 0\n", | |
" if(pd.isna(statcast_data['on_3b'][count])):\n", | |
" on_3b = 0 \n", | |
"\n", | |
" out_at_2nd = item.find('out at 2nd') \n", | |
" out_at_3rd = item.find('out at 3rd') \n", | |
" out_at_home = item.find('out at home') \n", | |
"\n", | |
" to_2nd = item.find('to 2nd') \n", | |
" to_3rd = item.find('to 3rd') \n", | |
" scores = item.find('scores') \n", | |
"\n", | |
" game_day = statcast_data['game_date'][count].dayofyear\n", | |
"\n", | |
" hit_location = statcast_data['hit_location'][count]\n", | |
" events = statcast_data['events'][count]\n", | |
"\n", | |
" altitude = 0\n", | |
" park = 0\n", | |
" for i, team in enumerate(team_array):\n", | |
"\n", | |
" if(statcast_data['home_team'][count] == team):\n", | |
" altitude = altitude_array[i]\n", | |
" park = i \n", | |
" # infield_height = baseball_tracjectory_sim(launch_speed_data,launch_angle_data,altitude)\n", | |
" theta = math.degrees(math.atan((128-statcast_data['hc_x'][count])/(204-statcast_data['hc_y'][count])))\n", | |
"\n", | |
"\n", | |
" hc_x = 128-statcast_data['hc_x'][count]\n", | |
" hc_y = 204-statcast_data['hc_y'][count]\n", | |
" # if(theta > 55):\n", | |
" # continue\n", | |
" if(statcast_data['stand'][count] == 'L'):\n", | |
" handness = 1\n", | |
" else:\n", | |
" handness = 0\n", | |
"\n", | |
" if(statcast_data['if_fielding_alignment'][count] == 'Standard'):\n", | |
" shift = 0\n", | |
" elif(statcast_data['if_fielding_alignment'][count] == 'Strategic'):\n", | |
" shift = 1 \n", | |
" else:\n", | |
" shift = 2\n", | |
"\n", | |
" out_when_up = statcast_data['outs_when_up'][count]\n", | |
"\n", | |
"\n", | |
"\n", | |
" train_parameter_range.append([launch_speed_data, launch_angle_data, theta, altitude, handness, shift, game_day,on_1b, on_2b, on_3b, out_when_up])\n", | |
"\n", | |
" lineup_id = []\n", | |
" for i in range(9):\n", | |
"\n", | |
" lineup_id.append(statcast_data['pos' + str(i+1) + '_person_id'][count])\n", | |
"\n", | |
" pos_id_range.append(lineup_id)\n", | |
"\n", | |
" if(pd.isna(hit_location)):\n", | |
" catch_result.append(0)\n", | |
" else:\n", | |
" catch_result.append(hit_location)\n", | |
"\n", | |
" dp_indicator = 0 \n", | |
" if(hit_location >= 0 and to_2nd != -1): \n", | |
" dp_indicator = 1\n", | |
"\n", | |
"\n", | |
" try:\n", | |
" pos_ss = item.find('shortstop')\n", | |
" except AttributeError:\n", | |
" continue\n", | |
" pos_1b = item.find('first baseman')\n", | |
" pos_2b = item.find('second baseman')\n", | |
" pos_3b = item.find('third baseman')\n", | |
" pos_p = item.find('pitcher')\n", | |
" pos_lf = item.find('left fielder')\n", | |
" pos_cf = item.find('center fielder')\n", | |
" pos_rf = item.find('right fielder')\n", | |
" pos_c = item.find('catcher')\n", | |
"\n", | |
"\n", | |
" pos_array = np.array([pos_p,pos_c,pos_1b,pos_2b,pos_3b,pos_ss,pos_lf,pos_cf,pos_rf])\n", | |
" pos_catch_array = pos_array[np.where(pos_array>0)]\n", | |
"\n", | |
"\n", | |
" if(len(pos_catch_array) == 0 ):\n", | |
" continue\n", | |
"\n", | |
" # dp_indicator = 0 \n", | |
" # if(hit_location >= 0 and to_2nd != -1): \n", | |
" # dp_indicator = 1 \n", | |
" first_receiver = 0\n", | |
" for i in range(9):\n", | |
" if(min(pos_catch_array) == pos_array[i]):\n", | |
" first_receiver = i\n", | |
"\n", | |
" if(on_1b == 1 and out_when_up < 2 ):\n", | |
"\n", | |
" for i in range(9):\n", | |
" if(min(pos_catch_array) == pos_array[i]):\n", | |
"\n", | |
" pos_id_dp.append(statcast_data['pos'+ str(i+1) + '_person_id'][count])\n", | |
"\n", | |
" second_min = heapq.nsmallest(2, pos_catch_array)[-1] \n", | |
" if(len(pos_catch_array) >= 2 and (second_min == pos_1b or second_min == pos_2b or second_min == pos_ss or second_min == pos_3b)):\n", | |
"\n", | |
" for i in range(9):\n", | |
" if(second_min == pos_array[i]):\n", | |
" pos_id_dp_pivot.append(statcast_data['pos'+ str(i+1) + '_person_id'][count])\n", | |
"\n", | |
"\n", | |
" else:\n", | |
" pos_id_dp_pivot.append(0)\n", | |
"\n", | |
"\n", | |
" train_parameter_dp.append([launch_speed_data, launch_angle_data, hc_x, hc_y, altitude, handness, shift, first_receiver])\n", | |
"\n", | |
"\n", | |
"\n", | |
" # double_play = item.find('double play')\n", | |
" if(statcast_data['events'][count] == 'grounded_into_double_play' ):\n", | |
" dp_result.append(1)\n", | |
" else:\n", | |
" dp_result.append(0)\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
" flies_out = item.find('flies out')\n", | |
" lines_out = item.find('lines out')\n", | |
"\n", | |
"\n", | |
"\n", | |
" if(min(pos_catch_array) == pos_lf or min(pos_catch_array) == pos_cf or min(pos_catch_array) == pos_rf):\n", | |
"\n", | |
" if((events == 'single' and (on_1b == 1 or on_2b == 1) )or (events == 'double' and on_1b == 1) or ((flies_out != -1 or lines_out != -1 )and (on_2b == 1 or on_3b == 1) and out_when_up < 2 )):\n", | |
" \n", | |
" if(min(pos_catch_array) == pos_lf):\n", | |
" first_receiver_id = (statcast_data['pos7_person_id'][count])\n", | |
" elif(min(pos_catch_array) == pos_cf):\n", | |
" first_receiver_id =(statcast_data['pos8_person_id'][count])\n", | |
" else:\n", | |
" first_receiver_id =(statcast_data['pos9_person_id'][count])\n", | |
" \n", | |
" pos_id_arm.append(first_receiver_id)\n", | |
" \n", | |
" \n", | |
" if(events == 'single' and on_2b == 1): \n", | |
"\n", | |
" extra_base_case = 0\n", | |
" extra_base_case_count.append(extra_base_case)\n", | |
" \n", | |
" if(item.find('out at home') != -1):\n", | |
" kill_id.append(first_receiver_id)\n", | |
" \n", | |
" if(scores != -1 or out_at_home != -1):\n", | |
"\n", | |
" arm_result.append(1)\n", | |
" else: \n", | |
" arm_result.append(0) \n", | |
"\n", | |
" elif(events == 'single' and on_1b == 1): \n", | |
"\n", | |
" extra_base_case = 1\n", | |
" extra_base_case_count.append(extra_base_case)\n", | |
" \n", | |
" if(item.find('out at third') != -1):\n", | |
" kill_id.append(first_receiver_id) \n", | |
"\n", | |
" if(to_2nd != -1 or out_at_2nd != -1):\n", | |
"\n", | |
" arm_result.append(0)\n", | |
" else: \n", | |
" arm_result.append(1)\n", | |
"\n", | |
" elif(events == 'double' and on_1b == 1):\n", | |
"\n", | |
" extra_base_case = 2\n", | |
" extra_base_case_count.append(extra_base_case)\n", | |
" \n", | |
" if(item.find('out at third') != -1):\n", | |
" kill_id.append(first_receiver_id) \n", | |
" \n", | |
" if(scores != -1 or out_at_home != -1):\n", | |
"\n", | |
" arm_result.append(1)\n", | |
" else: \n", | |
" arm_result.append(0) \n", | |
"\n", | |
" elif((flies_out != -1 or lines_out != -1 )and (on_3b == 1) and statcast_data['outs_when_up'][count] < 2):\n", | |
"\n", | |
" extra_base_case = 3\n", | |
" extra_base_case_count.append(extra_base_case)\n", | |
" \n", | |
" if(item.find('out at home') != -1):\n", | |
" kill_id.append(first_receiver_id) \n", | |
" \n", | |
" if(scores != -1 or out_at_home != -1):\n", | |
"\n", | |
" arm_result.append(1)\n", | |
" else: \n", | |
" arm_result.append(0) \n", | |
" else:\n", | |
" extra_base_case = 4\n", | |
" extra_base_case_count.append(extra_base_case)\n", | |
" \n", | |
" if(item.find('out at third') != -1):\n", | |
" kill_id.append(first_receiver_id) \n", | |
" \n", | |
" if(to_3rd != -1 or out_at_3rd != -1):\n", | |
"\n", | |
" arm_result.append(1)\n", | |
" else: \n", | |
" arm_result.append(0) \n", | |
"\n", | |
" train_parameter_arm.append([launch_speed_data, launch_angle_data, hc_x, hc_y, park, handness, shift, game_day, extra_base_case, first_receiver,out_when_up])\n", | |
"\n", | |
"\n", | |
"\n", | |
" print(len(catch_result)) \n", | |
" \n", | |
" return pos_id_range,pos_id_dp,pos_id_dp_pivot,pos_id_arm,catch_result,train_parameter_range ,dp_result,train_parameter_dp,arm_result,train_parameter_arm,extra_base_case_count,kill_id\n", | |
"\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Get statcast data\n", | |
"\n", | |
"conn = psycopg2.connect(\"dbname=dbname user=user host=host password=password\")\n", | |
"cur = conn.cursor()\n", | |
"engine = sqlalchemy.create_engine('dialect+driver://username:password@host:port/database', echo=False)\n", | |
"\n", | |
"\n", | |
"\n", | |
"data_April_18 = pybaseball.statcast(start_dt='2018-04-03', end_dt='2018-05-02')\n", | |
"data_May_18 = pybaseball.statcast(start_dt='2018-05-03', end_dt='2018-06-02')\n", | |
"data_June_18 = pybaseball.statcast(start_dt='2018-06-03', end_dt='2018-07-02')\n", | |
"\n", | |
"\n", | |
"\n", | |
"data_April_18.to_sql('baseball_2018', con=engine, if_exists='replace')\n", | |
"data_May_18.to_sql('baseball_2018', con=engine, if_exists='append')\n", | |
"data_June_18.to_sql('baseball_2018', con=engine, if_exists='append')\n", | |
"\n", | |
"data_April_17 = pybaseball.statcast(start_dt='2017-04-03', end_dt='2017-05-02')\n", | |
"data_May_17 = pybaseball.statcast(start_dt='2017-05-03', end_dt='2017-06-02')\n", | |
"data_June_17 = pybaseball.statcast(start_dt='2017-06-03', end_dt='2017-07-02')\n", | |
"data_July_17 = pybaseball.statcast(start_dt='2017-07-03', end_dt='2017-08-02')\n", | |
"data_August_17 = pybaseball.statcast(start_dt='2017-08-03', end_dt='2017-09-02')\n", | |
"data_September_17 = pybaseball.statcast(start_dt='2017-09-03', end_dt='2017-10-02')\n", | |
"\n", | |
"\n", | |
"data_April.to_sql('baseball_2018', con=engine, if_exists='replace')\n", | |
"data_May.to_sql('baseball_2018', con=engine, if_exists='append')\n", | |
"data_June.to_sql('baseball_2018', con=engine, if_exists='append')\n", | |
"\n", | |
"data_April.to_sql('baseball_2017', con=engine, if_exists='replace')\n", | |
"data_May.to_sql('baseball_2017', con=engine, if_exists='append')\n", | |
"data_June.to_sql('baseball_2017', con=engine, if_exists='append')\n", | |
"data_July.to_sql('baseball_2017', con=engine, if_exists='append')\n", | |
"data_August.to_sql('baseball_2017', con=engine, if_exists='append')\n", | |
"data_September.to_sql('baseball_2017', con=engine, if_exists='append')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Train with 2017 Statcast data\n", | |
"\n", | |
"query = '''\n", | |
"SELECT des,game_date,bb_type,home_team,events,description,launch_speed, launch_angle, on_3b, on_2b, on_1b,stand,\n", | |
"zone,outs_when_up,hc_x,hc_y,hit_location,\n", | |
"pos1_person_id, pos2_person_id, pos3_person_id, \n", | |
"pos4_person_id, pos5_person_id, pos6_person_id,\n", | |
"pos7_person_id, pos8_person_id, pos9_person_id, \n", | |
"if_fielding_alignment, of_fielding_alignment\n", | |
"\n", | |
"FROM baseball_2017\n", | |
"WHERE launch_speed >= 0 AND description != 'foul' AND pos1_person_id >= 0\n", | |
"AND events != 'walk' AND events != 'hit_by_pitch' AND hc_x >1 AND hc_y > 1 \n", | |
"ORDER BY game_date\n", | |
"'''\n", | |
"\n", | |
"\n", | |
"statcast_2017_data = pd.read_sql(query,engine) \n", | |
" \n", | |
"pos_id_range,pos_id_dp,pos_id_dp_pivot,pos_id_arm,catch_result,train_parameter_range ,dp_result,train_parameter_dp,arm_result,train_parameter_arm,extra_base_case_count,kill_id = train_data(statcast_2017_data) \n", | |
" \n", | |
"\n", | |
"\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Plus-minus run\n", | |
" \n", | |
"label = ['launch speed', 'exit angle', 'theta' ,'altitude','handness','shift','date', 'on_1b', 'on_2b', 'on_3b','out'] \n", | |
"\n", | |
"\n", | |
"df = pd.DataFrame.from_records(np.array(train_parameter_range).tolist(),columns = label) \n", | |
"X_train, X_test, y_train, y_test = train_test_split(df[label], catch_result, test_size=0.5, stratify=catch_result, random_state=123456) \n", | |
"rf_catch = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=123456)\n", | |
"rf_catch.fit(X_train, y_train)\n", | |
"\n", | |
"joblib.dump(rf_catch, 'rf_catch_17.pkl')\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Double play run\n", | |
"\n", | |
"label = ['launch speed', 'exit angle', 'hc_x' , 'hc_y', 'altitude','handness','shift','first_receiver'] \n", | |
"\n", | |
" \n", | |
"df2 = pd.DataFrame.from_records(np.array(train_parameter_dp).tolist(),columns = label) \n", | |
"X_train, X_test, y_train, y_test = train_test_split(df2[label], dp_result, test_size=0.3, stratify=dp_result, random_state=123456)\n", | |
"rf_dp = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=123456)\n", | |
"rf_dp.fit(X_train, y_train)\n", | |
"joblib.dump(rf_dp, 'rf_dp_17.pkl')\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Arm run\n", | |
"\n", | |
"label = ['launch speed', 'exit angle', 'hc_x' , 'hc_y', 'altitude','handness','shift','date', 'condition','first_receiver'] \n", | |
"\n", | |
"\n", | |
"df3 = pd.DataFrame.from_records(np.array(train_parameter_arm).tolist(),columns = label) \n", | |
"\n", | |
"X_train, X_test, y_train, y_test = train_test_split(df3[label], arm_result, test_size=0.3, stratify=arm_result, random_state=123456)\n", | |
"\n", | |
"rf_arm = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=123456)\n", | |
"rf_arm.fit(X_train, y_train)\n", | |
"joblib.dump(rf_arm, 'rf_arm_17.pkl')\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Framing run\n", | |
"\n", | |
"query = '''\n", | |
"SELECT release_speed, release_pos_x, release_pos_z,\n", | |
"description, stand, p_throws, pfx_x, pfx_z, plate_x, plate_z,sz_top,sz_bot,vx0,vz0,\n", | |
"effective_speed,release_spin_rate,release_extension,zone,\n", | |
"pitcher,pos2_person_id,\n", | |
"ax,ay,az,pitch_name,home_team,balls,strikes,inning_topbot\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"FROM baseball_2017\n", | |
"WHERE description = 'ball' OR description = 'called_strike' \n", | |
"AND release_speed >= 0 AND plate_x <100 AND plate_z <100 \n", | |
"'''\n", | |
"statcast_2017_pitch_data = pd.read_sql(query,engine)\n", | |
"#statcast_2017_data\n", | |
"\n", | |
"statcast_pitch_data = statcast_2017_pitch_data.dropna(axis=0, how='any')\n", | |
"\n", | |
"train_pitch_parameter = pd.get_dummies(statcast_pitch_data[['plate_x','plate_z', 'stand' , 'p_throws', 'release_extension','release_speed','release_spin_rate' ,'release_pos_x','release_pos_z','vx0','vz0']]) \n", | |
"\n", | |
"pitch_result = []\n", | |
"\n", | |
"for description in statcast_pitch_data['description']:\n", | |
" if(description == 'ball'):\n", | |
" pitch_result.append(0)\n", | |
" else:\n", | |
" pitch_result.append(1)\n", | |
" \n", | |
"X_train, X_test, y_train, y_test = train_test_split(train_pitch_parameter, pitch_result, test_size=0.3, stratify=pitch_result, random_state=123456)\n", | |
"rf_pitch = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=123456)\n", | |
"rf_pitch.fit(X_train, y_train)\n", | |
"joblib.dump(rf_pitch, 'rf_pitch_17.pkl')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Calculate DRS using 18 data\n", | |
"\n", | |
"query = '''\n", | |
"SELECT des,game_date,bb_type,home_team,events,description,launch_speed, launch_angle, on_3b, on_2b, on_1b,stand,\n", | |
"zone,outs_when_up,hc_x,hc_y,hit_location,\n", | |
"pos1_person_id, pos2_person_id, pos3_person_id, \n", | |
"pos4_person_id, pos5_person_id, pos6_person_id,\n", | |
"pos7_person_id, pos8_person_id, pos9_person_id, \n", | |
"if_fielding_alignment, of_fielding_alignment\n", | |
"\n", | |
"FROM baseball_2018\n", | |
"WHERE launch_speed >= 0 AND description != 'foul' AND pos1_person_id >= 0\n", | |
"AND events != 'walk' AND events != 'hit_by_pitch' AND hc_x >1 AND hc_y > 1 \n", | |
"ORDER BY game_date \n", | |
"'''\n", | |
"statcast_2018_data = pd.read_sql(query,engine)\n", | |
"\n", | |
"pos_id_range,pos_id_dp,pos_id_dp_pivot,pos_id_arm,catch_result,train_parameter_range ,dp_result,train_parameter_dp,arm_result,train_parameter_arm,extra_base_case_count,kill_id = train_data(statcast_2018_data) \n", | |
"\n", | |
"\n", | |
"rf_catch = joblib.load('rf_catch_17.pkl')\n", | |
"rf_dp = joblib.load('rf_dp_17.pkl')\n", | |
"rf_arm = joblib.load('rf_arm_17.pkl')\n", | |
"\n", | |
"\n", | |
"predict_proba_catch = rf_catch.predict_proba(train_parameter_range)\n", | |
"predict_proba_dp = rf_dp.predict_proba(train_parameter_dp)\n", | |
"predict_proba_arm = rf_arm.predict_proba(train_parameter_arm)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Range Run/plus-minus run\n", | |
"\n", | |
"label = ['P', 'C', '1B' ,'2B','3B','SS','LF', 'CF', 'RF'] \n", | |
"\n", | |
"playerid_df = pd.DataFrame.from_records(pos_id_range,columns = label) \n", | |
"\n", | |
"\n", | |
"unique_id_range = []\n", | |
"\n", | |
"for i, position in enumerate(label):\n", | |
"\n", | |
" unique_id_range.append(playerid_df[position].unique())\n", | |
"\n", | |
"\n", | |
"range_run = []\n", | |
"range_count = []\n", | |
"for i in range(9):\n", | |
"\n", | |
" range_run.append(np.zeros(len(unique_id_range[i])))\n", | |
" range_count.append(np.zeros(len(unique_id_range[i])))\n", | |
"range_run = np.array(range_run)\n", | |
"range_run_count = np.array(range_count)\n", | |
"\n", | |
"\n", | |
"for i, result in enumerate(catch_result):\n", | |
" \n", | |
" if(result > 0):\n", | |
" for j,player_id in enumerate(unique_id_range[int(result-1)]):\n", | |
" if(player_id == pos_id_range[i][int(result-1)]):\n", | |
" range_run[int(result-1)][j] += 0.83*predict_proba_catch[i][0]\n", | |
" range_count[int(result-1)][j] +=1\n", | |
" else:\n", | |
" for k in range(9):\n", | |
" for j,player_id in enumerate(unique_id_range[k]): \n", | |
" if(player_id == pos_id_range[i][k]):\n", | |
" range_run[k][j] -= 0.83*predict_proba_catch[i][k+1]\n", | |
" range_count[k][j] +=1\n", | |
"\n", | |
"\n", | |
"for i in range(9):\n", | |
" range_run_sum = np.sum(range_run[i])\n", | |
" range_run[i] -= 0.5*np.sum(range_run_sum)*range_count[i]/np.sum(range_count[i])\n", | |
" \n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Double play run\n", | |
"\n", | |
"unique_dp = np.unique(pos_id_dp)\n", | |
"unique_dp_pivot = np.unique(pos_id_dp_pivot)\n", | |
"dp_run = np.zeros(len(unique_dp))\n", | |
"dp_pivot = np.zeros(len(unique_dp_pivot))\n", | |
"dp_count = np.zeros(len(unique_dp))\n", | |
"dp_pivot_count = np.zeros(len(unique_dp_pivot))\n", | |
"\n", | |
"for i, result in enumerate(dp_result):\n", | |
"\n", | |
" for j,player_id in enumerate(unique_dp):\n", | |
"\n", | |
" if(player_id == pos_id_dp[i]):\n", | |
" if(pos_id_dp_pivot[i] == 0):\n", | |
" dp_count[j] += 1\n", | |
" if(result == 1):\n", | |
"\n", | |
" dp_run[j] += 0.58*predict_proba_dp[i][0]\n", | |
" else:\n", | |
" dp_run[j] -= 0.58*predict_proba_dp[i][1]\n", | |
" \n", | |
" else: \n", | |
" for k,player_id_pivot in enumerate(unique_dp_pivot):\n", | |
" if(player_id_pivot== pos_id_dp_pivot[i]):\n", | |
" dp_count[j] += 1\n", | |
" dp_pivot_count[k] += 1\n", | |
" if(result == 1):\n", | |
"\n", | |
" dp_run[j] += 0.5*0.58*predict_proba_dp[i][0]\n", | |
" dp_pivot[k] += 0.5*0.58*predict_proba_dp[i][0]\n", | |
" else:\n", | |
" dp_run[j] -= 0.5*0.58*predict_proba_dp[i][1]\n", | |
" dp_pivot[k] -= 0.5*0.58*predict_proba_dp[i][1]\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Arm run\n", | |
"\n", | |
"unique_arm = np.unique(pos_id_arm)\n", | |
"\n", | |
"arm_run = np.zeros(len(unique_arm))\n", | |
"\n", | |
"arm_count = np.zeros(len(unique_arm))\n", | |
"\n", | |
"\n", | |
"for j,player_id in enumerate(unique_arm): \n", | |
" for i, result in enumerate(arm_result):\n", | |
" \n", | |
" if(player_id == pos_id_arm[i]):\n", | |
" arm_count[j] += 1\n", | |
" if(result == 1):\n", | |
" arm_run[j] -= 0.25* predict_proba_arm[i][0]\n", | |
" \n", | |
" else:\n", | |
" arm_run[j] += 0.25*predict_proba_arm[i][1]\n", | |
" for kill_player_id in kill_id:\n", | |
" if(player_id == kill_player_id):\n", | |
" arm_run[j] += 0.75 \n", | |
"\n", | |
"arm_run_sum = np.sum(arm_run)\n", | |
"arm_run -= 0.5*np.sum(arm_run_sum)*arm_count/np.sum(arm_count)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Catch Framing run\n", | |
"\n", | |
"query = '''\n", | |
"SELECT release_speed, release_pos_x, release_pos_z,\n", | |
"description, stand, p_throws, pfx_x, pfx_z, plate_x, plate_z,sz_top,sz_bot,vx0,vz0,\n", | |
"effective_speed,release_spin_rate,release_extension,\n", | |
"pitcher,pos2_person_id,\n", | |
"ax,ay,az,pitch_name,home_team,balls,strikes,inning_topbot\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"FROM baseball_2018\n", | |
"WHERE description = 'ball' OR description = 'called_strike'\n", | |
"AND release_speed >= 0 AND plate_x <100 AND plate_z <100 \n", | |
"'''\n", | |
"statcast_2018_pitch_data = pd.read_sql(query,engine)\n", | |
"\n", | |
"statcast_pitch_data = statcast_2018_pitch_data.dropna(axis=0, how='any')\n", | |
"\n", | |
"train_pitch_parameter = pd.get_dummies(statcast_pitch_data[['plate_x','plate_z', 'stand' , 'p_throws', 'release_extension','release_speed','release_spin_rate' ,'release_pos_x','release_pos_z','vx0','vz0']]) \n", | |
"\n", | |
"\n", | |
"rf_pitch = joblib.load('rf_pitch_17.pkl')\n", | |
"\n", | |
"pitch_proba = rf_pitch.predict_proba(train_pitch_parameter)\n", | |
"\n", | |
"pitch_result = []\n", | |
"\n", | |
"for description in statcast_pitch_data['description']:\n", | |
" if(description == 'ball'):\n", | |
" pitch_result.append(0)\n", | |
" else:\n", | |
" pitch_result.append(1)\n", | |
" \n", | |
"unique_catcher_frame = pd.unique(statcast_pitch_data['pos2_person_id'])\n", | |
"catcher_id_all = np.array(statcast_pitch_data['pos2_person_id'])\n", | |
"\n", | |
"frame_run = np.zeros(len(unique_catcher_frame))\n", | |
"frame_count = np.zeros(len(unique_catcher_frame))\n", | |
"\n", | |
"for i,proba in enumerate(pitch_proba):\n", | |
" for j,player_id in enumerate(unique_catcher_frame):\n", | |
" \n", | |
" if(player_id == catcher_id_all[i] ):\n", | |
" frame_count[j] += 1\n", | |
" if(pitch_result[i] == 1):\n", | |
" frame_run[j] += 0.14*proba[0]\n", | |
" else:\n", | |
" frame_run[j] -= 0.14*proba[1]\n", | |
"\n", | |
"frame_run_sum = np.sum(frame_run)\n", | |
"frame_run -= 0.5*frame_run_sum / np.sum(frame_count)* frame_count " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Catch stolen base run\n", | |
"\n", | |
"f = urllib.request.urlopen(\"https://baseballsavant.mlb.com/poptime?year=2017&team=&min2b=1&min3b=0\")\n", | |
"soup = BeautifulSoup(f,\"lxml\")\n", | |
"\n", | |
"catcher_poptime_df = pd.DataFrame.from_dict(json.loads(soup.findAll(\"script\")[9].text.split('\\n')[1][15:-1]))\n", | |
"\n", | |
"catch_pop_id = catcher_poptime_df['player_id']\n", | |
"catch_pop_run = np.zeros(len(catch_pop_id))\n", | |
"for i in range(len(catch_pop_id)):\n", | |
"\n", | |
" catch_pop_run[i] = 1.2*(2.01-float(catcher_poptime_df['pop_2b_sba'][i]))*float(catcher_poptime_df['pop_2b_sba_count'][i])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Get player name from database\n", | |
"url=\"https://raw.githubusercontent.com/chadwickbureau/register/master/data/people.csv\"\n", | |
"player_table=pd.read_csv(url)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Inning data and positional adjustment\n", | |
"\n", | |
"query = '''\n", | |
"SELECT \n", | |
"pos1_person_id, pos2_person_id, pos3_person_id, \n", | |
"pos4_person_id, pos5_person_id, pos6_person_id,\n", | |
"pos7_person_id, pos8_person_id, pos9_person_id,outs_when_up,inning,game_pk\n", | |
"\n", | |
"\n", | |
"\n", | |
"FROM baseball_2018\n", | |
"WHERE pos1_person_id IS NOT NULL\n", | |
"GROUP BY pos1_person_id, pos2_person_id, pos3_person_id, \n", | |
"pos4_person_id, pos5_person_id, pos6_person_id,\n", | |
"pos7_person_id, pos8_person_id, pos9_person_id,outs_when_up,inning,game_pk\n", | |
"'''\n", | |
"statcast_2018_data_all_inning = pd.read_sql(query,engine)\n", | |
"\n", | |
"statcast_data_all_inning = statcast_2018_data_all_inning\n", | |
"\n", | |
"statcast_data_all_inning_id = statcast_data_all_inning.iloc[:,0:9]\n", | |
"pos_id_all_inning = np.unique(statcast_data_all_inning_id.dropna(axis=0, how='any').values)\n", | |
"\n", | |
"positional_inning = np.zeros((len(pos_id_all_inning),9))\n", | |
"\n", | |
"for i,column in enumerate(statcast_data_all_inning.iloc[:,0:9]):\n", | |
"\n", | |
"\n", | |
" for k, player_id_match in enumerate(pos_id_all_inning):\n", | |
" positional_inning[k][i] += np.in1d(statcast_data_all_inning[column],player_id_match).sum()*1/3\n", | |
" \n", | |
"positional_inning_sum = np.sum(positional_inning,axis=1) \n", | |
"player_position = np.argmax(positional_inning, axis=1)\n", | |
"\n", | |
"positional_run_temp = np.zeros((len(pos_id_all_inning),9))\n", | |
"\n", | |
"position_run_adjustment = [0,12.5,-12.5,2.5,2.5,7.5,-7.5,2.5,-7.5]\n", | |
"\n", | |
"for i in range(9):\n", | |
"\n", | |
" positional_run_temp[:,i] = positional_inning[:,i]/9/162*position_run_adjustment[i]\n", | |
" \n", | |
"positional_run = np.sum(positional_run_temp,axis=1).tolist() " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Adding all the above together\n", | |
"\n", | |
"unique_id_range_stack = np.hstack(unique_id_range)\n", | |
"range_run_stack = np.hstack(range_run)\n", | |
"\n", | |
"run_array = [range_run_stack,dp_run,dp_pivot,arm_run,catch_pop_run,frame_run,positional_run]\n", | |
"id_array = [unique_id_range_stack,unique_dp,unique_dp_pivot,unique_arm,np.array(catch_pop_id),unique_catcher_frame,pos_id_all_inning]\n", | |
"unique_id_all = np.unique(np.hstack(pos_id_all_inning))\n", | |
"\n", | |
"player_run = np.zeros((len(unique_id_all),7))\n", | |
"\n", | |
"for i,player_id in enumerate(unique_id_all):\n", | |
" for j in range(7):\n", | |
" for k,player_id_match in enumerate(id_array[j]):\n", | |
" if(player_id == player_id_match):\n", | |
" player_run[i][j] += run_array[j][k]\n", | |
"\n", | |
"player_total_run_df = pd.DataFrame(player_run,columns=['rPM','rDPc','rDPp','rARM','rSB','rFrame','rPos'])\n", | |
"player_total_run_df['rGDP'] = player_total_run_df[['rDPc','rDPp']].sum(axis=1)\n", | |
"player_total_run_df['player_id'] = unique_id_all\n", | |
"\n", | |
"player_name = []\n", | |
"\n", | |
"\n", | |
"for i in unique_id_all:\n", | |
"\n", | |
" player_name.append((player_table[player_table.key_mlbam == i]['name_first'].to_string(index=False) + ' ' + player_table[player_table.key_mlbam == i]['name_last'].to_string(index=False)))\n", | |
" \n", | |
"player_total_run_df['Player Name'] = player_name\n", | |
"\n", | |
"player_position_dict= {0:'P',1:'C',2:'1B',3:'2B',4:'3B',5:'SS',6:'LF',7:'CF',8:'RF'}\n", | |
"player_position_output = np.vectorize(player_position_dict.__getitem__)(player_position)\n", | |
"player_total_run_df['Position']=player_position_output\n", | |
"\n", | |
"player_total_run_df['Inning'] = positional_inning_sum \n", | |
"player_total_run_df['sDRS'] = player_total_run_df[['rPM','rGDP','rARM','rSB','rFrame']].sum(axis=1)\n", | |
"player_total_run_df['Def'] = player_total_run_df[['rPM','rGDP','rARM','rSB','rFrame','rPos']].sum(axis=1)\n", | |
"\n", | |
"player_total_run_df = player_total_run_df.loc[player_total_run_df['player_id'] != 0]\n", | |
"\n", | |
"player_total_run_df_out = player_total_run_df[['Player Name','Position','Inning','rPM','rGDP','rARM','rSB','rFrame','sDRS','rPos','Def']]\n", | |
"player_total_run_df_out = player_total_run_df_out.sort_values(by=['sDRS'],ascending=False)\n", | |
"player_total_run_df_out = player_total_run_df_out.round(1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Output\n", | |
"\n", | |
"player_total_run_df_out.to_csv('StatcastDRS_18.csv')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment