Last active
November 9, 2017 17:42
-
-
Save howardhamilton/87590b2e824d76d01f6d to your computer and use it in GitHub Desktop.
Identify the N nearest neighbors to a player's season statistics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
An outline for identifying N players with similar summary statistics to a player of interest.
A list of candidate players is compiled by filtering on position and end-of-season age.
Summary statistics are scaled to z-scores, which are the inputs to the machine learning model.
This algorithm uses K-Nearest Neighbor, but other algorithms (e.g. K-Means Clustering) can be substituted.
(c) 2015 Soccermetrics Research LLC
This code is licensed under the terms of the MIT License (http://choosealicense.com/licenses/mit/)
""" | |
from sklearn.neighbors import NearestNeighbors | |
from scipy.spatial.distance import cosine | |
import numpy as np | |
def get_statistical_fields(position):
    """
    Compile list of statistical categories to be retrieved from database
    given player position.

    :param position: position identifier; it is compared against
        ``POSITION['Goalkeeper']`` (``POSITION`` is expected to be defined
        elsewhere -- it is not part of this function).
    :returns: tuple ``(stat_table, metrics)`` where ``stat_table`` is the
        name of the statistics table for the position and ``metrics`` is a
        tuple of metric names: the metrics common to all players followed by
        the position-specific ones.
    """
    # metrics consistent among all players
    all_metric_list = ('games_started', 'games_subbed', 'minutes', 'yellows', 'reds')
    # metrics for field players
    field_metric_list = ('goals_total', 'goals_headed', 'goals_freekick', 'goals_in_area',
                         'goals_out_area', 'goals_penalty', 'penalties', 'winners', 'assists',
                         'deadball_assists', 'shots', 'fouls')
    # metrics for goalkeepers
    gk_metric_list = ('wins', 'draws', 'losses', 'goals_allowed', 'shutouts', 'shots_allowed')

    if position != POSITION['Goalkeeper']:
        stat_table = 'field_stats_list'
        metrics = all_metric_list + field_metric_list
    else:
        stat_table = 'goalkeeper_stats_list'
        metrics = all_metric_list + gk_metric_list

    # The selection must be handed back explicitly: without this return the
    # function yields None and two-value unpacking by the caller fails.
    return stat_table, metrics
def identify_similar_player(player, competition, season, N=20):
    """
    Identify N nearest neighbors to player's statistical performance
    controlling for players of same age and position.

    :param player: identifier of the player of interest.
    :param competition: identifier of the competition of the player's record.
    :param season: identifier of the season of the player's record.
    :param N: number of neighbors requested (default 20); capped at the
        number of candidate players available.
    :returns: tuple ``(distances, indices)`` as produced by
        ``NearestNeighbors.kneighbors``; ``indices`` refer to rows of the
        feature matrix, which follow the order of the candidate list.
    :raises ValueError: if no candidate players are found.
    """
    # retrieve player position and age
    player_position, player_age = get_position_and_age(player, season)

    # get statistical fields that are relevant to player position
    # field players and goalkeepers have statistical categories unique to them
    stat_table, stat_metrics = get_statistical_fields(player_position)

    # retrieve player z-scores for each metric --> player statistical record
    # some metrics do not exist for player --> these are zeroed out
    player_record = get_statistical_record(player, competition, season, stat_metrics)

    # retrieve candidate list of players who match position and age
    # player, competition, season IDs
    candidate_player_list = create_candidate_list(stat_table, player_position, player_age)

    # create candidate player statistical record --> feature matrix
    training_list = [
        get_statistical_record(candidate_player, candidate_competition,
                               candidate_season, stat_metrics)
        for candidate_player, candidate_competition, candidate_season in candidate_player_list
    ]
    if not training_list:
        raise ValueError("no candidate players match the player's position and age")

    # train a K-Nearest Neighbor model on candidate player statistical records
    # Cosine distance does not satisfy the triangle inequality, so it is not a
    # valid metric for tree-based search; use the exact brute-force search with
    # scikit-learn's built-in 'cosine' metric instead of a Python callback.
    # for more details see http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
    X = np.array(training_list)
    # never ask for more neighbors than there are candidates
    n_neighbors = min(N, len(X))
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute', metric='cosine').fit(X)

    # find the K-neighbors to the player's statistical record
    # kneighbors expects a 2-D (n_queries, n_features) array; a flat record is
    # promoted to a single-row query (assumes the record is one flat sequence
    # of metric values -- confirm against get_statistical_record)
    distances, indices = nbrs.kneighbors(np.atleast_2d(player_record))
    return distances, indices
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment