Last active
April 29, 2018 22:39
-
-
Save Mashimo/11dd439d1546319e60601474fb0da0e7 to your computer and use it in GitHub Desktop.
Read NHL Historic Player Points Statistics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Read NHL historic player points statistics from an ESPN HTML table.

The cleaning steps are plain functions operating on a DataFrame so they can
be reused and tested without network access; the download itself only runs
when this file is executed as a script.
"""
import pandas as pd

# Table for the 2014-2015 regular season.
url = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/2015/seasontype/2"

# read_html de-duplicates the repeated 'G'/'A' header cells by appending
# '.1', '.2'; map them back to names similar to the website's column groups
# (power-play and short-handed goals/assists).
COLUMN_RENAMES = {'G.1': 'PP-G', 'A.1': 'PP-A', 'G.2': 'SH-G', 'A.2': 'SH-A'}

# Columns that hold statistics and therefore should be numeric.
NUMERIC_COLUMNS = (
    'GP', 'G', 'A', 'PTS', '+/-', 'PIM', 'PTS/G', 'SOG', 'PCT', 'GWG',
    'PP-G', 'PP-A', 'SH-G', 'SH-A',
)


def tidy_player_table(raw_df, drop_rank=True):
    """Return a cleaned copy of the raw statistics table.

    Steps, in order:
      1. rename the auto-generated duplicate column names;
      2. keep only rows with at least 4 non-NaN cells (``thresh=4``), which
         removes rows that carry no player statistics;
      3. remove header rows repeated inside the table body
         (rows whose PLAYER cell is the literal string 'PLAYER');
      4. forward-fill missing ranks in 'RK' with the previous rank;
      5. drop the 'RK' column unless ``drop_rank`` is False;
      6. reset the index so it has no holes.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Table as returned by ``pd.read_html``; must contain the 'PLAYER'
        and 'RK' columns. It is not modified.
    drop_rank : bool, default True
        When True (the original behaviour) the 'RK' column is removed.
        Pass False to keep the forward-filled ranking.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index.
    """
    table_df = raw_df.rename(columns=COLUMN_RENAMES)
    table_df = table_df.dropna(axis=0, thresh=4)
    table_df = table_df[table_df['PLAYER'] != 'PLAYER']

    # Assign the filled column back explicitly: an in-place fillna on
    # ``table_df.RK`` acts on a temporary Series and is not guaranteed to
    # reach the frame, and ``fillna(method=...)`` is deprecated.
    table_df = table_df.assign(RK=table_df['RK'].ffill())

    if drop_rank:
        table_df = table_df.drop(columns=['RK'])

    return table_df.reset_index(drop=True)


def coerce_stat_columns(table_df, columns=NUMERIC_COLUMNS):
    """Return a copy of ``table_df`` with ``columns`` converted to numbers.

    Values that cannot be parsed become NaN (``errors='coerce'``).

    Parameters
    ----------
    table_df : pandas.DataFrame
        Frame containing every column named in ``columns``. Not modified.
    columns : iterable of str, default NUMERIC_COLUMNS
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
    """
    converted = table_df.copy()
    for column in columns:
        converted[column] = pd.to_numeric(converted[column], errors='coerce')
    return converted


if __name__ == "__main__":
    # Load the page and extract the first table; row 1 holds the column names.
    table_df = pd.read_html(url, header=1)[0]

    table_df = tidy_player_table(table_df)

    # Check the data types before and after the numeric conversion.
    print(table_df.dtypes)
    table_df = coerce_stat_columns(table_df)
    print(table_df.dtypes)

    # Number of rows and columns. Printed explicitly: a bare expression only
    # shows up in an interactive session, not when run as a script.
    print(table_df.shape)

    # Number of unique PCT values (NaN counts as one value).
    print(len(table_df['PCT'].unique()))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment