Mashimo · April 29, 2018 22:39
diff --git a/readNHL.py b/readNHL.py
 import pandas as pd

 # Load up the table for the years 2014-2015, and extract the dataset out of it.
 # read_html returns one DataFrame per HTML table found on the page; the first
 # one is used, and header=1 takes row 1 (0-based) as the column names.
 #
 url = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/2015/seasontype/2"
 table_df = pd.read_html(url, header=1)[0]

 # Columns get automatic names. Rename the columns so that they are similar to the
 # column definitions on the website.
 #
 table_df = table_df.rename(
     columns={'G.1': 'PP-G', 'A.1': 'PP-A', 'G.2': 'SH-G', 'A.2': 'SH-A'})

 # Keep only the rows that hold at least 4 non-NaN values (thresh is the
 # minimum number of non-missing cells a row needs in order to survive),
 # e.g. get rid of rows that do not contain player points statistics.
 #
 table_df = table_df.dropna(axis=0, thresh=4)

 # Drop the rows whose PLAYER cell is the literal text 'PLAYER'
 # (presumably header rows repeated inside the table body).
 #
 table_df = table_df[table_df['PLAYER'] != 'PLAYER']

 # Get rid of the ranking 'RK' column. Its missing ranks are not
 # forward-filled beforehand: the column is discarded right here, so filling
 # it would have no effect on the resulting table.
 #
 table_df = table_df.drop(labels=['RK'], axis=1)

 # Ensure there are no holes in the index by resetting it. Rebinding to the
 # returned frame (instead of inplace=True) keeps the column assignments
 # below from writing into a slice of the filtered frame.
 #
 table_df = table_df.reset_index(drop=True)

 # Check the data type of all columns, and ensure those that should be
 # numeric are numeric. errors='coerce' turns unparseable cells into NaN.
 # A missing column raises KeyError here instead of going unnoticed.
 #
 numeric_columns = ['GP', 'G', 'A', 'PTS', '+/-', 'PIM', 'PTS/G', 'SOG', 'PCT',
                    'GWG', 'PP-G', 'PP-A', 'SH-G', 'SH-A']
 print(table_df.dtypes)
 for column in numeric_columns:
     table_df[column] = pd.to_numeric(table_df[column], errors='coerce')
 print(table_df.dtypes)

 # Number of rows (first element of the shape tuple). Printed explicitly,
 # because a bare expression is silently discarded when run as a script.
 #
 print(table_df.shape)

 # Number of unique PCT values (NaN, if present, counts as one value)
 print(len(table_df.PCT.unique()))
	import pandas as pd

	# Load up the table for the years 2014-2015, and extract the dataset out of it.
	# read_html returns one DataFrame per HTML table found on the page; the first
	# one is used, and header=1 takes row 1 (0-based) as the column names.
	#
	url = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/2015/seasontype/2"
	table_df = pd.read_html(url, header=1)[0]

	# Columns get automatic names. Rename the columns so that they are similar to the
	# column definitions on the website.
	#
	table_df = table_df.rename(
		columns={'G.1': 'PP-G', 'A.1': 'PP-A', 'G.2': 'SH-G', 'A.2': 'SH-A'})

	# Keep only the rows that hold at least 4 non-NaN values (thresh is the
	# minimum number of non-missing cells a row needs in order to survive),
	# e.g. get rid of rows that do not contain player points statistics.
	#
	table_df = table_df.dropna(axis=0, thresh=4)

	# Drop the rows whose PLAYER cell is the literal text 'PLAYER'
	# (presumably header rows repeated inside the table body).
	#
	table_df = table_df[table_df['PLAYER'] != 'PLAYER']

	# Get rid of the ranking 'RK' column. Its missing ranks are not
	# forward-filled beforehand: the column is discarded right here, so filling
	# it would have no effect on the resulting table.
	#
	table_df = table_df.drop(labels=['RK'], axis=1)

	# Ensure there are no holes in the index by resetting it. Rebinding to the
	# returned frame (instead of inplace=True) keeps the column assignments
	# below from writing into a slice of the filtered frame.
	#
	table_df = table_df.reset_index(drop=True)

	# Check the data type of all columns, and ensure those that should be
	# numeric are numeric. errors='coerce' turns unparseable cells into NaN.
	# A missing column raises KeyError here instead of going unnoticed.
	#
	numeric_columns = [
		'GP', 'G', 'A', 'PTS', '+/-', 'PIM', 'PTS/G', 'SOG', 'PCT',
		'GWG', 'PP-G', 'PP-A', 'SH-G', 'SH-A',
	]
	print(table_df.dtypes)
	for column in numeric_columns:
		table_df[column] = pd.to_numeric(table_df[column], errors='coerce')
	print(table_df.dtypes)

	# Number of rows (first element of the shape tuple). Printed explicitly,
	# because a bare expression is silently discarded when run as a script.
	#
	print(table_df.shape)

	# Number of unique PCT values (NaN, if present, counts as one value)
	print(len(table_df.PCT.unique()))