Last active
February 25, 2017 21:10
-
-
Save linwoodc3/55d1cf413089560f157030b94746844f to your computer and use it in GitHub Desktop.
Utility functions to clean ESPN.com's NBA data.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# By: Linwood Creekmore | |
# https://www.linkedin.com/in/linwood-creekmore-iii-a2174538/ | |
from dateutil.parser import parse | |
from concurrent.futures import ProcessPoolExecutor | |
from functools import partial | |
import datetime | |
import re | |
import numpy as np | |
import pandas as pd | |
import pysal as ps | |
def datecheck(row,year): | |
"""Function to convert different datetime columns into datetime object | |
Uses basic rule to create 2016 and 2017 datetime objects based on knowledge | |
Parameters | |
------------ | |
row : pandas row | |
row of pandas dataframe to perform the operations | |
year: int | |
Integer representing the desired year | |
Returns | |
------------ | |
Series ; pandas series | |
Series with datetime objects | |
""" | |
if datetime.datetime.now().year == parse(str(year)).year: | |
year1=year-1 | |
year2=year | |
else: | |
year1 = year-1 | |
year2=year | |
try: | |
date = str(row['Date']+' {0} '.format(year1)+ row['Time']) | |
try: | |
converted = parse(row['Date']+' {0} '.format(year1)+ row['Time']) | |
except: | |
try: | |
converted = parse(row['Date']+' {0} '.format(year2)+ row['Time']) | |
except: | |
pass | |
if converted < parse('Oct 01 {0}'.format(year1)): | |
try: | |
return parse(row['Date'] + " {0} ".format(year2)+ row['Time']) | |
except: | |
print(row) | |
else: | |
return converted | |
except: | |
print(date) | |
pass | |
def tierclean(tier): | |
""" Reverses the tier scoring; makes 5 equal to 1 | |
""" | |
if tier == 5: | |
return 1 | |
if tier == 4: | |
return 2 | |
if tier==3: | |
return 3 | |
if tier == 2: | |
return 4 | |
if tier == 1: | |
return 5 | |
def singlename(p): | |
"""Extracts single team city name from full team | |
""" | |
test = p.split() | |
if "Portland" in test or len(test)==2: | |
return test[0] | |
else: | |
return " ".join(test[:2]) | |
def stringreplace(string): | |
"""Get rid of the 'at' in the schedule""" | |
return re.sub('at ',"",string).strip() | |
def schedulecleaner(year,team): | |
"""Functions to clean the ESPN schedule of NBA teams | |
and output as pandas dataframe | |
Parameters | |
----------- | |
year : int | |
Integer representing the year of the desired schedule | |
team : str | |
Trigram representing the three letter abbreviation of the NBA team | |
Returns | |
----------- | |
dataframe : pandas dataframe | |
Dataframe with data for NBA schedule of desired team | |
and desired year | |
""" | |
base = "http://www.espn.com/nba/teams/printSchedule?team={0}&season={1}".\ | |
format(team,str(year)) | |
schedule = pd.read_html(base) | |
schedule = schedule[0].iloc[2:,:3].reset_index(drop=True) | |
schedule.columns = ['Date','Team','Time'] | |
schedule = schedule[schedule['Date']!="Date"] | |
dater = partial(datecheck,year=year) | |
schedule = schedule[['Team']].assign(Date=schedule.apply(dater,axis=1),\ | |
home_status=np.where(schedule.Team.str.contains('at '),"A","H"),\ | |
Team=schedule.Team.apply(stringreplace)) | |
return schedule | |
def checker(team,frame): | |
"""Used to normalize the chance of home games.""" | |
normalizer=frame.sum().max() | |
# return (np.all(teams[team].values==2)) | |
return ((np.sum(frame[team].values)/normalizer)) | |
def standingscleaner(year): | |
"""Function to pull and clean NBA standings data from ESPN.""" | |
standings = pd.read_html('http://www.espn.com/nba/standings/_/season/{0}'.format(year)) | |
# create columns for eastern/western conference teams | |
eastern = standings[0].assign(conference='Eastern') | |
western = standings[1].assign(conference='Western') | |
# concatenate the different conference tables | |
nba_stand_df = (pd.concat([eastern,western]).reset_index(drop=True)).assign(seasonYear=year) | |
# clean text column with team name and abbreviation | |
# regex to clean text in standings columns | |
r = re.compile('[A-Z]{2,}') | |
s = re.compile('[A-Z]{1,3}.*[ ]?([A-Z]{1})?([0-9]+)?[a-z]+') | |
""" | |
big cleaning job here. We use the regex above to create new columns | |
with the Team's name spelled out, the 3 letter all-caps abbreviation, | |
the city name, and finally the team's tier based on it's win/loss pct. | |
column. | |
""" | |
clean_nba_stand = nba_stand_df.iloc[:,1:].assign(\ | |
teamName=nba_stand_df.iloc[:,0]\ | |
.apply(lambda x: s.search(x).group() if\ | |
s.search(x) else np.nan ),\ | |
team_abbrev = nba_stand_df.iloc[:,0]\ | |
.apply(lambda x: r.search(x).group() if \ | |
r.search(x) else np.nan),\ | |
cityName=nba_stand_df.iloc[:,0]\ | |
.apply(lambda x: s.search(x).group() if\ | |
s.search(x) else np.nan ).apply(singlename),\ | |
Tier=pd.Series((ps.Fisher_Jenks(nba_stand_df.PCT,k=5)).yb+1)\ | |
.apply(tierclean)) | |
# Now we reorder the columns | |
clean_nba_stand = clean_nba_stand[clean_nba_stand.columns[-2:].tolist()+\ | |
clean_nba_stand.columns[-6:-2].tolist()+\ | |
clean_nba_stand.columns[:-6].tolist()] | |
replacements = { | |
'teamName': { | |
r'(Seattle SuperSonics)': 'Oklahoma City Thunder', | |
r'New Jersey Nets': 'Brooklyn Nets', | |
r'Los Angeles Clippers':'LA Clippers', | |
r'Charlotte Bobcats':'Charlotte Hornets', | |
r'New Orleans Hornets':'New Orleans Pelicans', | |
r'NO/Oklahoma City Hornets':'New Orleans Pelicans'}, | |
'team_abbrev':{ | |
r'SEA':'OKC', | |
r'NJ':'BKN', | |
r'LAC':'LA' | |
}, | |
'cityName':{ | |
r'Seattle':'Oklahoma City', | |
r'New Jersey':'Brooklyn', | |
r'NO/Oklahoma City':'New Orleans', | |
r'Oklahoma$':'Oklahoma City', | |
} | |
} | |
clean_nba_stand.replace(replacements, regex=True,inplace=True) | |
clean_nba_stand.ix[clean_nba_stand.team_abbrev == 'LA', 'cityName'] = 'LA' | |
return clean_nba_stand | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment