Created
August 12, 2016 01:31
-
-
Save mrphilroth/aaf4bc875652979e592ccbaf0f3e954c to your computer and use it in GitHub Desktop.
Scrape basketball data and process the play-by-play data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import os | |
import sys | |
import time | |
import redis | |
import requests | |
import argparse | |
import datetime | |
import numpy as np | |
from bs4 import BeautifulSoup | |
# Season years handled by the bulk tasks: 2010 through 2016 inclusive.
years = [season for season in range(2010, 2017)]
# Module-wide Redis client; connects to a server on localhost.
rclient = redis.Redis(host="localhost")
# The number of seconds of game time at the start of each period.
# Periods 1-4 are spaced 720 seconds (12 minutes) apart; from period 5 on
# the spacing is 300 seconds (5 minutes), starting at 2880.
period_starts = {quarter: 720 * (quarter - 1) for quarter in range(1, 5)}
period_starts.update(
    {extra: 2880 + 300 * (extra - 5) for extra in range(5, 11)}
)
# Root of the local page mirror: $HOME/data.
ddir = os.path.join(os.environ["HOME"], "data")
# Per-sport settings: "dir" is the local mirror directory and "url" is the
# base address of the matching reference site.
sportdict = {
    sport: {"dir": os.path.abspath(ddir + "/" + sport), "url": site}
    for sport, site in (
        ("baseball", "http://www.baseball-reference.com"),
        ("basketball", "http://www.basketball-reference.com"),
        ("football", "http://www.pro-football-reference.com"),
    )
}
def get_page(sport, ext, filename=None, force=False):
    """
    Mirror a directory structure found at a base url in a local directory.

    Return the HTML of the page at ``<sport url>/<ext>``. The page is cached
    on disk at ``filename`` (default: ``<sport dir>/<ext>``); when the cached
    file exists and ``force`` is false it is returned without any request.

    sport    -- key into ``sportdict`` ("baseball", "basketball", "football")
    ext      -- path of the page relative to the site's base url
    filename -- optional explicit cache path
    force    -- when true, download even if a cached copy exists

    NOTE: a non-200 response is retried indefinitely, sleeping 1, 2, 3, ...
    seconds between attempts, so a permanently missing page never returns.
    """
    if filename is None:
        filename = "{}/{}".format(sportdict[sport]["dir"], ext)
    if not force and os.path.exists(filename):
        # Context manager so the cached file handle is always closed.
        with open(filename, encoding="utf-8") as cached:
            return cached.read()
    filedir = os.path.dirname(filename)
    # A bare filename has no directory component; makedirs("") would raise.
    if filedir:
        os.makedirs(filedir, exist_ok=True)
    url = "{}/{}".format(sportdict[sport]["url"], ext)
    print(url)
    currdelay = 1
    r = requests.get(url)
    while r.status_code != 200:
        time.sleep(currdelay)
        r = requests.get(url)
        currdelay += 1
    html = r.text
    # Explicit encoding so the cache does not depend on the locale and can
    # hold any character present in the page.
    with open(filename, "w", encoding="utf-8") as out:
        out.write(html)
    return html
def remove_page(sport, ext, filename=None):
    """
    Delete the locally cached copy of a page, if there is one.

    The cache path is ``filename`` when given, otherwise
    ``<sport dir>/<ext>``. Nothing happens when the path does not exist.
    """
    target = filename
    if target is None:
        target = "{}/{}".format(sportdict[sport]["dir"], ext)
    if not os.path.exists(target):
        return
    os.remove(target)
def day_from_gamecode(gamecode):
    """
    Turn the leading YYYYMMDD of a gamecode into a whole-day index.

    The index is the POSIX timestamp of that date's midnight divided by
    86400 and truncated, returned as a float. It is used as the sort score
    when games are added to redis. The datetime is naive, so the timestamp
    is taken in the machine's local timezone.
    """
    year, month, day = (
        int(gamecode[start:stop]) for start, stop in ((0, 4), (4, 6), (6, 8))
    )
    midnight = datetime.datetime(year, month, day)
    whole_days = int(midnight.timestamp() / 86400)
    return float(whole_days)
def get_players(row):
    """
    Collect every link in a table row as an (id, name) tuple.

    The id is the link's href with its first 11 and last 5 characters cut
    off (presumably the "/players/x/" prefix and ".html" suffix); the name
    is the link text.
    """
    players = []
    for anchor in row.findAll("a"):
        href = anchor.get("href")
        players.append((href[11:-5], anchor.text))
    return players
def get_gameclock(period, minute_str):
    """
    Convert a period number and a "MM:SS" countdown clock into the number
    of seconds of game time elapsed so far.

    Periods after the fourth count down from 5 minutes; the first four
    count down from 12 minutes.
    """
    minutes, seconds = map(float, minute_str.split(":"))
    remaining = 60 * minutes + seconds
    period_length = 5 * 60 if period > 4 else 12 * 60
    period_clock = period_length - remaining
    return period_starts[period] + period_clock
def get_right_digits(s):
    """
    Pick off characters from the right while they're digits and return them
    as an int. Used to grab the scores.

    The scan stops at the start of the string, so an all-digit string such
    as "105" is parsed whole instead of wrapping around through negative
    indices and raising IndexError.

    Raises ValueError when ``s`` does not end in a digit (including "").
    """
    i = len(s)
    # s[i:] is the run of trailing digits found so far.
    while i > 0 and s[i - 1].isdigit():
        i -= 1
    return int(s[i:])
# Two-digit month number -> three-letter month abbreviation.
month_map = {
    "{:02d}".format(number): abbreviation
    for number, abbreviation in enumerate(
        ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
        start=1,
    )
}


def short_date(gamecode):
    """
    Build a short date such as "Aug 12" from a gamecode.

    Characters 4-5 of the gamecode are the month and 6-7 the day.
    """
    month_name = month_map[gamecode[4:6]]
    day_of_month = gamecode[6:8]
    return month_name + " " + day_of_month
def get_season(gamecode):
    """
    Return the season year for a gamecode.

    Games dated after July (month > 7) are assigned to the following
    calendar year; all others keep the year in the gamecode.
    """
    calendar_year, month = int(gamecode[:4]), int(gamecode[4:6])
    return calendar_year + 1 if month > 7 else calendar_year
def _queue_team_players(pipe, table, gamecode, team, opp_label, year,
                        day_index, team_dict):
    """
    Queue the redis writes for every player of one team found in `table`
    and record the player -> team mapping in `team_dict`.
    """
    for row in table.findAll("tr"):
        for pid, pname in get_players(row):
            # Only rows with more than 30 child nodes are processed.
            if len(row.contents) <= 30:
                continue
            player_game_key = ":".join((pid, gamecode))
            team_dict[pid] = team
            pipe.sadd(":".join([gamecode, team]), pid)
            pipe.sadd("players", pid)
            pipe.sadd(":".join([pid, str(year), "teams"]), team)
            pipe.zadd(":".join([pid, "gamecodes"]), gamecode, day_index)
            pipe.sadd(":".join([team, "players"]), pid)
            pipe.sadd(":".join([gamecode, team, "players"]), pid)
            pipe.hset(":".join([pid, "info"]), "name", pname)
            pipe.hset(player_game_key, "name", pname)
            pipe.hset(player_game_key, "opp", opp_label)


def process_boxscore(gamecode):
    """
    Parse a game's box score page and store teams, scores and rosters in
    redis.

    Returns a tuple ``(team_dict, home_dict)``: ``team_dict`` maps each
    player id to that player's team code, and ``home_dict`` maps the home
    team code to "home" and the away team code to "away".
    """
    pipe = rclient.pipeline()
    day_index = day_from_gamecode(gamecode)
    # NOTE(review): zadd is called as (key, member, score); confirm this
    # matches the installed redis client's signature.
    pipe.zadd("gamecodes", gamecode, day_index)

    url = "boxscores/{}.html".format(gamecode)
    soup = BeautifulSoup(get_page("basketball", url), "lxml")

    # The score header is located through a styled span and its sixth
    # ancestor; this depends on the page layout.
    span = soup.find("span", {"class": "bold_text large_text"})
    header_table = list(span.parents)[5]
    team_links = [link for link in header_table.findAll("a")
                  if str(link).find("/teams/") > 0]
    team_spans = [candidate for candidate in header_table.findAll("span")
                  if str(candidate).find("/teams/") > 0]

    # The first team link/span is treated as the away side, the second as
    # the home side.
    away_link, home_link = team_links[0], team_links[1]
    away_team = away_link.get("href")[7:-5].replace("/", "")
    away_score = get_right_digits(team_spans[0].text)
    away_team_name = away_link.text
    home_team = home_link.get("href")[7:-5].replace("/", "")
    home_score = get_right_digits(team_spans[1].text)
    home_team_name = home_link.text

    for code, name in ((away_team, away_team_name),
                       (home_team, home_team_name)):
        team_key = ":".join([code, "info"])
        pipe.hset(team_key, "name", name)
        pipe.hset(team_key, "teamcode", code)

    game_key = ":".join([gamecode, "info"])
    game_fields = (
        ("gamecode", gamecode),
        ("shortdate", short_date(gamecode)),
        ("hometeam", home_team),
        ("hometeamname", home_team_name),
        ("homescore", home_score),
        ("awayteam", away_team),
        ("awayteamname", away_team_name),
        ("awayscore", away_score),
    )
    for field, value in game_fields:
        pipe.hset(game_key, field, value)

    team_dict = {}
    year = get_season(gamecode)
    tables = soup.findAll("table", {"class": "sortable stats_table"})
    # Table 0 is read as the away roster and table 2 as the home roster.
    _queue_team_players(pipe, tables[0], gamecode, away_team,
                        "@{}".format(home_team[:3]), year, day_index,
                        team_dict)
    _queue_team_players(pipe, tables[2], gamecode, home_team,
                        away_team[:3] + " ", year, day_index, team_dict)
    pipe.execute()

    home_dict = {home_team: "home", away_team: "away"}
    return team_dict, home_dict
def clear_the_floor(onthefloor, substitutions, period):
    """
    Empty every team's on-floor set at a period boundary.

    Each player currently on the floor gets an ("out", t) entry appended to
    their substitution list, where t is ``period_starts[period]``. Both
    arguments are modified in place.
    """
    for players in onthefloor.values():
        for player in players:
            substitutions[player].append(("out", period_starts[period]))
        players.clear()
def substitutions_to_rows(substitutions, gamecode, team_dict):
    """
    Convert list of substitutions to a second by second histogram.

    substitutions -- dict mapping player id to a list of ("in"|"out", second)
                     tuples, expected to alternate in/out
    gamecode      -- stored unchanged in every returned row
    team_dict     -- dict mapping player id to team code

    Returns one dict per player with keys "histogram", "player", "team" and
    "gamecode". The histogram is a uint8 array that is 1 for each second the
    player was on the floor; only its first 2880 seconds are kept.

    An odd-length list is repaired by inserting an "out" before the first
    "in" found at an odd position (the lists are modified in place). If no
    such "in" exists the list is reported and its complete pairs are still
    used, rather than failing with IndexError.
    """
    rows = []
    all_seconds = np.arange(4680) + 1
    for player, subs in substitutions.items():
        if len(subs) % 2 != 0:
            bad_in_inds = [i for i, s in enumerate(subs)
                           if s[0] == "in" and i % 2 == 1]
            if bad_in_inds:
                first_bad = bad_in_inds[0]
                # Close the preceding entry at its own timestamp so the
                # misplaced "in" starts a fresh pair.
                subs.insert(first_bad, ("out", subs[first_bad - 1][1]))
            else:
                print("MALFORMED SUBSTITUTION LIST!")
                print(subs)
        histogram = np.zeros(4680, dtype=np.uint8)
        # Walk (in, out) pairs; a trailing unpaired entry is ignored.
        for i in range(0, len(subs) - 1, 2):
            if subs[i][0] != "in":
                print("MALFORMED SUBSTITUTION LIST!")
                print(subs)
            on_floor = np.logical_and(all_seconds >= subs[i][1],
                                      all_seconds <= subs[i + 1][1])
            histogram[on_floor] = 1
        # TODO: add teamcode
        rows.append({"histogram": histogram[:2880],
                     "player": player,
                     "team": team_dict[player],
                     "gamecode": gamecode})
    return rows
def pack_game_histogram(histogram):
    """
    Bit-pack a histogram with numpy's packbits and return the raw bytes.
    """
    packed = np.packbits(histogram)
    return packed.tobytes()
def update_player_times(rows):
    """
    Store each row's packed histogram in redis.

    For every row the packed histogram is written to the "hist" field of
    the hash keyed "<player>:<gamecode>"; all writes go through a single
    pipeline.
    """
    pipe = rclient.pipeline()
    for entry in rows:
        player_game_key = ":".join((entry["player"], entry["gamecode"]))
        packed = pack_game_histogram(entry["histogram"])
        pipe.hset(player_game_key, "hist", packed)
    pipe.execute()
def subsample(hist):
    """
    Average a histogram over consecutive groups of four entries.

    The length of ``hist`` must be a multiple of four.
    """
    groups_of_four = hist.reshape(-1, 4)
    return groups_of_four.mean(axis=1)
def pack_totals_histogram(total, histogram):
    """
    Serialize a count and a histogram into one float32 byte string.

    The count is stored as the first value, followed by the histogram.
    """
    count = np.array(total, dtype=np.float32)
    combined = np.hstack((count, histogram))
    as_float32 = combined.astype(np.float32)
    return as_float32.tobytes()
def unpack_totals_histogram(buf):
    """
    Decode a float32 byte string into (total, histogram).

    The first value is the total; the remaining values are the histogram.
    """
    values = np.frombuffer(buf, dtype=np.float32)
    return values[0], values[1:]
def update_player_total(pid, year, teamcode, hist):
    """
    Fold one game's histogram into a player's running average.

    The value stored under "<pid>:<year>:<teamcode>" holds the number of
    games averaged so far followed by the average subsampled histogram.
    A missing key starts a new average with a count of 1.

    The stored value is fetched with a single GET and checked for None.
    This avoids a separate EXISTS round trip and the window in which the
    key could vanish between the check and the read.
    """
    rkey = ":".join([pid, str(year), teamcode])
    sampled = subsample(hist)
    stored = rclient.get(rkey)
    if stored is None:
        rclient.set(rkey, pack_totals_histogram(1, sampled))
        return
    ctotal, chist = unpack_totals_histogram(stored)
    # Running mean: re-weight the old average by its count, add this game.
    chist = (chist * ctotal + sampled) / (ctotal + 1)
    rclient.set(rkey, pack_totals_histogram(ctotal + 1, chist))
def update_player_totals(rows, year, team_dict):
    """
    Update the per-season running average for every row's player.

    Each row's histogram is converted to int16 and passed to
    update_player_total with the player's team code from ``team_dict``.
    """
    for entry in rows:
        player_id = entry["player"]
        player_team = team_dict[player_id]
        game_hist = np.array(entry["histogram"], dtype=np.int16)
        update_player_total(player_id, year, player_team, game_hist)
def process_pbp(gamecode, debug=False):
    """
    Process a game's entire play by play and store the results.

    Rows sharing a game clock value are cached and handled together to work
    out who entered, exited or appeared on the floor. The resulting
    substitution lists are turned into per-player histograms, written to
    redis, and returned. When the play by play table is missing, the cached
    page is removed and None is returned. ``debug`` prints the intermediate
    state.
    """
    url = "boxscores/pbp/{}.html".format(gamecode)
    soup = BeautifulSoup(get_page("basketball", url), "lxml")
    table = soup.find("table", {"class": "no_highlight stats_table"})
    if table is None or len(table) < 2:
        # This means the play by play is still not available for this game,
        # so remove the downloaded html and recheck on the next update.
        remove_page("basketball", url)
        return

    period = 0
    cachedrows = []
    currgameclock = 0
    year = get_season(gamecode)
    rows = table.findAll("tr")
    team_dict, home_dict = process_boxscore(gamecode)
    onthefloor = {team: set() for team in home_dict}
    substitutions = {player: [] for player in team_dict}

    for row in rows:
        # Rows carrying an "id" attribute mark a quarter end: they skip the
        # clock handling and go right to clearing the cached rows.
        is_period_marker = row.has_attr("id")
        gameclock = None
        if not is_period_marker:
            if not get_players(row):
                continue
            gameclock = get_gameclock(period, row.contents[1].text)
            if gameclock == currgameclock:
                cachedrows.append(row)
                continue

        # Processing game events before substitutions that happened at the
        # same time will lead to less errors.
        exited = set()
        entered = set()
        appeared = set()
        ignore_appearance = set()
        for crow in cachedrows:
            players = [pid for pid, _ in get_players(crow)]
            # If a player didn't make the box score but is in the play by
            # play, he probably got zero minutes. The game is almost over
            # and this substitution doesn't matter.
            if any(player not in team_dict for player in players):
                continue
            if debug:
                print(crow)
            if crow.text.find("enters the game") > 0:
                incoming = players[0]
                ignore_appearance.add(incoming)
                if incoming in exited:
                    exited.remove(incoming)
                else:
                    entered.add(incoming)
                if len(players) > 1:
                    outgoing = players[1]
                    ignore_appearance.add(outgoing)
                    if outgoing in entered:
                        entered.remove(outgoing)
                    else:
                        exited.add(outgoing)
            # Build up the active players from the beginning of each period
            # from in game stats as they happen.
            # Bench technical players may not be on the floor.
            # Ejected means he's not on the floor anymore.
            floor_count = sum(len(group) for group in onthefloor.values())
            if (floor_count < 10
                    and "echnical foul by" not in crow.text
                    and "ejected from game" not in crow.text):
                appeared.update(players)

        if debug:
            print(currgameclock)
            print(onthefloor)
            print("appeared: " + str(appeared))
            print("entered: " + str(entered))
            print("exited: " + str(exited))

        # If a player appeared during the same second that he was subbed,
        # we can't trust the order of those things. Ignore the appearance
        # and just trust the substitution.
        appeared = appeared.difference(ignore_appearance)
        for player in appeared:
            team_floor = onthefloor[team_dict[player]]
            if player not in team_floor:
                team_floor.add(player)
                substitutions[player].append(("in", period_starts[period]))
        for player in exited:
            team_floor = onthefloor[team_dict[player]]
            if player in team_floor:
                team_floor.remove(player)
            else:
                # Never seen coming on: count him in from the period start.
                substitutions[player].append(("in", period_starts[period]))
            substitutions[player].append(("out", int(currgameclock)))
        for player in entered:
            onthefloor[team_dict[player]].add(player)
            substitutions[player].append(("in", int(currgameclock)))

        if debug:
            print(onthefloor)

        # Check the status of the players on the floor only after all the
        # rows at the same clock time have been processed.
        if any(len(group) > 5 for group in onthefloor.values()):
            print("TOO MANY PLAYERS ON THE FLOOR!")
            print(gamecode)
            sys.exit()

        # If this is a quarter end, increment the period and clear the
        # floor.
        if is_period_marker:
            period += 1
            clear_the_floor(onthefloor, substitutions, period)

        # Start a new cache with the current row.
        cachedrows = [row]
        currgameclock = gameclock

    # Close out the last period.
    period += 1
    clear_the_floor(onthefloor, substitutions, period)

    rows = substitutions_to_rows(substitutions, gamecode, team_dict)
    if rows is None:
        print("Missing histograms")
        print(gamecode)
        sys.exit()
    update_player_times(rows)
    update_player_totals(rows, year, team_dict)
    return rows
def get_gamecodes(year):
    """
    Get all the gamecodes listed on a season's schedule page.

    The page is always downloaded again (force=True). For each table row,
    the gamecode is taken from the link inside the row's sixth child node;
    rows without such a link are skipped.
    """
    url = "leagues/NBA_{}_games.html".format(year)
    soup = BeautifulSoup(get_page("basketball", url, force=True), "lxml")
    schedule = soup.find("table", {"id": "games"})
    gamecodes = []
    for row in schedule.findAll("tr"):
        link = row.contents[5].find("a")
        if link is None:
            continue
        gamecodes.append(link.get("href")[11:-5])
    return gamecodes
def update_database():
    """
    Process every game, across all configured years, that is not yet in
    the "gamecodes" sorted set.
    """
    stored = rclient.zrange("gamecodes", 0, -1)
    processed = {raw.decode("utf-8") for raw in stored}
    for year in years:
        for gamecode in get_gamecodes(year):
            if gamecode in processed:
                continue
            print(gamecode)
            process_pbp(gamecode)
def refresh_database():
    """
    Process the games of the last configured year that are not yet in the
    "gamecodes" sorted set.
    """
    stored = rclient.zrange("gamecodes", 0, -1)
    processed = {raw.decode("utf-8") for raw in stored}
    latest_year = years[-1]
    for gamecode in get_gamecodes(latest_year):
        if gamecode in processed:
            continue
        print(gamecode)
        process_pbp(gamecode)
def rebuild_database():
    """
    Flush the redis database, then process every game of every configured
    year from scratch.
    """
    rclient.flushdb()
    for year in years:
        season_gamecodes = get_gamecodes(year)
        for gamecode in season_gamecodes:
            print(gamecode)
            process_pbp(gamecode)
def sync_files():
    """
    Fetch the play by play page and the box score page for every game of
    every configured year, without processing them.
    """
    page_templates = ("boxscores/pbp/{}.html", "boxscores/{}.html")
    for year in years:
        for gamecode in get_gamecodes(year):
            for template in page_templates:
                get_page("basketball", template.format(gamecode))
def main():
    """
    Command-line entry point: run the task named by the TASK argument.

    Recognized tasks (case-insensitive) are "update", "refresh", "rebuild"
    and "sync". Any other value is treated as a gamecode and processed with
    debug output.
    """
    parser = argparse.ArgumentParser(prog="scrape")
    parser.add_argument("task", metavar="TASK", type=str,
                        help="The task to complete. [update, refresh, "
                             "rebuild, sync, or a gamecode to debug]")
    args = parser.parse_args()
    task = args.task.lower()
    # A single if/elif chain: each named task runs exactly once and only
    # unrecognized values fall through to the gamecode branch. Previously
    # "update" also reached the final else and was processed as a gamecode.
    if task == "update":
        update_database()
    elif task == "refresh":
        refresh_database()
    elif task == "rebuild":
        rebuild_database()
    elif task == "sync":
        sync_files()
    else:
        process_pbp(args.task, debug=True)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment