Last active
February 8, 2016 17:59
-
-
Save enkeboll/c1fa7df2533a9939ec9a to your computer and use it in GitHub Desktop.
Scrapes pro-football-reference.com for the scoring summary of every Super Bowl, for the purpose of building "Super Bowl Boxes" payout statistics.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv

import requests
from bs4 import BeautifulSoup
# Site root; every page path below is appended to it.
base_url = "http://www.pro-football-reference.com"
# Path of the first page to scrape; later pages are reached by following
# each page's "Next" link.
next_page = "/super-bowl/i.htm"
# The site has no "Next" link leading to Super Bowl XLIX, for some reason,
# so that page is queued here and visited once the "Next" chain runs out.
extra = ['/super-bowl/xlix.htm']
def pair(a, b):
    """Return ``str(a) + str(b)`` with the smaller argument first.

    The arguments are compared with ``<`` exactly as given. For the
    single-character digit strings this script passes in, the result is
    therefore the same whichever order the two digits arrive in
    (``pair('7', '3')`` and ``pair('3', '7')`` both give ``'37'``).
    """
    if a < b:
        return str(a) + str(b)
    return str(b) + str(a)
# Payout schedule applied to each game's pot.
TOTAL_POT = 1250
START_PAYOUT = 50            # paid on the opening 0-0 score
HALFTIME_PAYOUT = 150        # paid on the score standing at halftime
SCORE_CHANGE_PAYOUT = 37.5   # paid on every recorded score change
# Quarter labels that can only belong to a second-half score row.
# NOTE(review): 'OT' is assumed to be the site's overtime label -- confirm.
SECOND_HALF_QUARTERS = ('3rd', '4th', 'OT')

scores = []  # rows of (bowl_number, payout_reason, digits, payout)
sb = 0       # counts pages visited; used as the bowl number in each row
while next_page:
    sb += 1
    pot = TOTAL_POT
    # Progress output. The parenthesised form runs on Python 2 and 3.
    print(next_page)
    url = base_url + next_page
    body = requests.get(url).text
    # Name the parser explicitly so the parse does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(body, 'html.parser')
    table = soup.find('table', {'id': 'scoring'})
    if table is None:
        raise ValueError('no scoring table found at ' + url)
    # Skip the first row (presumably the table header); each remaining row
    # is read as one score change.
    rows = table.find_all('tr')[1:]

    score_home, score_away = 0, 0
    td_cnt = 0             # extra rows emitted for 7-point changes (see below)
    halftime_paid = False

    scores.append((sb, 'S', '00', START_PAYOUT))
    pot -= START_PAYOUT

    for scorechange, row in enumerate(rows):
        cols = row.find_all('td')
        # The last two cells hold the running score after this play.
        score_home_new = int(cols[-2].text)
        score_away_new = int(cols[-1].text)

        # The first second-half row means the score carried into it was the
        # halftime score. Checking every later-quarter label (not just
        # '3rd') keeps the payout when nobody scores in the third quarter,
        # and the flag guarantees it is paid only once.
        quarter = cols[0].text.strip()
        if not halftime_paid and quarter in SECOND_HALF_QUARTERS:
            scores.append((sb, 'H',
                           pair(str(score_home)[-1], str(score_away)[-1]),
                           HALFTIME_PAYOUT))
            pot -= HALFTIME_PAYOUT
            halftime_paid = True

        # A 7-point jump is treated as two score changes: an intermediate
        # score one point lower is recorded first, then the full new score.
        if score_home_new - score_home == 7:
            intermediate = pair(str(score_home_new - 1)[-1],
                                str(score_away_new)[-1])
        elif score_away_new - score_away == 7:
            intermediate = pair(str(score_home_new)[-1],
                                str(score_away_new - 1)[-1])
        else:
            intermediate = None
        if intermediate is not None:
            scores.append((sb, str(scorechange + td_cnt + 1), intermediate,
                           SCORE_CHANGE_PAYOUT))
            pot -= SCORE_CHANGE_PAYOUT
            td_cnt += 1

        scores.append((sb, str(scorechange + td_cnt + 1),
                       pair(str(score_home_new)[-1], str(score_away_new)[-1]),
                       SCORE_CHANGE_PAYOUT))
        pot -= SCORE_CHANGE_PAYOUT

        score_home = score_home_new
        score_away = score_away_new

    if not halftime_paid:
        # No second-half score rows at all: the running score is also the
        # halftime score, so pay it now instead of folding it into the final.
        scores.append((sb, 'H',
                       pair(str(score_home)[-1], str(score_away)[-1]),
                       HALFTIME_PAYOUT))
        pot -= HALFTIME_PAYOUT

    # The final score takes whatever is left in the pot. The running totals
    # are used (not the per-row temporaries) so a table with no score rows
    # still yields a '00' final instead of a NameError.
    scores.append((sb, 'F',
                   pair(str(score_home)[-1], str(score_away)[-1]), pot))

    # Follow the page's "Next" link; when there is none, fall back to the
    # manually queued pages, and stop once those are exhausted too.
    next_link = soup.find('a', href=True, text="Next")
    if next_link:
        next_page = next_link.get('href')
    elif extra:
        next_page = extra.pop()
    else:
        next_page = None

with open('super_bowl_scores.csv', 'w') as out:
    csv_out = csv.writer(out)
    csv_out.writerow(['bowl_number', 'payout_reason', 'digits', 'payout'])
    csv_out.writerows(scores)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment