@mjdargen
Created February 7, 2022 15:35
collects scores across NC FTC tournaments
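"""
Scrapes the rankings table for each of the 2021 NC FTC tournaments listed in
main() from ftc-events.firstinspires.org, saves every rankings table it finds
as a CSV next to this script, then combines the ranking points for all
tournaments into a single nc.csv keyed by team.
"""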
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
DIR_PATH = os.path.dirname(os.path.realpath(__file__))

# retrieve soup for a page
def retrieve_soup(url):
    # initialize a session
    session = requests.Session()
    # make the request
    html = session.get(url)
    # return the soup
    return bs(html.content, "html.parser")

# retrieve all tables on a page
def retrieve_tables(soup):
    return soup.find_all("table")

# retrieve headers for a table
def retrieve_headers(table):
    headers = []
    for th in table.find("tr").find_all("th"):
        headers.append(th.text.strip())
    return headers

# retrieve table contents
def retrieve_table(table):
    rows = []
    for tr in table.find_all("tr")[1:]:
        cells = []
        # grab all td tags in this table row
        tds = tr.find_all("td")
        if len(tds) == 0:
            # if no td tags, search for th tags
            ths = tr.find_all("th")
            for th in ths:
                cells.append(th.text.strip())
        else:
            # use regular td tags
            for td in tds:
                cells.append(td.text.strip())
        rows.append(cells)
    return rows

# save table as csv
def save_as_csv(filename, headers, rows):
    # keep only the first six columns; the table includes extra columns we don't need
    headers = headers[:6]
    rows = [row[:6] for row in rows]
    df = pd.DataFrame(rows, columns=headers)
    df.to_csv(f"{DIR_PATH}/{filename}.csv", index=False)

# search page for tables and process
def search_page(url):
    # get the soup
    soup = retrieve_soup(url)
    # extract all the tables from the web page
    tables = retrieve_tables(soup)
    # iterate over all tables
    for i, table in enumerate(tables, start=1):
        # get the table headers
        headers = retrieve_headers(table)
        # get all the rows of the table
        rows = retrieve_table(table)
        # save table as csv file
        filename = f"{url.split('/')[-2]}_{i}"
        if rows:
            print(f"Found rankings for {filename}")
            save_as_csv(filename, headers, rows)

# combine data across all tournaments
def combine_data():
    # remove any previous combined file so it is not read back in below
    if os.path.exists(f'{DIR_PATH}/nc.csv'):
        os.remove(f'{DIR_PATH}/nc.csv')
    csvs = [f for f in os.listdir(DIR_PATH) if '.csv' in f]
    df = pd.DataFrame({})
    for csv in csvs:
        newDF = pd.read_csv(f'{DIR_PATH}/{csv}',
                            encoding='utf-8', thousands=',')
        newDF = newDF.drop(
            ['Tie Breaker 1', 'Tie Breaker 2', 'High Score', 'Rank'], axis=1)
        newDF['Ranking Points'] = newDF['Ranking Points'].astype(int)
        # name the score column after the tournament code in the filename
        newDF.rename(
            columns={'Ranking Points': csv.split('_')[0]}, inplace=True)
        if df.empty:
            # first tournament seeds the combined table
            df = newDF
        else:
            # later tournaments are merged in on the team number
            df = pd.merge(df, newDF, on='Team', how='outer')
    # blank out missing scores and put the columns in tournament order
    df = df.fillna(' ')
    df = df.reindex(['Team', 'USNCGRQ', 'USNCROQ', 'USNCREQ2',
                     'USNCSAQ', 'USNCREQ4', 'USNCELQ'], axis='columns')
    df.to_csv(f'{DIR_PATH}/nc.csv', index=False)

# main processing, loops through all tournaments
def main():
    base = 'https://ftc-events.firstinspires.org/2021/'
    comps = ['USNCGRQ', 'USNCROQ', 'USNCREQ2',
             'USNCSAQ', 'USNCREQ4', 'USNCELQ']
    urls = [base + comp + '/rankings' for comp in comps]
    for url in urls:
        search_page(url)
    combine_data()


if __name__ == "__main__":
    main()
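
Usage note: the script assumes Python 3 with the requests, beautifulsoup4, and pandas packages installed. Running it writes one CSV per rankings table it finds (named after the tournament code and table index, of the form USNCGRQ_1.csv) into the script's directory, then the combined nc.csv.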