Created
February 7, 2022 15:35
-
-
Save mjdargen/ce051215c26637a65e3dd52fb78ffa7c to your computer and use it in GitHub Desktop.
Collects scores across NC FTC tournaments.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import requests | |
import pandas as pd | |
from bs4 import BeautifulSoup as bs | |
# Absolute directory containing this script; every CSV is read from and
# written to this directory.
DIR_PATH = os.path.dirname(os.path.realpath(__file__))
# retrieve soup for a page | |
# retrieve soup for a page
def retrieve_soup(url):
    """Fetch *url* and return its parsed BeautifulSoup tree.

    FIX: the original created a requests.Session and never closed it,
    leaking the session's connection pool on every call.
    """
    # The context manager guarantees the session's connections are released.
    with requests.Session() as session:
        response = session.get(url)
        return bs(response.content, "html.parser")
# retrieve all tables on a page | |
# retrieve all tables on a page
def retrieve_tables(soup):
    """Return every <table> element in the parsed page *soup*."""
    # Calling a soup object is BeautifulSoup's documented shorthand
    # for find_all().
    return soup("table")
# retrieve headers for a table | |
# retrieve headers for a table
def retrieve_headers(table):
    """Return the stripped text of each <th> in the table's first row."""
    first_row = table.find("tr")
    return [cell.text.strip() for cell in first_row.find_all("th")]
# retrieve table contents | |
# retrieve table contents
def retrieve_table(table):
    """Return the body rows of *table* as lists of stripped cell text.

    The first <tr> (the header row) is skipped. Rows that use <th>
    cells instead of <td> cells are still captured.
    """
    rows = []
    for row in table.find_all("tr")[1:]:
        # Prefer <td> cells; fall back to <th> cells when a body row
        # has none (find_all returns an empty, falsy list in that case).
        cells = row.find_all("td") or row.find_all("th")
        rows.append([cell.text.strip() for cell in cells])
    return rows
# save table as csv | |
# save table as csv
def save_as_csv(filename, headers, rows):
    """Write *rows* under *headers* to DIR_PATH/<filename>.csv.

    Only the first six columns are kept; the trailing columns on the
    rankings page carry no useful data.
    """
    headers = headers[:6]  # extra data for no reason
    rows = [row[:6] for row in rows]  # extra data for no reason
    df = pd.DataFrame(rows, columns=headers)
    # FIX: the destination path was garbled and ignored the filename
    # argument entirely; use it so each event gets its own CSV.
    df.to_csv(f"{DIR_PATH}/{filename}.csv", index=False)
# search page for tables and process | |
# search page for tables and process
def search_page(url):
    """Scrape every table at *url* and save the non-empty ones as CSVs."""
    # get the soup
    soup = retrieve_soup(url)
    # extract all the tables from the web page
    tables = retrieve_tables(soup)
    # iterate over all tables
    for i, table in enumerate(tables, start=1):
        # get the table headers
        headers = retrieve_headers(table)
        # get all the rows of the table
        rows = retrieve_table(table)
        # e.g. https://.../USNCGRQ/rankings -> "USNCGRQ_1"
        filename = f"{url.split('/')[-2]}_{i}"
        if rows:
            # FIX: the message previously printed a garbled placeholder
            # instead of the event/table identifier.
            print(f"Found rankings for {filename}")
            save_as_csv(filename, headers, rows)
# combine data across all tournaments | |
# combine data across all tournaments
def combine_data():
    """Merge every per-event CSV in DIR_PATH into a single nc.csv.

    Each event CSV contributes its 'Ranking Points' column (renamed to
    the event code) keyed on 'Team'; events a team did not attend are
    left blank in the combined table.
    """
    # Remove a stale combined file so it is not merged into itself.
    if os.path.exists(f'{DIR_PATH}/nc.csv'):
        os.remove(f'{DIR_PATH}/nc.csv')
    # FIX: endswith() is the correct test; the original substring check
    # ('.csv' in f) would also match names like 'foo.csv.bak'.
    csvs = [f for f in os.listdir(DIR_PATH) if f.endswith('.csv')]
    df = pd.DataFrame({})
    for csv in csvs:
        newDF = pd.read_csv(f'{DIR_PATH}/{csv}',
                            encoding='utf-8', thousands=',')
        newDF = newDF.drop(
            ['Tie Breaker 1', 'Tie Breaker 2', 'High Score', 'Rank'], axis=1)
        newDF['Ranking Points'] = newDF['Ranking Points'].astype(int)
        # Column becomes the event code, e.g. 'USNCGRQ_1.csv' -> 'USNCGRQ'.
        newDF.rename(
            columns={'Ranking Points': csv.split('_')[0]}, inplace=True)
        if df.empty:
            # FIX: DataFrame.append was removed in pandas 2.x; the first
            # event's table simply seeds the merge.
            df = newDF
        else:
            df = pd.merge(df, newDF, on='Team', how='outer')
    # FIX: fillna returns a new frame; the original discarded the result,
    # leaving NaN in the output.
    df = df.fillna(' ')
    df = df.reindex(['Team', 'USNCGRQ', 'USNCROQ', 'USNCREQ2',
                     'USNCSAQ', 'USNCREQ4', 'USNCELQ'], axis='columns')
    df.to_csv(f'{DIR_PATH}/nc.csv', index=False)
# main processing, loops through all tournaments | |
# main processing, loops through all tournaments
def main():
    """Scrape rankings for each NC qualifier, then build the combined CSV."""
    base = 'https://ftc-events.firstinspires.org/2021/'
    comps = ['USNCGRQ', 'USNCROQ', 'USNCREQ2',
             'USNCSAQ', 'USNCREQ4', 'USNCELQ']
    # Build each rankings URL on the fly rather than materializing a list.
    for comp in comps:
        search_page(base + comp + '/rankings')
    combine_data()
# Run the scraper only when executed as a script (not on import).
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment