Created
February 7, 2022 15:35
-
-
Save mjdargen/ce051215c26637a65e3dd52fb78ffa7c to your computer and use it in GitHub Desktop.
Collects scores across NC FTC tournaments.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import requests | |
import pandas as pd | |
from bs4 import BeautifulSoup as bs | |
# Absolute directory containing this script; every CSV is read from and
# written to this directory.
DIR_PATH = os.path.dirname(os.path.realpath(__file__))
# retrieve soup for a page | |
# retrieve soup for a page
def retrieve_soup(url):
    """Fetch *url* and return its parsed BeautifulSoup tree.

    FIX: the original created a requests.Session and never closed it,
    leaking the session's connection pool on every call.
    """
    # The context manager guarantees the session's connections are released.
    with requests.Session() as session:
        response = session.get(url)
        return bs(response.content, "html.parser")
# retrieve all tables on a page | |
# retrieve all tables on a page
def retrieve_tables(soup):
    """Return every <table> element in the parsed page *soup*."""
    # Calling a soup object is BeautifulSoup's documented shorthand
    # for find_all().
    return soup("table")
# retrieve headers for a table | |
# retrieve headers for a table
def retrieve_headers(table):
    """Return the stripped text of each <th> in the table's first row."""
    first_row = table.find("tr")
    return [cell.text.strip() for cell in first_row.find_all("th")]
# retrieve table contents | |
# retrieve table contents
def retrieve_table(table):
    """Return the body rows of *table* as lists of stripped cell text.

    The first <tr> (the header row) is skipped. Rows that use <th>
    cells instead of <td> cells are still captured.
    """
    rows = []
    for row in table.find_all("tr")[1:]:
        # Prefer <td> cells; fall back to <th> cells when a body row
        # has none (find_all returns an empty, falsy list in that case).
        cells = row.find_all("td") or row.find_all("th")
        rows.append([cell.text.strip() for cell in cells])
    return rows
# save table as csv | |
# save table as csv
def save_as_csv(filename, headers, rows):
    """Write *rows* under *headers* to DIR_PATH/<filename>.csv.

    Only the first six columns are kept; the trailing columns on the
    rankings page carry no useful data.
    """
    headers = headers[:6]  # extra data for no reason
    rows = [row[:6] for row in rows]  # extra data for no reason
    df = pd.DataFrame(rows, columns=headers)
    # FIX: the destination path was garbled and ignored the filename
    # argument entirely; use it so each event gets its own CSV.
    df.to_csv(f"{DIR_PATH}/{filename}.csv", index=False)
# search page for tables and process | |
# search page for tables and process
def search_page(url):
    """Scrape every table at *url* and save the non-empty ones as CSVs."""
    # get the soup
    soup = retrieve_soup(url)
    # extract all the tables from the web page
    tables = retrieve_tables(soup)
    # iterate over all tables
    for i, table in enumerate(tables, start=1):
        # get the table headers
        headers = retrieve_headers(table)
        # get all the rows of the table
        rows = retrieve_table(table)
        # e.g. https://.../USNCGRQ/rankings -> "USNCGRQ_1"
        filename = f"{url.split('/')[-2]}_{i}"
        if rows:
            # FIX: the message previously printed a garbled placeholder
            # instead of the event/table identifier.
            print(f"Found rankings for {filename}")
            save_as_csv(filename, headers, rows)
# combine data across all tournaments | |
# combine data across all tournaments
def combine_data():
    """Merge every per-event CSV in DIR_PATH into a single nc.csv.

    Each event CSV contributes its 'Ranking Points' column (renamed to
    the event code) keyed on 'Team'; events a team did not attend are
    left blank in the combined table.
    """
    # Remove a stale combined file so it is not merged into itself.
    if os.path.exists(f'{DIR_PATH}/nc.csv'):
        os.remove(f'{DIR_PATH}/nc.csv')
    # FIX: endswith() is the correct test; the original substring check
    # ('.csv' in f) would also match names like 'foo.csv.bak'.
    csvs = [f for f in os.listdir(DIR_PATH) if f.endswith('.csv')]
    df = pd.DataFrame({})
    for csv in csvs:
        newDF = pd.read_csv(f'{DIR_PATH}/{csv}',
                            encoding='utf-8', thousands=',')
        newDF = newDF.drop(
            ['Tie Breaker 1', 'Tie Breaker 2', 'High Score', 'Rank'], axis=1)
        newDF['Ranking Points'] = newDF['Ranking Points'].astype(int)
        # Column becomes the event code, e.g. 'USNCGRQ_1.csv' -> 'USNCGRQ'.
        newDF.rename(
            columns={'Ranking Points': csv.split('_')[0]}, inplace=True)
        if df.empty:
            # FIX: DataFrame.append was removed in pandas 2.x; the first
            # event's table simply seeds the merge.
            df = newDF
        else:
            df = pd.merge(df, newDF, on='Team', how='outer')
    # FIX: fillna returns a new frame; the original discarded the result,
    # leaving NaN in the output.
    df = df.fillna(' ')
    df = df.reindex(['Team', 'USNCGRQ', 'USNCROQ', 'USNCREQ2',
                     'USNCSAQ', 'USNCREQ4', 'USNCELQ'], axis='columns')
    df.to_csv(f'{DIR_PATH}/nc.csv', index=False)
# main processing, loops through all tournaments | |
# main processing, loops through all tournaments
def main():
    """Scrape rankings for each NC qualifier, then build the combined CSV."""
    base = 'https://ftc-events.firstinspires.org/2021/'
    comps = ['USNCGRQ', 'USNCROQ', 'USNCREQ2',
             'USNCSAQ', 'USNCREQ4', 'USNCELQ']
    # Build each rankings URL on the fly rather than materializing a list.
    for comp in comps:
        search_page(base + comp + '/rankings')
    combine_data()
# Run the scraper only when executed as a script (not on import).
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment