Gold Coast Marathon Result Scraper (by OpenAI Codex)
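A small Python scraper that walks the paginated ASP.NET results listing for the Gold Coast Marathon 2024 (42KM) and writes every row to a CSV file. Note that by default it fetches only the first three pages; pass --max-pages 0 to fetch them all.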
#!/usr/bin/env python3
"""Scrape complete results of Gold Coast Marathon 2024 (42KM).

Usage:
    python scrape_gold_coast_marathon.py [--url URL] [--output OUTPUT] [--delay DELAY] [--max-pages N]

Requirements:
    requests, beautifulsoup4
"""
import argparse
import csv
import sys
import time

import requests
from bs4 import BeautifulSoup


def get_soup(session, url, data=None):
    """Fetch a page (GET, or POST when form data is given) and return a BeautifulSoup object."""
    # A 30-second timeout guards against a request hanging indefinitely.
    if data:
        response = session.post(url, data=data, timeout=30)
    else:
        response = session.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def parse_form_data(soup):
    """Extract hidden form inputs for the ASP.NET postback."""
    form = soup.find("form")
    data = {}
    if form:
        for inp in form.find_all("input", type="hidden"):
            name = inp.get("name")
            if name:
                data[name] = inp.get("value", "")
    return data


def parse_results_table(soup):
    """Parse the results table into a header list and a list of record dicts.

    Columns with an empty header (the per-runner detail-link column) are dropped.
    """
    table = soup.find("table", id="pagePlaceHolder_resTable")
    if table is None:
        return [], []
    rows = table.find_all("tr")
    header_cells = rows[0].find_all("th")
    keep_indices = []
    headers = []
    for idx, th in enumerate(header_cells):
        text = th.get_text(strip=True)
        if text:
            headers.append(text)
            keep_indices.append(idx)
    results = []
    for row in rows[1:]:
        cells = row.find_all("td")
        if not cells:
            continue
        record = {}
        for out_idx, cell_idx in enumerate(keep_indices):
            cell = cells[cell_idx]
            col = headers[out_idx]
            if col == "Country":
                # The country code is carried in the cell's title attribute, not its text.
                val = cell.get("title", "").strip()
            else:
                val = cell.get_text(strip=True)
            record[col] = val
        results.append(record)
    return headers, results


def has_next_page(soup):
    """Return True if the Next Page button exists and is enabled."""
    # Treating a missing button as "no next page" avoids looping forever
    # if the page layout changes.
    btn = soup.find(id="pagePlaceHolder_btnNextT")
    return btn is not None and not btn.has_attr("disabled")


def scrape_all_results(url, delay=1.0, max_pages=0):
    """Scrape result pages starting from url, stopping after max_pages if max_pages > 0."""
    session = requests.Session()
    soup = get_soup(session, url)
    form_data = parse_form_data(soup)
    headers, records = parse_results_table(soup)
    page = 1
    print(f"Fetched page {page}, rows={len(records)}", file=sys.stderr)
    while has_next_page(soup):
        if max_pages and page >= max_pages:
            break
        time.sleep(delay)
        # Simulate clicking the Next Page button via an ASP.NET postback.
        form_data["__EVENTTARGET"] = "ctl00$pagePlaceHolder$btnNextT"
        form_data["__EVENTARGUMENT"] = ""
        soup = get_soup(session, url, data=form_data)
        form_data = parse_form_data(soup)
        page += 1
        _, page_records = parse_results_table(soup)
        print(f"Fetched page {page}, rows={len(page_records)}", file=sys.stderr)
        if not page_records:
            break
        records.extend(page_records)
    return headers, records


def main():
    parser = argparse.ArgumentParser(
        description="Scrape Gold Coast Marathon 2024 results (42KM)."
    )
    parser.add_argument(
        "--url",
        default="https://results.timingsports.com/list/goldcoastmarathon/2024/42KM/",
        help="Results listing URL",
    )
    parser.add_argument(
        "--output",
        default="goldcoastmarathon_2024_42KM.csv",
        help="Output CSV file path",
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="Delay between page requests in seconds",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=3,
        help="Maximum number of pages to fetch (0 = unlimited)",
    )
    args = parser.parse_args()
    headers, records = scrape_all_results(
        args.url, delay=args.delay, max_pages=args.max_pages
    )
    if not records:
        print("No records found.", file=sys.stderr)
        sys.exit(1)
    # Save the scraped records to a CSV file.
    with open(args.output, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
        writer.writerows(records)
    print(f"Saved {len(records)} results to {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()
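
For programmatic use instead of the CLI, here is a minimal sketch, assuming the script above is saved as `scrape_gold_coast_marathon.py` in the current directory and that the column headers match the sample output below:

# Hypothetical usage example: the module name and the "Name" / "Finish Time"
# column keys are assumptions based on the script and its sample output.
from scrape_gold_coast_marathon import scrape_all_results

URL = "https://results.timingsports.com/list/goldcoastmarathon/2024/42KM/"

# Fetch the first two pages only, pausing one second between requests.
headers, records = scrape_all_results(URL, delay=1.0, max_pages=2)
print(headers)
for record in records[:3]:
    print(record["Name"], record["Finish Time"])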
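Sample output: the first ten finishers from the generated CSV.
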
| # | Bib | Name | Country | Finish Time | Place | Gender | G/Pl | Division | D/Pl |
|---|-------|-------------------------|---------|-------------|-------|--------|------|----------|------|
| 1 | 00005 | TIMOTHY KIPKORIR KATTAM | KEN | 02:08:52 | 1 | M | 1 | M30-34 | 1 |
| 2 | 00022 | BELAY TILAHUN | ETH | 02:08:58 | 2 | M | 2 | M25-29 | 1 |
| 3 | 00009 | KIYOSHI KOGA | JPN | 02:09:22 | 3 | M | 3 | M25-29 | 2 |
| 4 | 00012 | NAOKI AIBA | JPN | 02:10:28 | 4 | M | 4 | M30-34 | 2 |
| 5 | 00002 | FELIX KANDIE | KEN | 02:10:49 | 5 | M | 5 | M35-39 | 1 |
| 6 | 00020 | AKIHIRO KANEKO | JPN | 02:10:59 | 6 | M | 6 | M30-34 | 3 |
| 7 | 00007 | MIZUKI HIGASHI | JPN | 02:11:21 | 7 | M | 7 | M25-29 | 3 |
| 8 | 00028 | LIAM BOUDIN | AUS | 02:13:56 | 8 | M | 8 | M25-29 | 4 |
| 9 | 00011 | KENJI YAMAMOTO | JPN | 02:14:24 | 9 | M | 9 | M30-34 | 4 |
| 10 | 00021 | KEISUKE YOKOTA | JPN | 02:14:32 | 10 | M | 10 | M25-29 | 5 |