Gold Coast Marathon Result Scraper (by OpenAI Codex)
#!/usr/bin/env python3
"""Scrape the complete results of the Gold Coast Marathon 2024 (42KM).

Usage:
    python scrape_gold_coast_marathon.py [--url URL] [--output OUTPUT] [--delay DELAY] [--max-pages N]

Requirements:
    requests, beautifulsoup4
"""
import argparse
import csv
import sys
import time

import requests
from bs4 import BeautifulSoup


def get_soup(session, url, data=None):
    """Fetch page content and return a BeautifulSoup object."""
    # A 30-second timeout keeps a stalled connection from hanging the scraper.
    if data:
        response = session.post(url, data=data, timeout=30)
    else:
        response = session.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def parse_form_data(soup):
    """Extract the hidden form inputs needed for the ASP.NET postback."""
    form = soup.find("form")
    data = {}
    if form:
        for inp in form.find_all("input", type="hidden"):
            name = inp.get("name")
            if name:
                data[name] = inp.get("value", "")
    return data
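

# A minimal sketch of what parse_form_data typically returns on an ASP.NET
# WebForms page. The field names (__VIEWSTATE and friends) are standard for
# that framework; the values below are hypothetical placeholders, not tokens
# captured from the results site:
#
#   {
#       "__VIEWSTATE": "dDwtMTI3OTMzNDM4NDs...",
#       "__VIEWSTATEGENERATOR": "CA0B0334",
#       "__EVENTVALIDATION": "/wEWAgL...",
#   }
#
# Re-posting these values with __EVENTTARGET set (see scrape_all_results)
# is what makes the server treat the request as a button click.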


def parse_results_table(soup):
    """Parse the results table into a header list and a list of record dicts,
    dropping the unnamed detail-link column."""
    table = soup.find("table", id="pagePlaceHolder_resTable")
    if table is None:
        return [], []
    rows = table.find_all("tr")
    header_cells = rows[0].find_all("th")
    keep_indices = []
    headers = []
    for idx, th in enumerate(header_cells):
        text = th.get_text(strip=True)
        if text:
            headers.append(text)
            keep_indices.append(idx)
    results = []
    for row in rows[1:]:
        cells = row.find_all("td")
        if not cells:
            continue
        record = {}
        for out_idx, cell_idx in enumerate(keep_indices):
            cell = cells[cell_idx]
            col = headers[out_idx]
            if col == "Country":
                # The country cell carries its value in the title attribute.
                val = cell.get("title", "").strip()
            else:
                val = cell.get_text(strip=True)
            record[col] = val
        results.append(record)
    return headers, results
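

# For illustration, the record parsed from the first finisher in the sample
# output below (assuming the live table uses those same column headers)
# would look like:
#
#   {"#": "1", "Bib": "00005", "Name": "TIMOTHY KIPKORIR KATTAM",
#    "Country": "KEN", "Finish Time": "02:08:52", "Place": "1",
#    "Gender": "M", "G/Pl": "1", "Division": "M30-34", "D/Pl": "1"}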


def has_next_page(soup):
    """Return True if the Next Page button exists and is enabled."""
    btn = soup.find(id="pagePlaceHolder_btnNextT")
    # Treat a missing button the same as a disabled one, so a page-layout
    # change cannot send the scraper into an endless postback loop.
    return btn is not None and not btn.has_attr("disabled")
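

# On the last page the button is rendered with a disabled attribute,
# roughly like this (a sketch, not markup captured from the site):
#
#   <input type="submit" id="pagePlaceHolder_btnNextT"
#          name="ctl00$pagePlaceHolder$btnNextT" disabled="disabled" />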


def scrape_all_results(url, delay=1.0, max_pages=0):
    """Scrape all result pages starting from URL (stop after max_pages if > 0)."""
    session = requests.Session()
    soup = get_soup(session, url)
    form_data = parse_form_data(soup)
    headers, records = parse_results_table(soup)
    page = 1
    print(f"Fetched page {page}, rows={len(records)}", file=sys.stderr)
    while has_next_page(soup):
        if max_pages and page >= max_pages:
            break
        time.sleep(delay)
        # Simulate clicking the Next Page button via an ASP.NET postback.
        form_data["__EVENTTARGET"] = "ctl00$pagePlaceHolder$btnNextT"
        form_data["__EVENTARGUMENT"] = ""
        soup = get_soup(session, url, data=form_data)
        form_data = parse_form_data(soup)
        page += 1
        _, page_records = parse_results_table(soup)
        print(f"Fetched page {page}, rows={len(page_records)}", file=sys.stderr)
        if not page_records:
            break
        records.extend(page_records)
    return headers, records
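

# Example (hypothetical) programmatic use, e.g. from a REPL:
#
#   headers, records = scrape_all_results(
#       "https://results.timingsports.com/list/goldcoastmarathon/2024/42KM/",
#       delay=2.0,
#       max_pages=0,  # 0 = no page limit
#   )
#   # headers -> ['#', 'Bib', 'Name', 'Country', 'Finish Time', ...]
#   # records -> one dict per finisher, keyed by those headers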


def main():
    parser = argparse.ArgumentParser(
        description="Scrape Gold Coast Marathon 2024 results (42KM)."
    )
    parser.add_argument(
        "--url",
        default="https://results.timingsports.com/list/goldcoastmarathon/2024/42KM/",
        help="Results listing URL",
    )
    parser.add_argument(
        "--output",
        default="goldcoastmarathon_2024_42KM.csv",
        help="Output CSV file path",
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="Delay between page requests in seconds",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=3,
        help="Maximum number of pages to fetch (0 = unlimited)",
    )
    args = parser.parse_args()
    headers, records = scrape_all_results(
        args.url, delay=args.delay, max_pages=args.max_pages
    )
    if not records:
        print("No records found.", file=sys.stderr)
        sys.exit(1)
    # Write the collected results to a CSV file.
    with open(args.output, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
        writer.writerows(records)
    print(f"Saved {len(records)} results to {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()
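
A typical invocation (note that --max-pages defaults to 3 as a safety cap; pass 0 to walk every page):

    python scrape_gold_coast_marathon.py --max-pages 0 --delay 2.0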
Sample output (first 10 rows of goldcoastmarathon_2024_42KM.csv):

    #    Bib    Name                     Country  Finish Time  Place  Gender  G/Pl  Division  D/Pl
    1    00005  TIMOTHY KIPKORIR KATTAM  KEN      02:08:52     1      M       1     M30-34    1
    2    00022  BELAY TILAHUN            ETH      02:08:58     2      M       2     M25-29    1
    3    00009  KIYOSHI KOGA             JPN      02:09:22     3      M       3     M25-29    2
    4    00012  NAOKI AIBA               JPN      02:10:28     4      M       4     M30-34    2
    5    00002  FELIX KANDIE             KEN      02:10:49     5      M       5     M35-39    1
    6    00020  AKIHIRO KANEKO           JPN      02:10:59     6      M       6     M30-34    3
    7    00007  MIZUKI HIGASHI           JPN      02:11:21     7      M       7     M25-29    3
    8    00028  LIAM BOUDIN              AUS      02:13:56     8      M       8     M25-29    4
    9    00011  KENJI YAMAMOTO           JPN      02:14:24     9      M       9     M30-34    4
    10   00021  KEISUKE YOKOTA           JPN      02:14:32     10     M       10    M25-29    5