Created
February 26, 2024 01:08
-
-
Save tdsmith/60006dfc03cf63a5756c76048be8ad5e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "28a9d190-a0db-4587-8cb3-a771c37ba7b6", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from typing import Iterable, Mapping\n", | |
"\n", | |
"import attrs\n", | |
"from frozendict import frozendict\n", | |
"from selenium import webdriver\n", | |
"from selenium.webdriver.remote.webdriver import BaseWebDriver\n", | |
"from selenium.webdriver.common.by import By\n", | |
"from selenium.webdriver.support import expected_conditions\n", | |
"from selenium.webdriver.support.wait import WebDriverWait" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "3b50f217-0086-4656-ae4d-a637b2d03935", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"@attrs.define(frozen=True)\n", | |
"class CellValue:\n", | |
" text: str\n", | |
" href: str | None\n", | |
"\n", | |
"\n", | |
"@attrs.define()\n", | |
"class CouncilMeetingSite:\n", | |
" driver: BaseWebDriver\n", | |
" wait: WebDriverWait = attrs.field()\n", | |
"\n", | |
" @wait.default\n", | |
" def _default_wait(self) -> WebDriverWait:\n", | |
" return WebDriverWait(driver=self.driver, timeout=10)\n", | |
"\n", | |
" def __attrs_post_init__(self) -> None:\n", | |
" self.driver.get(\"https://covapp.vancouver.ca/councilMeetingPublic/\")\n", | |
"\n", | |
" def set_date_range(self, begin: str, end: str) -> None:\n", | |
" # Selector for the submit button to execute the date range query\n", | |
" SUBMIT_SELECTOR = 'input[type=\"submit\"][value=\"Display\"]'\n", | |
" \n", | |
" self.driver.find_element(By.LINK_TEXT, \"By Date\").click()\n", | |
" self.wait.until(expected_conditions.visibility_of_element_located((By.CSS_SELECTOR, SUBMIT_SELECTOR)))\n", | |
"\n", | |
" # detect when the table is rewritten by grabbing a reference to the first row\n", | |
" old_td = self.driver.find_element(By.CSS_SELECTOR, \"table.TableRecords tr td\")\n", | |
" \n", | |
" date_range_boxes = self.driver.find_elements(By.CSS_SELECTOR, 'input[placeholder=\"YYYY-MM-DD\"]')\n", | |
" date_range_boxes[0].click()\n", | |
" date_range_boxes[0].clear()\n", | |
" date_range_boxes[0].send_keys(begin)\n", | |
" date_range_boxes[1].click()\n", | |
" date_range_boxes[1].clear()\n", | |
" date_range_boxes[1].send_keys(end)\n", | |
"\n", | |
" self.driver.find_element(By.CSS_SELECTOR, SUBMIT_SELECTOR).click()\n", | |
" self.wait.until(expected_conditions.staleness_of(old_td))\n", | |
"\n", | |
" @property\n", | |
" def table_contents(self) -> list[Mapping[str, CellValue]]:\n", | |
" rows = self.driver.find_elements(By.CSS_SELECTOR, 'table.TableRecords tr')\n", | |
" header, *rows = rows\n", | |
" labels = [th.text for th in header.find_elements(By.TAG_NAME, \"th\")]\n", | |
" \n", | |
" table = []\n", | |
" for row in rows:\n", | |
" kvs = []\n", | |
" for label, td in zip(labels, row.find_elements(By.TAG_NAME, \"td\")):\n", | |
" href = None\n", | |
" if a := td.find_elements(By.TAG_NAME, \"a\"):\n", | |
" href = a[0].get_attribute(\"href\")\n", | |
" value = CellValue(td.text, href)\n", | |
" kvs.append((label, value))\n", | |
" table.append(frozendict(kvs))\n", | |
" \n", | |
" return table\n", | |
"\n", | |
" @property\n", | |
" def is_last_page(self) -> bool:\n", | |
" next_disabled = self.driver.find_elements(By.CSS_SELECTOR, \"nav.ListNavigation_Wrapper span.ListNavigation_DisabledNext\")\n", | |
" return bool(next_disabled)\n", | |
"\n", | |
" def next_page(self) -> None:\n", | |
" assert not self.is_last_page\n", | |
" \n", | |
" # detect when the table is rewritten by grabbing a reference to the first row\n", | |
" old_td = self.driver.find_element(By.CSS_SELECTOR, \"table.TableRecords tr td\")\n", | |
"\n", | |
" link = self.driver.find_element(By.CSS_SELECTOR, \"nav.ListNavigation_Wrapper a.ListNavigation_Next\")\n", | |
" link.click()\n", | |
"\n", | |
" self.wait.until(expected_conditions.staleness_of(old_td))\n", | |
"\n", | |
" def iter_pages(self) -> Iterable[None]:\n", | |
" yield\n", | |
" while not self.is_last_page:\n", | |
" self.next_page()\n", | |
" yield" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "e355d8b2-fd10-4f2d-a732-5efe8e8f2965", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"driver = webdriver.Chrome()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "4efe0a4e-28d9-4c70-a49e-442f5a4336c7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"site = CouncilMeetingSite(driver)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "b87c2fa1-f7e4-4bc6-aad7-bec3e31fed40", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"extracted = []\n", | |
"site.set_date_range(\"2024-01-01\", \"2024-02-29\")\n", | |
"for _ in site.iter_pages():\n", | |
" extracted.extend(site.table_contents)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "91f54fd3-594b-4561-849c-21da7bfd5625", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"driver.quit()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "abd2f171-ed6b-45d5-87ba-cd4a8af8287f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"extracted" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "50e44e77-bb35-4dcd-bf87-21681a56b9ca", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"len(set(extracted))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.12.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment