Last active
November 9, 2022 14:38
-
-
Save prettyirrelevant/d1ecdefd9abecff535bb5364cb148790 to your computer and use it in GitHub Desktop.
Scrapes INEC's website to retrieve all polling units in the country.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import json | |
from typing import Any, Dict, List, Tuple | |
import aiohttp | |
from bs4 import BeautifulSoup | |
async def main() -> None:
    """Scrape states, LGAs, wards and polling units, then save the polling units as JSON.

    Each level is fetched concurrently with ``asyncio.gather``; the fetchers
    append their results to the list handed to them as ``datastore``. The
    collected polling units are written to ``./polling_units.json`` and a
    short summary of the counts is printed.
    """
    # NOTE(review): ssl=False is passed to the connector — per aiohttp this
    # turns off certificate verification; confirm this is still required.
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        states_information = await get_all_states(session)

        # Local government areas, one dict per LGA, filled in by the fetchers.
        lgas = []
        await asyncio.gather(*(
            get_all_lgas_per_state(session=session, state=state, datastore=lgas)
            for state in states_information
        ))

        # Wards, one dict per ward, each carrying a reference to its LGA dict.
        wards = []
        await asyncio.gather(*(
            get_all_wards_per_lga(session=session, lga=lga, datastore=wards)
            for lga in lgas
        ))

        polling_units = []
        # Wards are handled in batches of 1000: each batch is gathered to
        # completion before the next batch of requests is created.
        for ward_batch in chunks(wards, 1000):
            await asyncio.gather(*(
                get_all_polling_units_per_ward(session=session, ward=ward, datastore=polling_units)
                for ward in ward_batch
            ))

        # Persist the result so the scrape does not have to be repeated.
        with open('./polling_units.json', 'w') as f:
            json.dump(polling_units, f, indent=4)

        print('Wrote polling units to file successfully!')
        print(f'Summary:\n- States: {len(states_information)}\n- LGAs: {len(lgas)}\n- Wards: {len(wards)}\n- Polling Units: {len(polling_units)}')
async def get_all_states(session: aiohttp.ClientSession) -> List[Tuple[str, int]]:
    """Fetch the INEC polling-units page and extract the states listed on it.

    The page is parsed with BeautifulSoup and the children of the element
    whose id is ``statePoll`` are read. Whitespace-only children and the
    ``Choose State`` placeholder are skipped.

    Args:
        session: aiohttp client session used to issue the GET request.

    Returns:
        A list of ``(state name, state id)`` tuples, where the id is the
        entry's ``value`` attribute converted to ``int``. The list is empty
        when the response status is not 200.
    """
    INEC_STATES_URL = 'https://www.inecnigeria.org/elections/polling-units'
    print(f'Retrieving all the states from {INEC_STATES_URL}')

    states_info = []
    async with session.get(INEC_STATES_URL) as response:
        if response.status == 200:
            page = BeautifulSoup(await response.text(), 'html.parser')
            states_container = page.find(id='statePoll')
            # The blank check comes first so that whitespace-only children
            # are dropped before ``.text`` / ``.attrs`` are touched.
            states_info.extend(
                (option.text, int(option.attrs['value']))
                for option in states_container.children
                if str(option).strip() != '' and option.text != 'Choose State'
            )

    print(f'Found {len(states_info)} states\n\n')
    return states_info
async def get_all_lgas_per_state(
    session: aiohttp.ClientSession,
    state: Tuple[str, int],
    datastore: List[Dict[str, Any]],
) -> None:
    """Fetch the LGAs of one state and append them to ``datastore``.

    POSTs the state id to INEC's ``lgaView.php`` endpoint and decodes the
    response body as JSON. For every value in the decoded mapping a dict
    with the keys ``lga_name``, ``lga_id`` and ``state`` is appended.

    Args:
        session: aiohttp client session used to issue the POST request.
        state: ``(state name, state id)`` tuple.
        datastore: list that receives one dict per LGA; mutated in place.
    """
    INEC_STATE_LGAS_URL = 'https://www.inecnigeria.org/wp-content/themes/independent-national-electoral-commission/custom/views/lgaView.php'
    state_label = f'{state[0]}({state[1]})'
    print(f'Retrieving all LGAs for {state_label} from {INEC_STATE_LGAS_URL}')

    lgas_count = 0
    async with session.post(INEC_STATE_LGAS_URL, data={'state_id': state[1]}) as response:
        if response.status == 200:
            lgas_by_key = json.loads(await response.text())
            # Only the values are needed; the mapping's keys are ignored.
            for lga_info in lgas_by_key.values():
                datastore.append({
                    'lga_name': lga_info['name'],
                    'lga_id': lga_info['abbreviation'],
                    'state': state,
                })
            lgas_count = len(lgas_by_key)

    print(f'Found {lgas_count} LGAs for {state_label}\n')
async def get_all_wards_per_lga(
    session: aiohttp.ClientSession,
    lga: Dict[str, Any],
    datastore: List[Dict[str, Any]],
) -> None:
    """Fetch the wards of one LGA and append them to ``datastore``.

    POSTs the state id and LGA id to INEC's ``wardView.php`` endpoint and
    decodes the response body as JSON. For every value in the decoded
    mapping a dict with the keys ``ward_name``, ``ward_id`` and ``lga`` is
    appended.

    Args:
        session: aiohttp client session used to issue the POST request.
        lga: LGA dict; its ``lga_name``, ``lga_id`` and ``state`` keys are read.
        datastore: list that receives one dict per ward; mutated in place.
    """
    INEC_WARDS_URL = 'https://www.inecnigeria.org/wp-content/themes/independent-national-electoral-commission/custom/views/wardView.php'
    state = lga['state']
    lga_label = f'{lga["lga_name"]}({lga["lga_id"]})'
    state_label = f'{state[0]}({state[1]})'
    print(f'Retrieving all wards in LGA {lga_label} for {state_label}from {INEC_WARDS_URL}')

    form = {'state_id': state[1], 'lga_id': lga['lga_id']}
    wards_count = 0
    async with session.post(INEC_WARDS_URL, data=form) as response:
        if response.status == 200:
            wards_by_key = json.loads(await response.text())
            # Only the values are needed; the mapping's keys are ignored.
            for ward_info in wards_by_key.values():
                datastore.append({
                    'ward_name': ward_info['name'],
                    'ward_id': ward_info['id'],
                    'lga': lga,
                })
            wards_count = len(wards_by_key)

    print(f'Found {wards_count} wards in LGA {lga_label} for {state_label}\n')
async def get_all_polling_units_per_ward(
    session: aiohttp.ClientSession,
    ward: Dict[str, Any],
    datastore: List[Dict[str, Any]],
) -> None:
    """Fetch the polling units of one ward and append them to ``datastore``.

    POSTs the state id, LGA id and ward id to INEC's ``pollingView.php``
    endpoint and decodes the response body as JSON. For every value in the
    decoded mapping a dict with the keys ``polling_unit`` and ``ward`` is
    appended.

    Args:
        session: aiohttp client session used to issue the POST request.
        ward: ward dict; its ``ward_name``, ``ward_id`` and ``lga`` keys are
            read, and ``lga`` must in turn hold ``lga_name``, ``lga_id`` and
            ``state``.
        datastore: list that receives one dict per polling unit; mutated in
            place.

    Raises:
        Exception: if the response body decodes to JSON ``null``.
    """
    INEC_POLLING_UNITS_URL = 'https://www.inecnigeria.org/wp-content/themes/independent-national-electoral-commission/custom/views/pollingView.php'
    lga = ward['lga']
    state = lga['state']
    ward_label = f'{ward["ward_name"]}({ward["ward_id"]})'
    lga_label = f'{lga["lga_name"]}({lga["lga_id"]})'
    state_label = f'{state[0]}({state[1]})'
    print(f'Retrieving all polling units from {INEC_POLLING_UNITS_URL} \nWARD: {ward_label}\nLGA: {lga_label}\nSTATE: {state_label}\n')

    payload = {
        'state_id': state[1],
        'lga_id': lga['lga_id'],
        'ward_id': ward['ward_id'],
    }

    polling_units_count = 0
    async with session.post(INEC_POLLING_UNITS_URL, data=payload) as response:
        if response.status != 200:
            # Report the failure instead of dropping it silently, so a
            # missing ward can be told apart from a ward with no units.
            print(f'Request for polling units in WARD: {ward_label} for LGA: {lga_label} in STATE: {state_label} failed with HTTP status {response.status}\n')
        else:
            raw_content = await response.text()
            formatted_content = json.loads(raw_content)
            if formatted_content is None:
                raise Exception(
                    f'Unable to get polling units for WARD: {ward_label} '
                    f'in LGA: {lga_label}, STATE: {state_label}; response was: {raw_content}'
                )

            # Only the values are needed; the mapping's keys are ignored.
            for polling_unit_info in formatted_content.values():
                datastore.append({
                    'polling_unit': polling_unit_info,
                    'ward': ward,
                })
            polling_units_count = len(formatted_content)

    print(f'Found {polling_units_count} polling units in WARD: {ward_label} for LGA: {lga_label} in STATE: {state_label}\n')
def chunks(lst, n):
    """Yield successive n-sized chunks from lst.

    Args:
        lst: a sequence supporting ``len()`` and slicing (list, tuple, str, ...).
        n: maximum chunk length; must be a positive integer.

    Yields:
        Consecutive slices of ``lst``; the last one may be shorter than ``n``.

    Raises:
        ValueError: if ``n`` is zero or negative. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make the range empty and silently drop every
    # element, so reject it explicitly alongside zero.
    if n <= 0:
        raise ValueError(f'chunk size must be a positive integer, got {n!r}')
    for start in range(0, len(lst), n):
        yield lst[start:start + n]
if __name__ == '__main__':
    # asyncio.run() creates a fresh event loop, runs main() to completion and
    # closes the loop afterwards. It replaces asyncio.get_event_loop(), which
    # is deprecated when no loop is running. The guard keeps the scrape from
    # starting when this module is merely imported.
    asyncio.run(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
LGTM 🌚