Skip to content

Instantly share code, notes, and snippets.

@prettyirrelevant
Last active November 9, 2022 14:38
Show Gist options
  • Save prettyirrelevant/d1ecdefd9abecff535bb5364cb148790 to your computer and use it in GitHub Desktop.
Save prettyirrelevant/d1ecdefd9abecff535bb5364cb148790 to your computer and use it in GitHub Desktop.
Scrapes INEC's website to retrieve all polling units in the country.
import asyncio
import json
from typing import Any, Dict, List, Tuple
import aiohttp
from bs4 import BeautifulSoup
async def main() -> None:
    """Scrape states, LGAs, wards and polling units, then save the polling units.

    The stages run in order because each one feeds the next: states are
    fetched first, then the LGAs of every state, then the wards of every LGA,
    and finally the polling units of every ward. The collected polling units
    are written to ``./polling_units.json`` and a summary is printed.
    """
    # NOTE(review): ssl=False is passed to the connector; per the aiohttp docs
    # this turns off TLS certificate verification - confirm this is intended.
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        states_information = await get_all_states(session)

        # Each per-state coroutine appends the LGAs it finds to this shared list.
        lgas = []
        await asyncio.gather(*(
            get_all_lgas_per_state(session=session, state=state, datastore=lgas)
            for state in states_information
        ))

        # Same pattern one level down: every LGA coroutine appends its wards here.
        wards = []
        await asyncio.gather(*(
            get_all_wards_per_lga(session=session, lga=lga, datastore=wards)
            for lga in lgas
        ))

        # Polling units are requested in batches of 1000 wards; each batch is
        # awaited before the next one is started.
        polling_units = []
        for ward_batch in chunks(wards, 1000):
            await asyncio.gather(*(
                get_all_polling_units_per_ward(session=session, ward=ward, datastore=polling_units)
                for ward in ward_batch
            ))

        # Persist the result so the scrape does not have to be repeated.
        with open('./polling_units.json', 'w') as f:
            json.dump(polling_units, f, indent=4)

        print('Wrote polling units to file successfully!')
        print(f'Summary:\n- States: {len(states_information)}\n- LGAs: {len(lgas)}\n- Wards: {len(wards)}\n- Polling Units: {len(polling_units)}')
async def get_all_states(session: aiohttp.ClientSession) -> List[Tuple[str, int]]:
    """Get all states from INEC website by scraping the provided url.

    Args:
        session: Open aiohttp session used to issue the GET request.

    Returns:
        A list of ``(state_name, state_id)`` tuples built from the children of
        the element whose id is ``statePoll``. The list is empty when the page
        does not answer with HTTP 200 or when that element is missing.
    """
    states_info: List[Tuple[str, int]] = []
    INEC_STATES_URL = 'https://www.inecnigeria.org/elections/polling-units'
    print(f'Retrieving all the states from {INEC_STATES_URL}')

    async with session.get(INEC_STATES_URL) as response:
        if response.status != 200:
            # Report the failure instead of silently claiming zero states.
            print(f'Unable to retrieve states from {INEC_STATES_URL}: HTTP {response.status}')
            return states_info
        content = await response.text()

    soup = BeautifulSoup(content, 'html.parser')
    states_container = soup.find(id='statePoll')
    if states_container is None:
        # find() returns None when nothing matches; without this guard the
        # loop below would fail with an AttributeError on ``.children``.
        print(f"Unable to find the 'statePoll' element in the page at {INEC_STATES_URL}")
        return states_info

    for entry in states_container.children:
        # Skip whitespace-only nodes between the entries.
        if str(entry).strip() == '':
            continue
        # Skip the placeholder entry, which does not describe a state.
        if entry.text == 'Choose State':
            continue
        states_info.append((entry.text, int(entry.attrs['value'])))

    print(f'Found {len(states_info)} states\n\n')
    return states_info
async def get_all_lgas_per_state(
    session: aiohttp.ClientSession,
    state: Tuple[str, int],
    datastore: List[Dict[str, Any]],
) -> None:
    """Retrieve all LGAs in a state using the INEC url provided.

    Args:
        session: Open aiohttp session used to issue the POST request.
        state: ``(state_name, state_id)`` tuple; the id is sent as ``state_id``.
        datastore: List that receives one dict per LGA with the keys
            ``lga_name``, ``lga_id`` and ``state``. It is mutated in place.

    Raises:
        ValueError: If the response body decodes to JSON ``null``.
    """
    INEC_STATE_LGAS_URL = 'https://www.inecnigeria.org/wp-content/themes/independent-national-electoral-commission/custom/views/lgaView.php'
    print(f'Retrieving all LGAs for {state[0]}({state[1]}) from {INEC_STATE_LGAS_URL}')

    async with session.post(INEC_STATE_LGAS_URL, data={'state_id': state[1]}) as response:
        if response.status != 200:
            # Report the failure instead of silently claiming zero LGAs.
            print(f'Unable to retrieve LGAs for {state[0]}({state[1]}): HTTP {response.status}')
            return
        raw_content = await response.text()

    formatted_content = json.loads(raw_content)
    if formatted_content is None:
        # Same guard the polling-unit scraper applies; gives a readable error
        # instead of an AttributeError on ``None``.
        raise ValueError(f'Unable to get LGAs for {state[0]}({state[1]}) due to: {raw_content}')

    # Only the values are used; the keys of the JSON object are ignored.
    found = [
        {
            'lga_name': lga_info['name'],
            'lga_id': lga_info['abbreviation'],
            'state': state,
        }
        for lga_info in formatted_content.values()
    ]
    datastore.extend(found)
    print(f'Found {len(found)} LGAs for {state[0]}({state[1]})\n')
async def get_all_wards_per_lga(
    session: aiohttp.ClientSession,
    lga: Dict[str, Any],
    datastore: List[Dict[str, Any]],
) -> None:
    """Retrieve all wards present in a LGA for a particular state using the INEC url provided.

    Args:
        session: Open aiohttp session used to issue the POST request.
        lga: Dict with the keys ``lga_name``, ``lga_id`` and ``state``, where
            ``state`` is a ``(state_name, state_id)`` tuple.
        datastore: List that receives one dict per ward with the keys
            ``ward_name``, ``ward_id`` and ``lga``. It is mutated in place.

    Raises:
        ValueError: If the response body decodes to JSON ``null``.
    """
    state = lga['state']
    lga_label = f'{lga["lga_name"]}({lga["lga_id"]})'
    state_label = f'{state[0]}({state[1]})'
    INEC_WARDS_URL = 'https://www.inecnigeria.org/wp-content/themes/independent-national-electoral-commission/custom/views/wardView.php'
    print(f'Retrieving all wards in LGA {lga_label} for {state_label} from {INEC_WARDS_URL}')

    payload = {'state_id': state[1], 'lga_id': lga['lga_id']}
    async with session.post(INEC_WARDS_URL, data=payload) as response:
        if response.status != 200:
            # Report the failure instead of silently claiming zero wards.
            print(f'Unable to retrieve wards in LGA {lga_label} for {state_label}: HTTP {response.status}')
            return
        raw_content = await response.text()

    formatted_content = json.loads(raw_content)
    if formatted_content is None:
        # Same guard the polling-unit scraper applies; gives a readable error
        # instead of an AttributeError on ``None``.
        raise ValueError(f'Unable to get wards in LGA {lga_label} for {state_label} due to: {raw_content}')

    # Only the values are used; the keys of the JSON object are ignored.
    found = [
        {
            'ward_name': ward_info['name'],
            'ward_id': ward_info['id'],
            'lga': lga,
        }
        for ward_info in formatted_content.values()
    ]
    datastore.extend(found)
    print(f'Found {len(found)} wards in LGA {lga_label} for {state_label}\n')
async def get_all_polling_units_per_ward(
    session: aiohttp.ClientSession,
    ward: Dict[str, Any],
    datastore: List[Dict[str, Any]],
) -> None:
    """Retrieve all polling units in a ward of a LGA.

    Args:
        session: Open aiohttp session used to issue the POST request.
        ward: Dict with the keys ``ward_name``, ``ward_id`` and ``lga``; the
            nested ``lga`` dict carries ``lga_name``, ``lga_id`` and ``state``,
            where ``state`` is a ``(state_name, state_id)`` tuple.
        datastore: List that receives one dict per polling unit with the keys
            ``polling_unit`` (the entry exactly as returned by the endpoint)
            and ``ward``. It is mutated in place.

    Raises:
        ValueError: If the response body decodes to JSON ``null``.
    """
    lga = ward['lga']
    state = lga['state']
    ward_label = f'{ward["ward_name"]}({ward["ward_id"]})'
    lga_label = f'{lga["lga_name"]}({lga["lga_id"]})'
    state_label = f'{state[0]}({state[1]})'
    INEC_POLLING_UNITS_URL = 'https://www.inecnigeria.org/wp-content/themes/independent-national-electoral-commission/custom/views/pollingView.php'
    print(f'Retrieving all polling units from {INEC_POLLING_UNITS_URL} \nWARD: {ward_label}\nLGA: {lga_label}\nSTATE: {state_label}\n')

    payload = {
        'state_id': state[1],
        'lga_id': lga['lga_id'],
        'ward_id': ward['ward_id'],
    }
    async with session.post(INEC_POLLING_UNITS_URL, data=payload) as response:
        if response.status != 200:
            # Report the failure instead of silently claiming zero polling units.
            print(f'Unable to retrieve polling units in WARD: {ward_label} for LGA: {lga_label} in STATE: {state_label}: HTTP {response.status}')
            return
        raw_content = await response.text()

    formatted_content = json.loads(raw_content)
    if formatted_content is None:
        # ValueError is still an Exception, so callers that catch the
        # previously raised Exception keep working.
        raise ValueError(
            f'Unable to get polling units in WARD: {ward_label} for LGA: {lga_label} '
            f'in STATE: {state_label} due to: {raw_content}'
        )

    # Only the values are used; the keys of the JSON object are ignored.
    found = [
        {
            'polling_unit': polling_unit_info,
            'ward': ward,
        }
        for polling_unit_info in formatted_content.values()
    ]
    datastore.extend(found)
    print(f'Found {len(found)} polling units in WARD: {ward_label} for LGA: {lga_label} in STATE: {state_label}\n')
def chunks(lst: List[Any], n: int):
    """Yield successive n-sized chunks from lst.

    The last chunk holds the remaining items and may be shorter than ``n``.
    An empty ``lst`` yields nothing.

    Args:
        lst: Sequence to split; it is sliced, not modified.
        n: Maximum number of items per chunk.

    Yields:
        Consecutive slices of ``lst``, each at most ``n`` items long.

    Raises:
        ValueError: If ``n`` is not positive. As this is a generator, the
            error surfaces when iteration starts, not when it is created.
    """
    if n <= 0:
        # A negative step would make range() empty and silently drop every
        # item; zero is already rejected by range() itself.
        raise ValueError(f'chunk size must be a positive integer, got {n}')
    for start in range(0, len(lst), n):
        yield lst[start:start + n]
# Start the scraper only when this file is executed as a script, so importing
# it does not trigger a full scrape. asyncio.run() creates a fresh event loop
# and closes it afterwards, replacing the deprecated pattern of calling
# asyncio.get_event_loop() when no loop is running.
if __name__ == '__main__':
    asyncio.run(main())
@Tee-py
Copy link

Tee-py commented Nov 4, 2022

LGTM 🌚

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment