Created December 9, 2022 09:22
Save castdrian/6c6155ecabeb44b1ab53bd7d9fc19342 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import re | |
import requests | |
import bs4 | |
GEN_IX_URL = 'https://bulbapedia.bulbagarden.net/wiki/Category:Generation_IX_Pok%C3%A9mon' | |
def fetch_mon_urls():
    """Return absolute Bulbapedia URLs for every Generation IX Pokémon page.

    Fetches the Generation IX category listing and collects the href of each
    anchor inside the ``mw-pages`` section of the generated category div.
    """
    response = requests.get(GEN_IX_URL)
    response.raise_for_status()
    page = bs4.BeautifulSoup(response.text, 'html.parser')
    # The category listing lives in <div class="mw-category-generated">
    # under the inner <div id="mw-pages">.
    listing = page.find('div', class_='mw-category-generated').find('div', id='mw-pages')
    return [
        'https://bulbapedia.bulbagarden.net' + anchor.get('href')
        for anchor in listing.find_all('a')
    ]
def get_bulba_data(url):
    """Download *url* and return the page parsed as a BeautifulSoup document.

    Raises ``requests.HTTPError`` (via ``raise_for_status``) on a bad response.
    """
    response = requests.get(url)
    response.raise_for_status()
    return bs4.BeautifulSoup(response.text, 'html.parser')
def parse_bulba_data(soup):
    """Extract species data from a parsed Bulbapedia Pokémon page.

    Reads the first ``roundy`` info table and returns a dict with the species
    name, gender ratio, EV yields, egg availability, catch rate, levelling
    rate and minimum hatch time.
    """
    # The first "roundy" table on the page is the species info box.
    info_table = soup.find_all('table', class_='roundy')[0]

    # Catch-rate cell text looks like e.g. "45 (11.9%)".
    catch_rate_link = info_table.find_all('a', href='/wiki/Catch_rate')
    merged_catch_rate = catch_rate_link[0].parent.next_sibling.next_sibling.text

    gender_link = info_table.find_all('a', href='/wiki/List_of_Pok%C3%A9mon_by_gender_ratio')
    merged_gender_ratio = gender_link[0].parent.next_sibling.next_sibling.text
    # Percentages look like "87.5% male, 12.5% female"; default to 0%/0% for
    # genderless species where no percentage appears in the cell.
    percentages = re.findall(r'\d+\.?\d*%', merged_gender_ratio) or ['0%', '0%']
    if len(percentages) == 1:
        # Single-gender species: decide which slot the lone percentage fills.
        # BUG FIX: use a word-boundary match — the plain substring test
        # `'male' in text` is also true for "female", which mislabelled
        # female-only species as male-only.
        if re.search(r'\bmale\b', merged_gender_ratio):
            percentages.append('0%')     # male-only: female share is 0%
        else:
            percentages.insert(0, '0%')  # female-only: male share is 0%

    egg_link = info_table.find_all('a', href='/wiki/Pok%C3%A9mon_breeding')
    merged_steps = egg_link[0].parent.next_sibling.next_sibling.text
    no_group = info_table.find_all('a', href='/wiki/No_Eggs_Discovered_(Egg_Group)')
    # An egg is unobtainable when the cell says so or the species belongs to
    # the "No Eggs Discovered" egg group.
    has_egg = not ('Egg not obtainable' in merged_steps or no_group)

    # Hatch-time cell looks like "5140 - 5396 steps"; the first number is the
    # minimum. BUG FIX: guard against cells containing no digits at all
    # (e.g. "Egg not obtainable"), where the original crashed with
    # AttributeError on `.group()` of a None match.
    steps_match = re.search(r'\d+', merged_steps)
    min_steps = int(steps_match.group()) if steps_match else 0

    ev_link = info_table.find_all('a', href='/wiki/List_of_Pok%C3%A9mon_by_effort_value_yield')
    merged_ev_yield = ev_link[0].parent.next_sibling.next_sibling.text
    # Of the first seven numbers, the first is the total yield; the remaining
    # six are HP/Atk/Def/SpA/SpD/Spe in order.
    ev_numbers = re.findall(r'\d+', merged_ev_yield)[:7][1:]

    # The second /wiki/Experience link's cell holds the levelling-rate text.
    leveling_link = info_table.find_all('a', href='/wiki/Experience')
    leveling_rate = leveling_link[1].parent.next_sibling.next_sibling.text.strip()

    # Page title: "Sprigatito (Pokémon) - Bulbapedia, ..." -> "Sprigatito".
    title = soup.find('title').text.split('(', 1)[0].strip()

    return {
        'species': title.lower().replace(' ', ''),
        'genderRatio': {'male': percentages[0], 'female': percentages[1]},
        'evYields': {
            'hp': int(ev_numbers[0]), 'atk': int(ev_numbers[1]),
            'def': int(ev_numbers[2]), 'spa': int(ev_numbers[3]),
            'spd': int(ev_numbers[4]), 'spe': int(ev_numbers[5]),
        },
        'isEggObtainable': has_egg,
        'catchRate': {
            'base': int(merged_catch_rate.split(' ')[0].strip()),
            # e.g. "(11.9%)" -> "11.9%": trim the surrounding parentheses.
            'percentageWithOrdinaryPokeballAtFullHealth':
                merged_catch_rate.split(' ')[1][1:-1].replace(')', ''),
        },
        'levellingRate': leveling_rate,
        'minimumHatchTime': min_steps,
    }
def main():
    """Scrape every Generation IX Pokémon page and dump the results to JSON."""
    results = []
    for mon_url in fetch_mon_urls():
        results.append(parse_bulba_data(get_bulba_data(mon_url)))
        print('Successfully processed: ' + mon_url)
    # Pretty-print the collected records to disk.
    with open('partialBulbaData.json', 'w') as out_file:
        json.dump(results, out_file, indent=4)
    print('Done!')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.