Created December 9, 2022 09:22
Save castdrian/6c6155ecabeb44b1ab53bd7d9fc19342 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import re | |
import requests | |
import bs4 | |
GEN_IX_URL = 'https://bulbapedia.bulbagarden.net/wiki/Category:Generation_IX_Pok%C3%A9mon' | |
def fetch_mon_urls():
    """Return absolute Bulbapedia URLs for every Generation IX Pokémon page.

    Fetches the Generation IX category listing and collects the href of each
    anchor inside the ``mw-pages`` section of the generated category div.
    """
    response = requests.get(GEN_IX_URL)
    response.raise_for_status()
    page = bs4.BeautifulSoup(response.text, 'html.parser')
    # The category listing lives in <div class="mw-category-generated">
    # under the inner <div id="mw-pages">.
    listing = page.find('div', class_='mw-category-generated').find('div', id='mw-pages')
    return [
        'https://bulbapedia.bulbagarden.net' + anchor.get('href')
        for anchor in listing.find_all('a')
    ]
def get_bulba_data(url):
    """Download *url* and return the page parsed as a BeautifulSoup document.

    Raises ``requests.HTTPError`` (via ``raise_for_status``) on a bad response.
    """
    response = requests.get(url)
    response.raise_for_status()
    return bs4.BeautifulSoup(response.text, 'html.parser')
def parse_bulba_data(soup):
    """Extract species data from a parsed Bulbapedia Pokémon page.

    Reads the first ``roundy`` info table and returns a dict with the species
    name, gender ratio, EV yields, egg availability, catch rate, levelling
    rate and minimum hatch time.
    """
    # The first "roundy" table on the page is the species info box.
    info_table = soup.find_all('table', class_='roundy')[0]

    # Catch-rate cell text looks like e.g. "45 (11.9%)".
    catch_rate_link = info_table.find_all('a', href='/wiki/Catch_rate')
    merged_catch_rate = catch_rate_link[0].parent.next_sibling.next_sibling.text

    gender_link = info_table.find_all('a', href='/wiki/List_of_Pok%C3%A9mon_by_gender_ratio')
    merged_gender_ratio = gender_link[0].parent.next_sibling.next_sibling.text
    # Percentages look like "87.5% male, 12.5% female"; default to 0%/0% for
    # genderless species where no percentage appears in the cell.
    percentages = re.findall(r'\d+\.?\d*%', merged_gender_ratio) or ['0%', '0%']
    if len(percentages) == 1:
        # Single-gender species: decide which slot the lone percentage fills.
        # BUG FIX: use a word-boundary match — the plain substring test
        # `'male' in text` is also true for "female", which mislabelled
        # female-only species as male-only.
        if re.search(r'\bmale\b', merged_gender_ratio):
            percentages.append('0%')     # male-only: female share is 0%
        else:
            percentages.insert(0, '0%')  # female-only: male share is 0%

    egg_link = info_table.find_all('a', href='/wiki/Pok%C3%A9mon_breeding')
    merged_steps = egg_link[0].parent.next_sibling.next_sibling.text
    no_group = info_table.find_all('a', href='/wiki/No_Eggs_Discovered_(Egg_Group)')
    # An egg is unobtainable when the cell says so or the species belongs to
    # the "No Eggs Discovered" egg group.
    has_egg = not ('Egg not obtainable' in merged_steps or no_group)

    # Hatch-time cell looks like "5140 - 5396 steps"; the first number is the
    # minimum. BUG FIX: guard against cells containing no digits at all
    # (e.g. "Egg not obtainable"), where the original crashed with
    # AttributeError on `.group()` of a None match.
    steps_match = re.search(r'\d+', merged_steps)
    min_steps = int(steps_match.group()) if steps_match else 0

    ev_link = info_table.find_all('a', href='/wiki/List_of_Pok%C3%A9mon_by_effort_value_yield')
    merged_ev_yield = ev_link[0].parent.next_sibling.next_sibling.text
    # Of the first seven numbers, the first is the total yield; the remaining
    # six are HP/Atk/Def/SpA/SpD/Spe in order.
    ev_numbers = re.findall(r'\d+', merged_ev_yield)[:7][1:]

    # The second /wiki/Experience link's cell holds the levelling-rate text.
    leveling_link = info_table.find_all('a', href='/wiki/Experience')
    leveling_rate = leveling_link[1].parent.next_sibling.next_sibling.text.strip()

    # Page title: "Sprigatito (Pokémon) - Bulbapedia, ..." -> "Sprigatito".
    title = soup.find('title').text.split('(', 1)[0].strip()

    return {
        'species': title.lower().replace(' ', ''),
        'genderRatio': {'male': percentages[0], 'female': percentages[1]},
        'evYields': {
            'hp': int(ev_numbers[0]), 'atk': int(ev_numbers[1]),
            'def': int(ev_numbers[2]), 'spa': int(ev_numbers[3]),
            'spd': int(ev_numbers[4]), 'spe': int(ev_numbers[5]),
        },
        'isEggObtainable': has_egg,
        'catchRate': {
            'base': int(merged_catch_rate.split(' ')[0].strip()),
            # e.g. "(11.9%)" -> "11.9%": trim the surrounding parentheses.
            'percentageWithOrdinaryPokeballAtFullHealth':
                merged_catch_rate.split(' ')[1][1:-1].replace(')', ''),
        },
        'levellingRate': leveling_rate,
        'minimumHatchTime': min_steps,
    }
def main():
    """Scrape every Generation IX Pokémon page and dump the results to JSON."""
    results = []
    for mon_url in fetch_mon_urls():
        results.append(parse_bulba_data(get_bulba_data(mon_url)))
        print('Successfully processed: ' + mon_url)
    # Pretty-print the collected records to disk.
    with open('partialBulbaData.json', 'w') as out_file:
        json.dump(results, out_file, indent=4)
    print('Done!')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.