Skip to content

Instantly share code, notes, and snippets.

@zardoru
Created March 1, 2018 13:33
Show Gist options
  • Save zardoru/592e13fb4c2140bf77a1fae8f3549bec to your computer and use it in GitHub Desktop.
"""Scrape Yo-kai descriptions from the mtwildwood.net medallium site.

Walks every tribe index page, collects each yokai's detail-page link,
then extracts the text of the "description" element on each detail page
and writes it, one description per line, to lines-yokai.txt.
"""
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import unquote, quote
from urllib.error import HTTPError

BASE_URL = "http://mtwildwood.net/yokaiMedallium/"

# One index page per yokai tribe.
pages = [
    BASE_URL + "tribe{}.html".format(tribe)
    for tribe in (
        "Brave", "Mysterious", "Tough", "Shady", "Heartful",
        "Charming", "Eerie", "Slippery", "Kaima",
    )
]

links = []
for page in pages:
    print("Reading {}".format(page))
    html = urlopen(page).read()
    soup = BeautifulSoup(html, "lxml")
    # Each medallium icon wraps an <a> pointing at a yokai detail page
    # (relative href, so prefix the site base).
    for med in soup.find_all(class_="medallium-icon"):
        links.append(BASE_URL + med.a.get("href"))

print("Links acquired. Going through descriptions...")

# 'with' guarantees the output file is closed even if a request raises.
with open("lines-yokai.txt", "w", encoding="utf-8") as out:
    for link in links:
        print("Reading {}".format(link))
        try:
            f = urlopen(link)
        except HTTPError as e:
            # Best-effort: report the failed page and move on.
            print("Couldn't read - {}".format(e))
            continue
        soup = BeautifulSoup(f.read(), "lxml")
        d = soup.find(class_="description")
        if d is None:
            # Some detail pages have no description element; skip them.
            continue
        # Flatten to one line and drop the "Yo-kai Watch 2" boilerplate.
        s = d.get_text().replace("\n", "").replace("Yo-kai Watch 2", "").strip()
        out.write(s + "\n")
        out.flush()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment