luckytyphlosion · October 11, 2021 01:13
diff --git a/parse_wiimm_html.py b/parse_wiimm_html.py
 # =============================================================================
 # MIT License
 # 
 # Copyright (c) 2021 luckytyphlosion
 # 
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
 # 
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
 # 
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 # =============================================================================
 import bs4
 import requests
 import json
 import re

 # Matches ids of the form "p1-<digits>-0" (anchored at both ends).
 # NOTE(review): in the visible source this is only referenced from the
 # commented-out family-table lookup inside main().
 family_info_row_id_regex = re.compile(r"^p1-[0-9]+-0$")

 def main(track_id=1293, mode=1):
    """Extract metadata for one ct.wiimm.de track page and save it as JSON.

    The page HTML is obtained according to ``mode``, parsed with
    BeautifulSoup, and the extracted fields are written to
    ``track_info_<track_id>.json``.

    Args:
        track_id: Numeric id of the track page
            (``https://ct.wiimm.de/i/<track_id>``).
        mode: 0 downloads the page and caches it as
            ``ct_wiimm_de_<track_id>.html``; 1 reads that cached file.

    Raises:
        ValueError: If ``mode`` is neither 0 nor 1.
        RuntimeError: If the info table or one of its labelled cells is
            missing, or the id shown on the page differs from ``track_id``.
        requests.HTTPError: If the download (mode 0) returns an error status.
    """
    # Normalize non-breaking spaces (presumably so the exact-text label
    # lookups and the " / " split below match).
    html_doc = _load_html(track_id, mode).replace("\u00A0", " ")

    track_info = {}
    soup = bs4.BeautifulSoup(html_doc, "html.parser")
    table_info = soup.find("table", class_="table-info")
    if table_info is None:
        raise RuntimeError("Could not find the table with class 'table-info'!")
    track_info["trackNameFull"] = table_info.find("th").string

    # ================================
    # alternate code which finds the type_class_id via tag position
    # don't do this because some tags may be missing depending on the track
    # see: https://ct.wiimm.de/i/1700 https://ct.wiimm.de/i/1293 https://ct.wiimm.de/i/1500
    # table_rows = table_info.find_all_next("tr")
    # type_class_id = table_rows[1].find_all_next("td")[1].string
    # ================================
    type_class_id = _value_cell(table_info, "Type, Class and Id:").string
    track_type, track_class, track_id_as_str = type_class_id.split(" / ")
    # Sanity check: the page must describe the track that was asked for.
    if track_id != int(track_id_as_str):
        raise RuntimeError("Expected track_id == int(track_id_as_str)!")

    track_info["archiveInfo"] = {
        "type": track_type,
        "class": track_class,
        "id": track_id,
    }

    # The value cell holds the name in a <b> tag; the version is taken from
    # the node that follows it (contents[1]).
    track_name_and_version_tag = _value_cell(table_info, "Track name and version:")
    track_info["name"] = track_name_and_version_tag.find("b").string
    track_info["version"] = track_name_and_version_tag.contents[1].strip()
    track_info["author"] = _value_cell(table_info, "Created by:").string

    # ==============================
    # alternate code to get archive info from the family box instead
    #family_info = soup.find("table", id="p1-table")
    #family_tracks = family_info.find_all("tr", id=family_info_row_id_regex)
    #
    #for family_track in family_tracks:
    #    family_track_tds = family_track.find_all_next('td')
    #    family_track_id = int(family_track_tds[1].string)
    #    if family_track_id == track_id:
    #        archive_info = {}
    #        archive_info["type"] = family_track.find("td", class_="ctype-1").string
    #        archive_info["class"] = family_track.find("td", class_="cclass").string
    #        archive_info["id"] = family_track_id
    #        track_info["archiveInfo"] = archive_info
    #        break
    #
    #   print(f"family_track_id: {family_track_id}")
    #   #print(f"track id: {family_track.find_all_next('td')[1]}")
    #   #print(f"family_track.contents: {family_track.contents}")
    # ==============================

    with open(f"track_info_{track_id}.json", "w", encoding="utf-8") as f:
        json.dump(track_info, f, indent=2)


 def _load_html(track_id, mode):
    """Return the track page HTML: download and cache (mode 0) or read the cache (mode 1)."""
    cache_filename = f"ct_wiimm_de_{track_id}.html"
    if mode == 0:
        # A timeout keeps the script from hanging on a stalled connection, and
        # raise_for_status() stops an error page from being cached as if it
        # were the track page.
        r = requests.get(f"https://ct.wiimm.de/i/{track_id}", timeout=30)
        r.raise_for_status()
        html_doc = r.text
        # Explicit UTF-8 so non-ASCII page content does not depend on the
        # platform's default encoding.
        with open(cache_filename, "w", encoding="utf-8") as f:
            f.write(html_doc)
        return html_doc
    if mode == 1:
        with open(cache_filename, "r", encoding="utf-8") as f:
            return f.read()
    raise ValueError(f"Unknown mode {mode!r}; expected 0 (download) or 1 (read cache)")


 def _value_cell(table_info, label):
    """Return the node right after the <td> whose text is exactly ``label``."""
    label_cell = table_info.find("td", string=label)
    if label_cell is None:
        raise RuntimeError(f"Could not find a table cell labelled {label!r}!")
    return label_cell.next_sibling

 # Run the scraper only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
	# =============================================================================
	# MIT License
	#
	# Copyright (c) 2021 luckytyphlosion
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.
	# =============================================================================
	import bs4
	import requests
	import json
	import re

	# Matches ids of the form "p1-<digits>-0" (anchored at both ends).
	# NOTE(review): in the visible source this is only referenced from the
	# commented-out family-table lookup inside main().
	family_info_row_id_regex = re.compile(r"^p1-[0-9]+-0$")

	def main(track_id=1293, mode=1):
	    """Extract metadata for one ct.wiimm.de track page and save it as JSON.

	    The page HTML is obtained according to ``mode``, parsed with
	    BeautifulSoup, and the extracted fields are written to
	    ``track_info_<track_id>.json``.

	    Args:
	        track_id: Numeric id of the track page
	            (``https://ct.wiimm.de/i/<track_id>``).
	        mode: 0 downloads the page and caches it as
	            ``ct_wiimm_de_<track_id>.html``; 1 reads that cached file.

	    Raises:
	        ValueError: If ``mode`` is neither 0 nor 1.
	        RuntimeError: If the info table or one of its labelled cells is
	            missing, or the id shown on the page differs from ``track_id``.
	        requests.HTTPError: If the download (mode 0) returns an error status.
	    """
	    # Normalize non-breaking spaces (presumably so the exact-text label
	    # lookups and the " / " split below match).
	    html_doc = _load_html(track_id, mode).replace("\u00A0", " ")

	    track_info = {}
	    soup = bs4.BeautifulSoup(html_doc, "html.parser")
	    table_info = soup.find("table", class_="table-info")
	    if table_info is None:
	        raise RuntimeError("Could not find the table with class 'table-info'!")
	    track_info["trackNameFull"] = table_info.find("th").string

	    # ================================
	    # alternate code which finds the type_class_id via tag position
	    # don't do this because some tags may be missing depending on the track
	    # see: https://ct.wiimm.de/i/1700 https://ct.wiimm.de/i/1293 https://ct.wiimm.de/i/1500
	    # table_rows = table_info.find_all_next("tr")
	    # type_class_id = table_rows[1].find_all_next("td")[1].string
	    # ================================
	    type_class_id = _value_cell(table_info, "Type, Class and Id:").string
	    track_type, track_class, track_id_as_str = type_class_id.split(" / ")
	    # Sanity check: the page must describe the track that was asked for.
	    if track_id != int(track_id_as_str):
	        raise RuntimeError("Expected track_id == int(track_id_as_str)!")

	    track_info["archiveInfo"] = {
	        "type": track_type,
	        "class": track_class,
	        "id": track_id,
	    }

	    # The value cell holds the name in a <b> tag; the version is taken from
	    # the node that follows it (contents[1]).
	    track_name_and_version_tag = _value_cell(table_info, "Track name and version:")
	    track_info["name"] = track_name_and_version_tag.find("b").string
	    track_info["version"] = track_name_and_version_tag.contents[1].strip()
	    track_info["author"] = _value_cell(table_info, "Created by:").string

	    # ==============================
	    # alternate code to get archive info from the family box instead
	    #family_info = soup.find("table", id="p1-table")
	    #family_tracks = family_info.find_all("tr", id=family_info_row_id_regex)
	    #
	    #for family_track in family_tracks:
	    #    family_track_tds = family_track.find_all_next('td')
	    #    family_track_id = int(family_track_tds[1].string)
	    #    if family_track_id == track_id:
	    #        archive_info = {}
	    #        archive_info["type"] = family_track.find("td", class_="ctype-1").string
	    #        archive_info["class"] = family_track.find("td", class_="cclass").string
	    #        archive_info["id"] = family_track_id
	    #        track_info["archiveInfo"] = archive_info
	    #        break
	    #
	    #   print(f"family_track_id: {family_track_id}")
	    #   #print(f"track id: {family_track.find_all_next('td')[1]}")
	    #   #print(f"family_track.contents: {family_track.contents}")
	    # ==============================

	    with open(f"track_info_{track_id}.json", "w", encoding="utf-8") as f:
	        json.dump(track_info, f, indent=2)


	def _load_html(track_id, mode):
	    """Return the track page HTML: download and cache (mode 0) or read the cache (mode 1)."""
	    cache_filename = f"ct_wiimm_de_{track_id}.html"
	    if mode == 0:
	        # A timeout keeps the script from hanging on a stalled connection, and
	        # raise_for_status() stops an error page from being cached as if it
	        # were the track page.
	        r = requests.get(f"https://ct.wiimm.de/i/{track_id}", timeout=30)
	        r.raise_for_status()
	        html_doc = r.text
	        # Explicit UTF-8 so non-ASCII page content does not depend on the
	        # platform's default encoding.
	        with open(cache_filename, "w", encoding="utf-8") as f:
	            f.write(html_doc)
	        return html_doc
	    if mode == 1:
	        with open(cache_filename, "r", encoding="utf-8") as f:
	            return f.read()
	    raise ValueError(f"Unknown mode {mode!r}; expected 0 (download) or 1 (read cache)")


	def _value_cell(table_info, label):
	    """Return the node right after the <td> whose text is exactly ``label``."""
	    label_cell = table_info.find("td", string=label)
	    if label_cell is None:
	        raise RuntimeError(f"Could not find a table cell labelled {label!r}!")
	    return label_cell.next_sibling

	# Run the scraper only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()