linkviii · November 23, 2024 23:36
diff --git a/soup.py b/soup.py
 #
 # %%
 from bs4 import BeautifulSoup
 from pathlib import Path
 import operator
 import itertools
 import sys
 import os

 # %%
 # Ensure that `>` redirection works on windows
 #  export PYTHONIOENCODING=utf-8
 # os.environ["PYTHONIOENCODING"] = "utf-8"
 # %%


 # Input is a page saved from the browser with Ctrl+S -- nothing is fetched here.
 # Alternative inputs (uncomment exactly one):
 # fname = "./Matsui Youhei - Person (30445) - AniDB.htm"
 # fname = "Ishikawa Tomohisa - Person (9097) - AniDB.htm"
 # fname = "Fujimura Tooru - Person (42411) - AniDB.htm"
 # fname = "ZAQ - Person (32347) - AniDB.htm"
 fname = "Tom-H@ck - Person (2431) - AniDB.htm"

 # Read the saved page as UTF-8 text, then hand the markup to the parser.
 page_html = Path(fname).read_text(encoding="UTF-8")
 soup = BeautifulSoup(page_html, "html.parser")

 # %%
 # library maps song name -> attribute dict:
 #   {"song": name, "link": href, <credit label>: True, ...}
 # The anime cell further down adds an "anime" key to these dicts.
 library = {}
 # Every distinct credit label seen in the production table.
 credit_kinds = set()
 production_pane = soup.find("div", class_="music_production")
 # A page without a production section yields no rows instead of failing on
 # None -- the same treatment the anime pane gets.
 if production_pane is None:
     songs = []
 else:
     songs = production_pane.find("tbody").find_all("tr")
 for song in songs:
     song_cell = song.find(class_="song")
     if song_cell is None:
         # Row without a song cell: nothing to record.
         continue
     name_tag = song_cell.find("a")
     credit_cell = song.find(class_="credit")
     if name_tag is None or credit_cell is None:
         # Incomplete row (no link or no credit cell): skip rather than crash.
         continue
     song_name = name_tag.text.strip()
     credit = credit_cell.text.strip()
     # The first row seen for a song creates its entry; later rows for the
     # same song only add further credit flags.
     entry = library.setdefault(
         song_name, {"song": song_name, "link": name_tag["href"]}
     )
     entry[credit] = True
     credit_kinds.add(credit)

 # %%
 anime_pane = soup.find("div", class_="anime_songs")
 if anime_pane is None:
     anime_list = []
 else:
     anime_list = anime_pane.find("tbody").find_all("tr")

 # Carried across rows: a row without its own anime cell keeps the name taken
 # from the most recent row that had one (presumably a rowspan in the saved
 # table -- confirm against the page).
 anime_name = None

 for anime in anime_list:
     # Update the carried anime name first, so it is not lost when the row is
     # skipped below.
     try_name = anime.find(class_="name anime")
     if try_name is not None:
         anime_name = try_name.text.strip()

     song_cell = anime.find(class_="name song")
     if song_cell is None:
         # Row without a song cell: nothing to attach the anime to.
         continue
     song_name = song_cell.text.strip()

     if song_name not in library:
         # Song appears here but had no production row.  Create its entry
         # instead of raising KeyError; it carries no credit key.
         link_tag = song_cell.find("a")
         library[song_name] = {
             "song": song_name,
             "link": link_tag.get("href") if link_tag is not None else None,
         }
     library[song_name]["anime"] = anime_name

 # %%
 # Cross-check lists of (song name, attributes) pairs taken from library:
 #   not_anime      -- entries that have no "anime" key
 #   missing_credit -- entries with no key that is a known credit kind
 #                     (per the original note: a song in the anime pane but
 #                     not in the production pane)
 not_anime = [
     (title, info) for title, info in library.items() if "anime" not in info
 ]
 missing_credit = [
     (title, info)
     for title, info in library.items()
     if credit_kinds.isdisjoint(info)
 ]

 # %%
 # For each credit kind, the subset of library whose entries carry that
 # credit, keyed by song name (library iteration order is preserved).
 library_by_credit = {
     credit: {song: attr for song, attr in library.items() if credit in attr}
     for credit in credit_kinds
 }

 # %%

 # Within each credit kind, bucket the song entries by anime name.  Entries
 # for which attr.get("anime") yields None share the None bucket.
 library_by_credit_by_anime = {credit: {} for credit in credit_kinds}
 for kind, anime_bag in library_by_credit_by_anime.items():
     for attr in library_by_credit[kind].values():
         anime_bag.setdefault(attr.get("anime"), []).append(attr)


 # %%
 # Column width for song names in the report; 0 when the library is empty
 # (a bare max() would raise ValueError there).
 max_name_len = max((len(name) for name in library), default=0)

 # %%
 # Report layout: file name, then one section per credit kind; inside a
 # section the "Not for anime" group comes first, followed by the anime names
 # in sorted order.
 print(f"{fname}")
 print("=" * 80)
 # credit_kinds is a set, so sort it to get the same section order every run.
 for kind in sorted(credit_kinds):
     this_credit_by_anime = library_by_credit_by_anime[kind]
     print(kind)
     print("=" * len(kind))

     # (heading, entries) pairs in print order.
     groups = []
     if None in this_credit_by_anime:
         groups.append(("Not for anime:", this_credit_by_anime[None]))
     # Only the None bucket is excluded here; an empty-string name is kept.
     names = sorted(name for name in this_credit_by_anime if name is not None)
     for anime_name in names:
         groups.append((f"{anime_name}:", this_credit_by_anime[anime_name]))

     for heading, song_list in groups:
         print(heading)
         for attr in song_list:
             print(f"\t{attr['song']:{max_name_len}} {attr['link']}")
         print()

     print()


 # %%
 # getter = lambda key: lambda obj: obj[key] if key in obj else None
 # list(itertools.groupby(library.values(),key=getter("anime")))
	#
	# %%
	from bs4 import BeautifulSoup
	from pathlib import Path
	import operator
	import itertools
	import sys
	import os

	# %%
	# Ensure that `>` redirection works on windows
	# export PYTHONIOENCODING=utf-8
	# os.environ["PYTHONIOENCODING"] = "utf-8"
	# %%


	# Input is a page saved from the browser with Ctrl+S -- nothing is fetched here.
	# Alternative inputs (uncomment exactly one):
	# fname = "./Matsui Youhei - Person (30445) - AniDB.htm"
	# fname = "Ishikawa Tomohisa - Person (9097) - AniDB.htm"
	# fname = "Fujimura Tooru - Person (42411) - AniDB.htm"
	# fname = "ZAQ - Person (32347) - AniDB.htm"
	fname = "Tom-H@ck - Person (2431) - AniDB.htm"

	# Read the saved page as UTF-8 text, then hand the markup to the parser.
	page_html = Path(fname).read_text(encoding="UTF-8")
	soup = BeautifulSoup(page_html, "html.parser")

	# %%
	# library maps song name -> attribute dict:
	#   {"song": name, "link": href, <credit label>: True, ...}
	# The anime cell further down adds an "anime" key to these dicts.
	library = {}
	# Every distinct credit label seen in the production table.
	credit_kinds = set()
	production_pane = soup.find("div", class_="music_production")
	# A page without a production section yields no rows instead of failing on
	# None -- the same treatment the anime pane gets.
	if production_pane is None:
	    songs = []
	else:
	    songs = production_pane.find("tbody").find_all("tr")
	for song in songs:
	    song_cell = song.find(class_="song")
	    if song_cell is None:
	        # Row without a song cell: nothing to record.
	        continue
	    name_tag = song_cell.find("a")
	    credit_cell = song.find(class_="credit")
	    if name_tag is None or credit_cell is None:
	        # Incomplete row (no link or no credit cell): skip rather than crash.
	        continue
	    song_name = name_tag.text.strip()
	    credit = credit_cell.text.strip()
	    # The first row seen for a song creates its entry; later rows for the
	    # same song only add further credit flags.
	    entry = library.setdefault(
	        song_name, {"song": song_name, "link": name_tag["href"]}
	    )
	    entry[credit] = True
	    credit_kinds.add(credit)

	# %%
	anime_pane = soup.find("div", class_="anime_songs")
	if anime_pane is None:
	    anime_list = []
	else:
	    anime_list = anime_pane.find("tbody").find_all("tr")

	# Carried across rows: a row without its own anime cell keeps the name taken
	# from the most recent row that had one (presumably a rowspan in the saved
	# table -- confirm against the page).
	anime_name = None

	for anime in anime_list:
	    # Update the carried anime name first, so it is not lost when the row is
	    # skipped below.
	    try_name = anime.find(class_="name anime")
	    if try_name is not None:
	        anime_name = try_name.text.strip()

	    song_cell = anime.find(class_="name song")
	    if song_cell is None:
	        # Row without a song cell: nothing to attach the anime to.
	        continue
	    song_name = song_cell.text.strip()

	    if song_name not in library:
	        # Song appears here but had no production row.  Create its entry
	        # instead of raising KeyError; it carries no credit key.
	        link_tag = song_cell.find("a")
	        library[song_name] = {
	            "song": song_name,
	            "link": link_tag.get("href") if link_tag is not None else None,
	        }
	    library[song_name]["anime"] = anime_name

	# %%
	# Cross-check lists of (song name, attributes) pairs taken from library:
	#   not_anime      -- entries that have no "anime" key
	#   missing_credit -- entries with no key that is a known credit kind
	#                     (per the original note: a song in the anime pane but
	#                     not in the production pane)
	not_anime = [
	    (title, info) for title, info in library.items() if "anime" not in info
	]
	missing_credit = [
	    (title, info)
	    for title, info in library.items()
	    if credit_kinds.isdisjoint(info)
	]

	# %%
	# For each credit kind, the subset of library whose entries carry that
	# credit, keyed by song name (library iteration order is preserved).
	library_by_credit = {
	    credit: {song: attr for song, attr in library.items() if credit in attr}
	    for credit in credit_kinds
	}

	# %%

	# Within each credit kind, bucket the song entries by anime name.  Entries
	# for which attr.get("anime") yields None share the None bucket.
	library_by_credit_by_anime = {credit: {} for credit in credit_kinds}
	for kind, anime_bag in library_by_credit_by_anime.items():
	    for attr in library_by_credit[kind].values():
	        anime_bag.setdefault(attr.get("anime"), []).append(attr)


	# %%
	# Column width for song names in the report; 0 when the library is empty
	# (a bare max() would raise ValueError there).
	max_name_len = max((len(name) for name in library), default=0)

	# %%
	# Report layout: file name, then one section per credit kind; inside a
	# section the "Not for anime" group comes first, followed by the anime names
	# in sorted order.
	print(f"{fname}")
	print("=" * 80)
	# credit_kinds is a set, so sort it to get the same section order every run.
	for kind in sorted(credit_kinds):
	    this_credit_by_anime = library_by_credit_by_anime[kind]
	    print(kind)
	    print("=" * len(kind))

	    # (heading, entries) pairs in print order.
	    groups = []
	    if None in this_credit_by_anime:
	        groups.append(("Not for anime:", this_credit_by_anime[None]))
	    # Only the None bucket is excluded here; an empty-string name is kept.
	    names = sorted(name for name in this_credit_by_anime if name is not None)
	    for anime_name in names:
	        groups.append((f"{anime_name}:", this_credit_by_anime[anime_name]))

	    for heading, song_list in groups:
	        print(heading)
	        for attr in song_list:
	            print(f"\t{attr['song']:{max_name_len}} {attr['link']}")
	        print()

	    print()


	# %%
	# getter = lambda key: lambda obj: obj[key] if key in obj else None
	# list(itertools.groupby(library.values(),key=getter("anime")))