# Grab raw article text, article comments, and media files from a
# Fandom/Wikia wiki, given a list of page titles (one per line).
from os import makedirs
from typing import Any, Generator, cast
import requests
import json
import sys
from contextlib import contextmanager
from pathlib import Path

# Current indentation depth of the log output.
level: int = 0
BASE_URL: str = "https://sonicfanchara.wikia.com"


def sanitize_file_name(s: str) -> str:
    """Turn a page title into something safe to use as a file name."""
    return s.replace("/", "__").replace(" ", "_")


def indicate(s: Any) -> None:
    """Print a progress message, indented to the current level."""
    global level
    s_type = type(s)
    if s_type is str:
        text = s
    elif s_type is dict:
        # Render dicts as one "key = value" pair per line.
        text = ""
        for k, v in s.items():
            text += k.__repr__() + " = "
            text += v.__repr__()
            text += "\n" + (" " * level)
        text = text.strip()
    else:
        text = s.__repr__()
    print(" " * level + text)


@contextmanager
def raise_level(reason: str) -> Generator[None, Any, Any]:
    """Log a reason, then indent everything printed inside the block one step."""
    global level
    indicate(">" + reason)
    level += 1
    yield None
    level -= 1
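

# A rough illustration (not captured from a real run) of how nested
# raise_level() blocks shape the log: each level adds one space of
# indentation, e.g. for an article entry:
#
#   >article: Dash the Turtle
#    >fetching raw article content
#     'action' = 'raw'
#    >saving raw article content
#     success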


def process_line(line: str, output_directory: Path) -> None:
    makedirs(output_directory, exist_ok=True)
    if line.lower().startswith("file:"):
        # "File:" entries are media files: look up the raw image URL
        # through the Lightbox API, then download it.
        file_name = line[len("file:") :]
        with raise_level("file: " + file_name):
            with raise_level("fetching file info from wikia API"):
                req_params = {
                    "controller": "Lightbox",
                    "method": "getMediaDetail",
                    "fileTitle": "File:" + file_name,
                }
                indicate(req_params)
                resp = requests.get(
                    BASE_URL + "/wikia.php",
                    params=req_params,
                )
            with raise_level("processing JSON body"):
                json_result = json.loads(resp.content.decode("utf-8"))
                img_url = json_result["rawImageUrl"]
                indicate("url of image = " + img_url)
                indicate("using latest revision")
            with raise_level("saving file"), open(
                output_directory / file_name, "wb"
            ) as saved_imgfile:
                img_req = requests.get(img_url)
                saved_imgfile.write(img_req.content)
                indicate("success")
    else:
        # Everything else is treated as an article (or user/blog page):
        # save its raw wikitext first, then its comments.
        name = line
        with raise_level("article: " + name):
            with raise_level("fetching raw article content"):
                req_params = {
                    "action": "raw",
                }
                indicate(req_params)
                resp = requests.get(
                    BASE_URL + "/wiki/" + name,
                    params=req_params,
                )
            with raise_level("saving raw article content"), open(
                output_directory / (sanitize_file_name(name) + ".txt"), "wb"
            ) as saved_raw:
                saved_raw.write(resp.content)
                indicate("success")
            # The comments API wants the bare title plus a namespace number,
            # so strip any prefix and pick the matching namespace.
            if name.lower().startswith("user blog:"):
                name = name[len("user blog:") :]
                namespace = "500"
            elif name.lower().startswith("user:"):
                name = name[len("user:") :]
                namespace = "2"
            else:
                namespace = "0"
            with raise_level("fetching comments"):
                json_resp: dict[Any, Any] = {}
                num_pages: int = 0
                pages: dict[int, bytes] = {}
                # Get the first page of comments.
                req_params = {
                    "controller": "ArticleCommentsController",
                    "method": "getComments",
                    "title": name,
                    "namespace": namespace,
                    "hideDeleted": "false",
                }
                resp = requests.get(
                    BASE_URL + "/wikia.php",
                    params=req_params,
                )
                # First page of content
                pages[0] = resp.content
                # Follow "next" links, if any
                with raise_level("checking for multipage comments"):
                    json_resp = json.loads(resp.content.decode("utf-8"))
                    next_link = None
                    contents_of_links_key = json_resp.get("links", {})
                    if isinstance(contents_of_links_key, dict):
                        # "links" sometimes comes back as a list instead of
                        # a dict, so only look up "next" when it is a dict.
                        next_link = cast(
                            str | None,
                            contents_of_links_key.get("next", None),
                        )
                    if next_link is None:
                        indicate("nope, just a single page")
                    else:
                        indicate("yes, comments are multi-page")
                while next_link is not None:
                    num_pages += 1
                    with raise_level("fetching page %d" % num_pages):
                        resp = requests.get(cast(str, next_link))
                        pages[num_pages] = resp.content
                        indicate("success")
                        json_resp = json.loads(resp.content.decode("utf-8"))
                        next_link = cast(
                            str | None,
                            cast(
                                dict[str, str], json_resp.get("links", {})
                            ).get("next", None),
                        )
            if num_pages < 1:
                # Only one page of comments: save it as a single JSON file.
                with raise_level("saving article comments"), open(
                    output_directory
                    / (sanitize_file_name(name) + ".comments.json"),
                    "wb",
                ) as saved_raw:
                    current_page_json = json.loads(pages[0].decode("utf-8"))
                    saved_raw.write(
                        json.dumps(current_page_json, indent=4).encode("utf-8")
                    )
                    indicate("success")
            else:
                # Multiple pages: save each one as a numbered JSON file.
                for page_num in range(0, num_pages + 1):
                    with raise_level(
                        "saving article comments (page %d)" % page_num
                    ), open(
                        output_directory
                        / (
                            sanitize_file_name(name)
                            + ".comments.%03d.json" % page_num
                        ),
                        "wb",
                    ) as saved_raw:
                        current_page_json = json.loads(
                            pages[page_num].decode("utf-8")
                        )
                        saved_raw.write(
                            json.dumps(current_page_json, indent=4).encode(
                                "utf-8"
                            )
                        )
                        indicate("success")


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("usage: " + sys.argv[0] + " list.txt output_dir/")
        sys.exit(0)
    with open(sys.argv[1], "r") as text_file:
        for raw_line in text_file:
            stripped = raw_line.strip()
            if stripped:  # skip blank lines in the list
                process_line(stripped, Path(sys.argv[2]))
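
For reference, a minimal sketch of driving the scraper from Python instead of the command line. The module name grab_wikia and the output path are illustrative assumptions, not part of the gist:

    from pathlib import Path

    from grab_wikia import process_line  # hypothetical file name for the script above

    out = Path("output")
    process_line("Dash the Turtle", out)           # article text + comments
    process_line("File:Clishe_pose_ftw.jpg", out)  # media file download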
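An example input list (the gist's second file), one entry per line: plain titles and "User blog:" pages are saved as raw wikitext plus comments, while "File:" entries are downloaded as media.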
Sam the rabbit
Cartoon comics
Dash the Turtle
User blog:Sam_the_Rabbit/The end of Cartoon_Comics
File:Clishe_pose_ftw.jpg
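
With this list saved as, say, list.txt, the script is invoked as shown in its usage message (script.py list.txt output_dir/). Going by the file-naming logic above, the output directory would then contain files roughly like the following; the exact comment file names depend on whether the live API paginates the comments:

    Sam_the_rabbit.txt
    Sam_the_rabbit.comments.json
    Cartoon_comics.txt
    Cartoon_comics.comments.json
    Dash_the_Turtle.txt
    Dash_the_Turtle.comments.json
    User_blog:Sam_the_Rabbit__The_end_of_Cartoon_Comics.txt
    Sam_the_Rabbit__The_end_of_Cartoon_Comics.comments.json
    Clishe_pose_ftw.jpg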