# Grab raw article text, article comments, and media files from a
# Fandom/Wikia wiki, given a list of page titles (one per line).
from os import makedirs
from typing import Any, Generator, cast
import requests
import json
import sys
from contextlib import contextmanager
from pathlib import Path

# Current indentation depth of the log output.
level: int = 0
BASE_URL: str = "https://sonicfanchara.wikia.com"


def sanitize_file_name(s: str) -> str:
    """Turn a page title into something safe to use as a file name."""
    return s.replace("/", "__").replace(" ", "_")


def indicate(s: Any) -> None:
    """Print a progress message, indented to the current level."""
    global level
    s_type = type(s)
    if s_type is str:
        text = s
    elif s_type is dict:
        # Render dicts as one "key = value" pair per line.
        text = ""
        for k, v in s.items():
            text += k.__repr__() + " = "
            text += v.__repr__()
            text += "\n" + (" " * level)
        text = text.strip()
    else:
        text = s.__repr__()
    print(" " * level + text)


@contextmanager
def raise_level(reason: str) -> Generator[None, Any, Any]:
    """Log a reason, then indent everything printed inside the block one step."""
    global level
    indicate(">" + reason)
    level += 1
    yield None
    level -= 1
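

# A rough illustration (not captured from a real run) of how nested
# raise_level() blocks shape the log: each level adds one space of
# indentation, e.g. for an article entry:
#
#   >article: Dash the Turtle
#    >fetching raw article content
#     'action' = 'raw'
#    >saving raw article content
#     success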


def process_line(line: str, output_directory: Path) -> None:
    makedirs(output_directory, exist_ok=True)
    if line.lower().startswith("file:"):
        # "File:" entries are media files: look up the raw image URL
        # through the Lightbox API, then download it.
        file_name = line[len("file:") :]
        with raise_level("file: " + file_name):
            with raise_level("fetching file info from wikia API"):
                req_params = {
                    "controller": "Lightbox",
                    "method": "getMediaDetail",
                    "fileTitle": "File:" + file_name,
                }
                indicate(req_params)
                resp = requests.get(
                    BASE_URL + "/wikia.php",
                    params=req_params,
                )
            with raise_level("processing JSON body"):
                json_result = json.loads(resp.content.decode("utf-8"))
                img_url = json_result["rawImageUrl"]
                indicate("url of image = " + img_url)
                indicate("using latest revision")
            with raise_level("saving file"), open(
                output_directory / file_name, "wb"
            ) as saved_imgfile:
                img_req = requests.get(img_url)
                saved_imgfile.write(img_req.content)
                indicate("success")
    else:
        # Everything else is treated as an article (or user/blog page):
        # save its raw wikitext first, then its comments.
        name = line
        with raise_level("article: " + name):
            with raise_level("fetching raw article content"):
                req_params = {
                    "action": "raw",
                }
                indicate(req_params)
                resp = requests.get(
                    BASE_URL + "/wiki/" + name,
                    params=req_params,
                )
            with raise_level("saving raw article content"), open(
                output_directory / (sanitize_file_name(name) + ".txt"), "wb"
            ) as saved_raw:
                saved_raw.write(resp.content)
                indicate("success")
            # The comments API wants the bare title plus a namespace number,
            # so strip any prefix and pick the matching namespace.
            if name.lower().startswith("user blog:"):
                name = name[len("user blog:") :]
                namespace = "500"
            elif name.lower().startswith("user:"):
                name = name[len("user:") :]
                namespace = "2"
            else:
                namespace = "0"
            with raise_level("fetching comments"):
                json_resp: dict[Any, Any] = {}
                num_pages: int = 0
                pages: dict[int, bytes] = {}
                # Get the first page of comments.
                req_params = {
                    "controller": "ArticleCommentsController",
                    "method": "getComments",
                    "title": name,
                    "namespace": namespace,
                    "hideDeleted": "false",
                }
                resp = requests.get(
                    BASE_URL + "/wikia.php",
                    params=req_params,
                )
                # First page of content
                pages[0] = resp.content
                # Follow "next" links, if any
                with raise_level("checking for multipage comments"):
                    json_resp = json.loads(resp.content.decode("utf-8"))
                    next_link = None
                    contents_of_links_key = json_resp.get("links", {})
                    if isinstance(contents_of_links_key, dict):
                        # "links" sometimes comes back as a list instead of
                        # a dict, so only look up "next" when it is a dict.
                        next_link = cast(
                            str | None,
                            contents_of_links_key.get("next", None),
                        )
                    if next_link is None:
                        indicate("nope, just a single page")
                    else:
                        indicate("yes, comments are multi-page")
                while next_link is not None:
                    num_pages += 1
                    with raise_level("fetching page %d" % num_pages):
                        resp = requests.get(cast(str, next_link))
                        pages[num_pages] = resp.content
                        indicate("success")
                        json_resp = json.loads(resp.content.decode("utf-8"))
                        next_link = cast(
                            str | None,
                            cast(
                                dict[str, str], json_resp.get("links", {})
                            ).get("next", None),
                        )
            if num_pages < 1:
                # Only one page of comments: save it as a single JSON file.
                with raise_level("saving article comments"), open(
                    output_directory
                    / (sanitize_file_name(name) + ".comments.json"),
                    "wb",
                ) as saved_raw:
                    current_page_json = json.loads(pages[0].decode("utf-8"))
                    saved_raw.write(
                        json.dumps(current_page_json, indent=4).encode("utf-8")
                    )
                    indicate("success")
            else:
                # Multiple pages: save each one as a numbered JSON file.
                for page_num in range(0, num_pages + 1):
                    with raise_level(
                        "saving article comments (page %d)" % page_num
                    ), open(
                        output_directory
                        / (
                            sanitize_file_name(name)
                            + ".comments.%03d.json" % page_num
                        ),
                        "wb",
                    ) as saved_raw:
                        current_page_json = json.loads(
                            pages[page_num].decode("utf-8")
                        )
                        saved_raw.write(
                            json.dumps(current_page_json, indent=4).encode(
                                "utf-8"
                            )
                        )
                        indicate("success")


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("usage: " + sys.argv[0] + " list.txt output_dir/")
        sys.exit(0)
    with open(sys.argv[1], "r") as text_file:
        for raw_line in text_file:
            stripped = raw_line.strip()
            if stripped:  # skip blank lines in the list
                process_line(stripped, Path(sys.argv[2]))
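
For reference, a minimal sketch of driving the scraper from Python instead of the command line. The module name grab_wikia and the output path are illustrative assumptions, not part of the gist:

    from pathlib import Path

    from grab_wikia import process_line  # hypothetical file name for the script above

    out = Path("output")
    process_line("Dash the Turtle", out)           # article text + comments
    process_line("File:Clishe_pose_ftw.jpg", out)  # media file download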
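An example input list (the gist's second file), one entry per line: plain titles and "User blog:" pages are saved as raw wikitext plus comments, while "File:" entries are downloaded as media.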
Sam the rabbit
Cartoon comics
Dash the Turtle
User blog:Sam_the_Rabbit/The end of Cartoon_Comics
File:Clishe_pose_ftw.jpg
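
With this list saved as, say, list.txt, the script is invoked as shown in its usage message (script.py list.txt output_dir/). Going by the file-naming logic above, the output directory would then contain files roughly like the following; the exact comment file names depend on whether the live API paginates the comments:

    Sam_the_rabbit.txt
    Sam_the_rabbit.comments.json
    Cartoon_comics.txt
    Cartoon_comics.comments.json
    Dash_the_Turtle.txt
    Dash_the_Turtle.comments.json
    User_blog:Sam_the_Rabbit__The_end_of_Cartoon_Comics.txt
    Sam_the_Rabbit__The_end_of_Cartoon_Comics.comments.json
    Clishe_pose_ftw.jpg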