@ZoomTen · Created September 1, 2024 12:24
"""Archive articles, their comments, and uploaded files from
https://sonicfanchara.wikia.com, as listed in a text file."""

from os import makedirs
from typing import Any, Generator, cast
import requests
import json
import sys
from contextlib import contextmanager
from pathlib import Path

# current indentation depth for log output
level: int = 0
BASE_URL: str = "https://sonicfanchara.wikia.com"

def sanitize_file_name(s: str) -> str:
    return s.replace("/", "__").replace(" ", "_")
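
# For illustration (sample values, not from the original gist):
#   sanitize_file_name("User blog:Foo/Bar") -> "User_blog:Foo__Bar"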

def indicate(s: Any) -> None:
    """Print s at the current indentation level."""
    if type(s) is str:
        text = s
    elif type(s) is dict:
        # pretty-print dicts as "key = value" pairs, one per line
        text = ""
        for k, v in s.items():
            text += k.__repr__() + " = "
            text += v.__repr__()
            text += "\n" + (" " * level)
        text = text.strip()
    else:
        text = s.__repr__()
    print(" " * level + text)

@contextmanager
def raise_level(reason: str) -> Generator[None, Any, Any]:
    """Log a reason, then indent everything printed inside the block."""
    global level
    indicate(">" + reason)
    level += 1
    try:
        yield None
    finally:
        # restore the indentation level even if the block raises
        level -= 1
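
# For illustration (not in the original gist), nested raise_level blocks
# produce an indented trace:
#
#   with raise_level("outer"):
#       indicate("hello")
#       with raise_level("inner"):
#           indicate("world")
#
# prints:
#
#   >outer
#    hello
#    >inner
#     world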

def process_line(line: str, output_directory: Path) -> None:
    makedirs(output_directory, exist_ok=True)
    if line.lower().startswith("file:"):
        # "File:" entries are downloaded via the Lightbox media API
        file_name = line[len("file:"):]
        with raise_level("file: " + file_name):
            with raise_level("fetching file info from wikia API"):
                req_params = {
                    "controller": "Lightbox",
                    "method": "getMediaDetail",
                    "fileTitle": "File:" + file_name,
                }
                indicate(req_params)
                resp = requests.get(
                    BASE_URL + "/wikia.php",
                    params=req_params,
                )
            with raise_level("processing JSON body"):
                json_result = json.loads(resp.content.decode("utf-8"))
                img_url = json_result["rawImageUrl"]
                indicate("url of image = " + img_url)
                indicate("using latest revision")
            with raise_level("saving file"), open(
                output_directory / file_name, "wb"
            ) as saved_imgfile:
                img_req = requests.get(img_url)
                saved_imgfile.write(img_req.content)
                indicate("success")
    else:
        name = line
        with raise_level("article: " + name):
            with raise_level("fetching raw article content"):
                req_params = {
                    "action": "raw",
                }
                indicate(req_params)
                resp = requests.get(
                    BASE_URL + "/wiki/" + name,
                    params=req_params,
                )
            with raise_level("saving raw article content"), open(
                output_directory / (sanitize_file_name(name) + ".txt"), "wb"
            ) as saved_raw:
                saved_raw.write(resp.content)
                indicate("success")
            # strip the title prefix and map it to its MediaWiki
            # namespace ID for the comments API
            if name.lower().startswith("user blog:"):
                name = name[len("user blog:"):]
                namespace = "500"  # blog namespace
            elif name.lower().startswith("user:"):
                name = name[len("user:"):]
                namespace = "2"  # user namespace
            else:
                namespace = "0"  # main namespace
with raise_level("fetching comments"):
json_resp: dict[Any, Any] = {}
num_pages: int = 0
pages: dict[int, bytes] = {}
# get first page
req_params = {
"controller": "ArticleCommentsController",
"method": "getComments",
"title": name,
"namespace": namespace,
"hideDeleted": "false",
}
resp = requests.get(
BASE_URL + "/wikia.php",
params=req_params,
)
# First page of content
pages[0] = resp.content
# Follow "next" links, if any
with raise_level("checking for multipage comments"):
json_resp = json.loads(resp.content.decode("utf-8"))
next_link = None
contents_of_links_key = json_resp.get("links", {})
if type(contents_of_links_key) == dict:
# Sometimes, Python thinks this is a list
# instead of a dict. :(
next_link = cast(
str | None,
cast(
dict[str, str], json_resp.get("links", {})
).get("next", None),
)
if next_link == None:
indicate("nope, just a single page")
else:
indicate("yes, comments are multi-page")
if next_link != None:
while next_link != None:
num_pages += 1
with raise_level("fetching page %d" % num_pages):
resp = requests.get(cast(str, next_link))
pages[num_pages] = resp.content
indicate("success")
json_resp = json.loads(resp.content.decode("utf-8"))
next_link = cast(
str | None,
cast(
dict[str, str], json_resp.get("links", {})
).get("next", None),
)
                if num_pages < 1:
                    # single page of comments: save it as one JSON file
                    with raise_level("saving article comments"), open(
                        output_directory
                        / (sanitize_file_name(name) + ".comments.json"),
                        "wb",
                    ) as saved_raw:
                        current_page_json = json.loads(pages[0].decode("utf-8"))
                        saved_raw.write(
                            json.dumps(current_page_json, indent=4).encode("utf-8")
                        )
                        indicate("success")
                else:
                    # multiple pages: save one numbered JSON file per page
                    for page_num in range(0, num_pages + 1):
                        with raise_level(
                            "saving article comments (page %d)" % page_num
                        ), open(
                            output_directory
                            / (
                                sanitize_file_name(name)
                                + ".comments.%03d.json" % page_num
                            ),
                            "wb",
                        ) as saved_raw:
                            current_page_json = json.loads(
                                pages[page_num].decode("utf-8")
                            )
                            saved_raw.write(
                                json.dumps(current_page_json, indent=4).encode(
                                    "utf-8"
                                )
                            )
                            indicate("success")

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("usage: " + sys.argv[0] + " list.txt output_dir/")
        sys.exit(0)
    with open(sys.argv[1], "r") as text_file:
        for raw_line in text_file:
            stripped = raw_line.strip()
            if stripped:  # skip blank lines
                process_line(stripped, Path(sys.argv[2]))
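
Usage sketch (the file names here are assumptions; the gist does not name
its files): save the script as scrape.py, put the page list in list.txt,
then run:

    python3 scrape.py list.txt output/

Each article line is saved as <sanitized name>.txt plus
<sanitized name>.comments.json (or .comments.000.json, .comments.001.json,
... when the comments span several pages); each "File:" line is downloaded
into output/ under its original file name.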
Example page list, one entry per line:

Sam the rabbit
Cartoon comics
Dash the Turtle
User blog:Sam_the_Rabbit/The end of Cartoon_Comics
File:Clishe_pose_ftw.jpg
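
Assuming every page's comments fit on a single page, this example list
would produce output/Sam_the_rabbit.txt and
output/Sam_the_rabbit.comments.json, and likewise for "Cartoon comics" and
"Dash the Turtle". The blog post is saved as
output/User_blog:Sam_the_Rabbit__The_end_of_Cartoon_Comics.txt (the
"user blog:" prefix is stripped only before the comments are fetched, so
those land in output/Sam_the_Rabbit__The_end_of_Cartoon_Comics.comments.json),
and the image is saved as output/Clishe_pose_ftw.jpg.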