Created
March 30, 2024 22:57
-
-
Save Terrance/0bd495d630803454165c27aa9802778d to your computer and use it in GitHub Desktop.
Script to download all discussions and comments from a Vanilla forum and save them as a set of JSON files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import os
import sys
from itertools import count
from pathlib import Path

import requests
class End(Exception):
    """Control-flow signal: stop processing the current discussion.

    Raised inside the per-page loop of ``main`` and caught by the enclosing
    per-discussion loop, which then moves on to the next discussion ID.
    """
def main(host, start=1, update=True, timeout=30):
    """Download forum discussions page by page into local JSON files.

    Discussion IDs are walked upwards from *start* with no upper bound, so
    this function only returns if an exception escapes (for example a
    network error or ``KeyboardInterrupt``).

    For discussion ``d`` and page ``p`` the URL
    ``{host}/en/discussion/{d}/-/p{p}.json`` is requested and the raw
    response body is written to ``{bucket}/{d}.{p}.json`` relative to the
    current working directory, where ``bucket`` is ``d`` rounded down to a
    multiple of 100. A discussion is abandoned, and a ``{bucket}/{d}.skip``
    marker is created so later runs ignore it, when a response has a status
    other than 200 or is not served as JSON (in the latter case the body is
    also kept as ``{bucket}/{d}.{p}.html``).

    Args:
        host: Base URL of the forum, without a trailing slash.
        start: First discussion ID to fetch.
        update: When the last page already on disk is reached, re-download
            it (and continue from there) if true; stop with that discussion
            if false.
        timeout: Seconds passed to ``requests`` as the request timeout.
            Pass ``None`` to wait indefinitely.
    """
    sess = requests.Session()
    for d in count(start):
        group = Path(f"{d // 100 * 100}")
        os.makedirs(group, exist_ok=True)
        skip_marker = group / f"{d}.skip"
        try:
            for p in count(1):
                url = f"{host}/en/discussion/{d}/-/p{p}.json"
                # Progress line: the URL now, the outcome once it is known.
                print(url, end=" ")
                sys.stdout.flush()
                if skip_marker.exists():
                    print("skip")
                    raise End
                page_file = group / f"{d}.{p}.json"
                if page_file.exists():
                    if (group / f"{d}.{p + 1}.json").exists():
                        # A later page is on disk, so this one is not the
                        # last saved page and needs no refresh.
                        print("done")
                        continue
                    elif not update:
                        print("done")
                        raise End
                    # Otherwise fall through and re-download the last saved
                    # page of this discussion.
                resp = sess.get(url, timeout=timeout)
                if resp.status_code != 200:
                    # NOTE(review): any non-200 status, including ones that
                    # may be transient, marks the discussion as skipped.
                    print(resp.status_code)
                    skip_marker.touch()
                    raise End
                # A response without a Content-Type header is treated like
                # any other non-JSON response instead of raising KeyError.
                content_type = resp.headers.get("Content-Type", "")
                if not content_type.startswith("application/json"):
                    print("???")
                    # Keep the unexpected body around for inspection.
                    (group / f"{d}.{p}.html").write_bytes(resp.content)
                    skip_marker.touch()
                    raise End
                print("ok")
                page_file.write_bytes(resp.content)
                try:
                    comment_count = len(resp.json()["Comments"])
                except Exception as ex:
                    # Deliberately broad: whatever is wrong with the payload,
                    # report it and move on to the next discussion.
                    print(ex)
                    raise End from ex
                # Fewer than 30 comments is taken to mean this was the
                # final page of the discussion.
                if comment_count < 30:
                    raise End
        except End:
            pass
if __name__ == "__main__":
    # Command-line entry point: parse the options and start downloading.
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description="Download forum discussions and comments as JSON files.",
    )
    parser.add_argument(
        "host",
        help="base URL of the forum, without a trailing slash",
    )
    parser.add_argument(
        "-s", "--start", type=int, default=1,
        help="discussion ID to start from (default: 1)",
    )
    parser.add_argument(
        "-u", "--update", action="store_true",
        help="re-download the last saved page of each discussion",
    )
    args = parser.parse_args()
    main(args.host, args.start, args.update)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment