Last active
May 30, 2021 18:28
-
-
Save dfm/57cb38d2805f0ce9de9467f3620b9df4 to your computer and use it in GitHub Desktop.
A script to download a mirror copy of all of the generated docs for a Read the Docs (RTDs) project. The `httrack` tool must be installed for this to work, and you'll need to provide an API token for RTDs.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import os | |
import re | |
import time | |
import json | |
import argparse | |
import requests | |
import tempfile | |
from pathlib import Path | |
from shutil import copytree | |
from subprocess import check_call | |
# Marker comments that delimit the "RTD Extra Head" section of a mirrored page.
start = "<!-- RTD Extra Head -->"
end = "<!-- end RTD <extrahead> -->"

# Matches one marked section, markers included.
# - re.S lets "." cross newlines, since the section spans several lines.
# - The quantifier is non-greedy so that a page containing more than one marked
#   section only loses the sections themselves, not the content between them.
# - The markers are escaped because they are literal text, not patterns.
RTDS_EXTRA = re.compile(f"{re.escape(start)}.*?{re.escape(end)}", re.S)
def get_versions(url, token=None):
    """Return all active, built versions listed by a paginated RTDs endpoint.

    The endpoint is queried with ``active=True``, ``built=True`` and
    ``limit=100``; the ``next`` link of each response is followed until a
    page without one is reached.

    Args:
        url: URL of the versions listing to query.
        token: RTDs API token. When ``None``, the ``RTDS_API_TOKEN``
            environment variable is used instead.

    Returns:
        A list with the ``results`` entries of every page, in page order.

    Raises:
        RuntimeError: If no token was given and ``RTDS_API_TOKEN`` is unset
            or empty.
        requests.HTTPError: If any page request returns an error status.
    """
    if token is None:
        # .get() rather than [...]: a missing variable must reach the
        # explanatory RuntimeError below instead of raising a bare KeyError.
        token = os.environ.get("RTDS_API_TOKEN")
    if not token:
        raise RuntimeError(
            "A RTDs API token must be provided using the 'RTDS_API_TOKEN' "
            "environment variable or the '--token' command line argument"
        )

    params = dict(active=True, built=True, limit=100)
    # Built once and reused for every page, so a token passed as an argument
    # is still used after the first page.
    headers = {"Authorization": f"Token {token}"}

    results = []
    while url:
        # The timeout keeps a stalled connection from hanging the script.
        r = requests.get(url, params=params, headers=headers, timeout=60)
        r.raise_for_status()
        data = r.json()
        results.extend(data.get("results", []))
        url = data.get("next", None)
    return results
def remove_rtds_extrahead(filename):
    """Remove every match of ``RTDS_EXTRA`` from a file, in place.

    Args:
        filename: Path of the HTML file to clean.
    """
    # An explicit encoding keeps the result independent of the platform's
    # locale default. NOTE(review): assumes the mirrored pages are UTF-8.
    with open(filename, "r", encoding="utf-8") as f:
        original = f.read()

    cleaned, removed = RTDS_EXTRA.subn("", original)
    if not removed:
        # Nothing matched, so leave the file untouched.
        return

    with open(filename, "w", encoding="utf-8") as f:
        f.write(cleaned)
def main():
    """Mirror the docs of every listed version of a RTDs project.

    Each version is downloaded with ``httrack`` into a temporary directory,
    copied to ``mirror/<slug>``, and its HTML files are passed through
    ``remove_rtds_extrahead``. Versions whose target directory already exists
    are skipped. Finally a ``mirror/unladen.json`` index is written listing
    every version except ``latest`` and ``stable``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("project", help="The name of the RTDs project")
    parser.add_argument("--token", help="Your token for the RTDs API")
    parser.add_argument(
        "--sleep", default=20, type=int,
        help="The time (in seconds) to wait between versions"
    )
    args = parser.parse_args()

    target = Path("mirror")
    versions = get_versions(
        f"https://readthedocs.org/api/v3/projects/{args.project}/versions/",
        token=args.token,
    )

    database = {"versions": [], "aliases": {}}
    for version in versions:
        slug = version["slug"]
        path = target / slug

        # Indexed even when the download below is skipped, so the index
        # always reflects every listed version.
        if slug not in ("latest", "stable"):
            database["versions"].append(
                {
                    "ref": f"refs/tags/{slug}",
                    "version": slug,
                    "name": slug,
                    "path": slug,
                    "active": True,
                }
            )

        if path.exists():
            print(f"Skipping {slug}")
            continue

        url = version["urls"]["documentation"]
        with tempfile.TemporaryDirectory() as workdir:
            # check_call runs the list without a shell, so each element is
            # passed verbatim and must not carry quote characters of its own.
            check_call(["httrack", url, "-O", workdir, "-%v"])
            # The copy source is the output directory joined with the URL
            # minus its scheme; maxsplit=1 keeps any later "//" in the path.
            copytree(Path(workdir) / url.split("//", 1)[1], path)

        print("Removing RTDs extra headers")
        for filename in path.rglob("*.html"):
            print(f"Cleaning {filename}")
            remove_rtds_extrahead(filename)

        print(f"Mirrored {slug}, waiting...")
        time.sleep(args.sleep)

    # The directory may not exist yet, e.g. when no versions were returned.
    target.mkdir(parents=True, exist_ok=True)
    with open(target / "unladen.json", "w") as f:
        json.dump(database, f, indent=2)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment