<?php
/**
 * WaybackDownloader
 *
 * Single-file PHP class that:
 *  - Accepts a main URL or a Wayback snapshot URL
 *  - Auto-finds the latest snapshot if you supply a main URL
 *  - Queries CDX for a deduped list of 200-OK captures (up to maxPages)
 *  - Downloads HTML + assets, strips Wayback toolbar, rewrites links to local paths
 *  - Writes a static folder and a site.zip
 *
 * Requirements: PHP 8+, curl, ZipArchive, DOMDocument enabled
 *
 * ⚖️ Use responsibly. Only download content you have rights to. Respect IA Terms.
 */
class WaybackDownloader
{
    private string $outDir;
    private int $maxPages;
    private int $perRequestDelayMs; // politeness delay between requests
    private int $timeoutSeconds;
    private string $userAgent;

    public function __construct(
        string $outDir = __DIR__ . '/dist/site',
        int $maxPages = 150,
        int $perRequestDelayMs = 200,
        int $timeoutSeconds = 30,
        string $userAgent = 'WaybackDownloader/1.0 (+https://example.org)'
    ) {
        $this->outDir = rtrim($outDir, "/");
        $this->maxPages = $maxPages;
        $this->perRequestDelayMs = $perRequestDelayMs;
        $this->timeoutSeconds = $timeoutSeconds;
        $this->userAgent = $userAgent;
    }
    /**
     * @param string $input  Either a main URL (https://example.com) or a Wayback snapshot URL
     * @param bool   $latest When true and $input is a main URL, pick the newest snapshot
     * @return array{outDir:string, zipPath:string}
     * @throws RuntimeException
     */
    public function download(string $input, bool $latest = true): array
    {
        $this->ensureDir($this->outDir);

        if ($this->isWaybackUrl($input)) {
            $domain = $this->domainFromWayback($input);
            $timestamp = $this->timestampFromWayback($input);
        } else {
            $domain = $this->normalizeDomain($input);
            if ($latest) {
                $timestamp = $this->latestTimestampFor($domain);
            } else {
                throw new RuntimeException(
                    "A normal URL was provided but latest=false and no timestamp supplied. " .
                    "Pass a Wayback URL or set latest=true."
                );
            }
        }

        // 1) List URLs from CDX
        $urls = $this->collectUrls($domain, $timestamp, $this->maxPages);

        // 2) Download all
        $count = 0;
        foreach ($urls as $archivedUrl) {
            $count++;
            $this->fetchOne($archivedUrl, $domain);
            if ($this->perRequestDelayMs > 0) {
                usleep($this->perRequestDelayMs * 1000);
            }
            // Optional: print progress to console
            if ($count % 10 === 0) {
                fwrite(STDERR, "Downloaded: $count / " . count($urls) . PHP_EOL);
            }
        }

        // 3) Zip output
        $zipPath = $this->zipOutput();

        return [
            'outDir' => $this->outDir,
            'zipPath' => $zipPath,
        ];
    }

    // --------------------- Core steps ---------------------

    private function collectUrls(string $domain, ?string $timestamp, int $limit): array
    {
        // CDX API (JSON):
        // - matchType=domain
        // - filter statuscode:200
        // - collapse by digest
        // - If timestamp provided, we scope to that day (YYYYMMDD)
        $params = [
            'url' => $domain . '/*',
            'matchType' => 'domain',
            'output' => 'json',
            'filter' => 'statuscode:200',
            'collapse' => 'digest',
            'limit' => (string) $limit,
        ];
        if ($timestamp && strlen($timestamp) >= 8) {
            $day = substr($timestamp, 0, 8);
            $params['from'] = $day;
            $params['to'] = $day;
        }
        $cdxUrl = 'https://web.archive.org/cdx/search/cdx?' . http_build_query($params);
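        // Illustrative request (query string shown unencoded for readability):
        //   https://web.archive.org/cdx/search/cdx?url=example.com/*&matchType=domain
        //     &output=json&filter=statuscode:200&collapse=digest&limit=150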
        $data = $this->httpJson($cdxUrl);
        if (!is_array($data) || count($data) < 2) {
            throw new RuntimeException("No CDX rows returned.");
        }
        $rows = array_slice($data, 1); // skip header
        $urls = [];
        foreach ($rows as $row) {
            // Rows follow the default CDX header order; safest to read by index:
            // [0]=urlkey, [1]=timestamp, [2]=original, [3]=mimetype, [4]=statuscode, [5]=digest, [6]=length
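            // e.g. ["com,example)/about", "20240101000000", "http://example.com/about",
            //       "text/html", "200", "SHA1DIGESTXYZ", "5123"]  (values illustrative)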
            $ts = $row[1] ?? null;
            $orig = $row[2] ?? null;
            if (!$ts || !$orig) continue;
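            // The "id_" flag asks Wayback for the raw, unmodified capture
            // (no toolbar injection or archival link rewriting).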
            $archived = "https://web.archive.org/web/{$ts}id_/{$orig}";
            $urls[] = $archived;
        }
        return $urls;
    }

    private function fetchOne(string $archivedUrl, string $domain): void
    {
        [$data, $contentType] = $this->httpGet($archivedUrl);
        if ($data === null) {
            fwrite(STDERR, "Skip (null) $archivedUrl\n");
            return;
        }
        $path = $this->localPathFor($archivedUrl);
        $this->ensureDir(dirname($path));
        if ($this->isHtml($contentType)) {
            $html = $this->stripWaybackToolbar($data);
            $html = $this->rewriteHtml($html, $domain);
            file_put_contents($path, $html);
        } else {
            file_put_contents($path, $data);
        }
    }

    private function zipOutput(): string
    {
        $zipPath = dirname($this->outDir) . '/site.zip';
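        // e.g. outDir = /app/dist/site → the zip lands at /app/dist/site.zip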
        $zip = new ZipArchive();
        if ($zip->open($zipPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== true) {
            throw new RuntimeException("Unable to open zip: $zipPath");
        }
        // Resolve the output dir the same way getRealPath() resolves entries below,
        // so the relative entry names are computed against a matching prefix.
        $base = realpath($this->outDir) ?: $this->outDir;
        $dirIt = new RecursiveDirectoryIterator($this->outDir, RecursiveDirectoryIterator::SKIP_DOTS);
        $it = new RecursiveIteratorIterator($dirIt, RecursiveIteratorIterator::SELF_FIRST);
        foreach ($it as $file) {
            /** @var SplFileInfo $file */
            $real = $file->getRealPath();
            if ($real === false) continue;
            $rel = substr($real, strlen($base) + 1);
            if ($file->isDir()) {
                $zip->addEmptyDir($rel);
            } else {
                $zip->addFile($real, $rel);
            }
        }
        $zip->close();
        return $zipPath;
    }

    // --------------------- Helpers ---------------------

    private function isWaybackUrl(string $url): bool
    {
        return str_starts_with($url, 'https://web.archive.org/web/')
            || str_starts_with($url, 'http://web.archive.org/web/');
    }

    private function domainFromWayback(string $snapshotUrl): string
    {
        $m = [];
        // Accept an optional rewrite flag (id_, if_, im_, ...) after the timestamp.
        if (!preg_match('#/web/\d{8,14}(?:[a-z]{2}_)?/(.*)$#i', $snapshotUrl, $m)) {
            throw new RuntimeException("Invalid Wayback snapshot URL");
        }
        $rest = $m[1];
        if (!str_contains($rest, '://')) {
            $rest = 'http://' . $rest;
        }
        $parts = parse_url($rest);
        return $parts['host'] ?? throw new RuntimeException("Cannot parse domain from snapshot URL");
    }

    private function timestampFromWayback(string $snapshotUrl): string
    {
        $m = [];
        if (!preg_match('#/web/(\d{8,14})(?:[a-z]{2}_)?/#i', $snapshotUrl, $m)) {
            throw new RuntimeException("No timestamp found in snapshot URL");
        }
        return $m[1];
    }

    private function normalizeDomain(string $value): string
    {
        if (!str_contains($value, '://')) {
            $value = 'http://' . $value;
        }
        $parts = parse_url($value);
        return $parts['host'] ?? throw new RuntimeException("Invalid URL for domain");
    }

    private function latestTimestampFor(string $domain): string
    {
        $params = [
            'url' => $domain . '/*',
            'matchType' => 'domain',
            'output' => 'json',
            'filter' => 'statuscode:200',
            'collapse' => 'digest',
            'fl' => 'timestamp,original',
            // A negative limit asks the CDX server for the *last* rows, i.e.
            // the newest captures ("sort=desc" is not a documented CDX option).
            'limit' => '-1',
        ];
        $url = 'https://web.archive.org/cdx/search/cdx?' . http_build_query($params);
        $data = $this->httpJson($url);
        if (!is_array($data) || count($data) < 2) {
            throw new RuntimeException("No captures found for $domain");
        }
        return $data[1][0]; // first row after header → timestamp
    }

    private function localPathFor(string $archivedUrl): string
    {
        // Map https://web.archive.org/web/<ts>id_/http://example.com/path
        // → outDir/path (append index.html for trailing slash)
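        // e.g. https://web.archive.org/web/20240101000000id_/http://example.com/blog/
        //      → <outDir>/blog/index.html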
        $m = [];
        if (!preg_match('#https?://web\.archive\.org/web/\d+id?_/((?:https?:)?//)?(.+)$#i', $archivedUrl, $m)) {
            // fallback: sanitize whole URL as filename
            $safe = preg_replace('/[^a-zA-Z0-9_.-]/', '_', $archivedUrl);
            return $this->outDir . '/' . $safe;
        }
        $hostAndPath = $m[2];
        if (str_starts_with($hostAndPath, 'http://') || str_starts_with($hostAndPath, 'https://')) {
            $parts = parse_url($hostAndPath);
            $rel = ltrim($parts['path'] ?? '', '/');
        } else {
            $slash = strpos($hostAndPath, '/');
            $rel = ($slash === false) ? 'index.html' : substr($hostAndPath, $slash + 1);
        }
        if ($rel === '' || str_ends_with($rel, '/')) {
            $rel .= 'index.html';
        }
        return $this->outDir . '/' . $rel;
    }

    private function isHtml(?string $contentType): bool
    {
        return $contentType !== null && stripos($contentType, 'text/html') !== false;
    }

    private function stripWaybackToolbar(string $html): string
    {
        // Remove toolbar comments and scripts
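        // Captures fetched with the id_ flag are usually toolbar-free already;
        // this is defensive cleanup for any that still carry it.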
        $html = preg_replace(
            '#<!-- BEGIN WAYBACK TOOLBAR INSERT.*?END WAYBACK TOOLBAR INSERT -->#is',
            '',
            $html
        );
        $html = preg_replace(
            '#<script[^>]*src="https?://web\.archive\.org/web/[^"]+/static/js/wbhack\.js"[^>]*></script>#i',
            '',
            $html
        );
        return $html;
    }

    private function rewriteHtml(string $html, string $domain): string
    {
        // Use DOMDocument to adjust href/src; keep it tolerant
        $prev = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->preserveWhiteSpace = false;
        $doc->formatOutput = false;
        $encoding = mb_detect_encoding($html, ['UTF-8', 'ISO-8859-1', 'Windows-1252'], true) ?: 'UTF-8';
        if ($encoding !== 'UTF-8') {
            // 'HTML-ENTITIES' as a target encoding is deprecated since PHP 8.2;
            // encode non-ASCII code points as numeric entities instead.
            $html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x10FFFF], $encoding);
        }
        $doc->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
        $xpath = new DOMXPath($doc);
        $nodes = $xpath->query('//@href | //@src');
        foreach ($nodes as $attr) {
            /** @var DOMAttr $attr */
            $v = $attr->value;
            if ($v === '') continue;
            // Strip any Wayback prefix (absolute or site-relative, with or without
            // a rewrite flag such as id_/im_/cs_/js_), leaving the original URL
            // for the domain check below.
            $v = preg_replace('#^(?:https?://web\.archive\.org)?/web/\d+(?:[a-z]{2}_)?/#i', '', $v);
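            // e.g. https://web.archive.org/web/20240101000000id_/https://example.com/css/app.css
            //      → https://example.com/css/app.css, which the domain check below
            //        reduces to /css/app.css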
            // If it's a full URL to our domain, make it path-based
            if (preg_match('#^https?://#i', $v)) {
                $parts = parse_url($v);
                $host = $parts['host'] ?? '';
                if ($host && str_contains($host, $domain)) {
                    $v = $parts['path'] ?? '/';
                    if (!empty($parts['query'])) {
                        $v .= '?' . $parts['query'];
                    }
                }
            }
            // No further normalization here; rewritten values are stored as-is.
            $attr->value = $v;
        }

        // Fix inline styles: url(https://web.archive.org/web/...) → url(...)
        $styleNodes = $xpath->query('//@style');
        foreach ($styleNodes as $attr) {
            $attr->value = preg_replace(
                '#url\(((?:https?://web\.archive\.org)?/web/\d+(?:[a-z]{2}_)?/)+#i',
                'url(',
                $attr->value
            );
        }
        $out = $doc->saveHTML();
        libxml_use_internal_errors($prev);
        return $out;
    }

    // --------------------- HTTP ---------------------

    private function httpJson(string $url): mixed
    {
        [$data, $ct] = $this->httpGet($url);
        if ($data === null) {
            throw new RuntimeException("HTTP failed: $url");
        }
        $json = json_decode($data, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new RuntimeException("Invalid JSON from $url: " . json_last_error_msg());
        }
        return $json;
    }

    /**
     * @return array{0:?string,1:?string} [body, content-type]
     */
    private function httpGet(string $url): array
    {
        $ch = curl_init($url);
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            CURLOPT_TIMEOUT => $this->timeoutSeconds,
            CURLOPT_USERAGENT => $this->userAgent,
            CURLOPT_HEADER => true,
        ]);
        $resp = curl_exec($ch);
        if ($resp === false) {
            $err = curl_error($ch);
            curl_close($ch);
            fwrite(STDERR, "curl error: $err\n");
            return [null, null];
        }
        $status = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
        $hdrSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $headersRaw = substr($resp, 0, $hdrSize);
        $body = substr($resp, $hdrSize);
        curl_close($ch);
        if ($status < 200 || $status >= 400) {
            fwrite(STDERR, "HTTP $status: $url\n");
            return [null, null];
        }
        // Parse Content-Type from the *last* header block: with FOLLOWLOCATION,
        // $headersRaw contains one block per redirect hop.
        $blocks = explode("\r\n\r\n", trim($headersRaw));
        $ct = null;
        foreach (explode("\r\n", (string) end($blocks)) as $line) {
            if (stripos($line, 'Content-Type:') === 0) {
                $parts = explode(':', $line, 2);
                $ct = trim($parts[1] ?? '');
                break;
            }
        }
        return [$body, $ct];
    }

    // --------------------- FS ---------------------

    private function ensureDir(string $dir): void
    {
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            throw new RuntimeException("Cannot create dir: $dir");
        }
    }
}
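
// --------------------- Example usage ---------------------
// Minimal CLI sketch (the guard, filename, and argument handling are
// illustrative, not part of the class above):
//
//   php wayback_downloader.php https://example.com
//
if (PHP_SAPI === 'cli' && isset($argv[1])) {
    $downloader = new WaybackDownloader(
        outDir: __DIR__ . '/dist/site',
        maxPages: 50,             // keep a first run small
        perRequestDelayMs: 500    // extra politeness toward archive.org
    );
    $result = $downloader->download($argv[1]);
    fwrite(STDERR, "Done. Files: {$result['outDir']}, zip: {$result['zipPath']}\n");
}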