<?php
/**
 * WaybackDownloader
 *
 * Single-file PHP class that:
 *  - Accepts a main URL or a Wayback snapshot URL
 *  - Auto-finds the latest snapshot if you supply a main URL
 *  - Queries CDX for a deduped list of 200-OK captures (up to maxPages)
 *  - Downloads HTML + assets, strips Wayback toolbar, rewrites links to local paths
 *  - Writes a static folder and a site.zip
 *
 * Requirements: PHP 8+, curl, ZipArchive, DOMDocument enabled
 *
 * ⚖️ Use responsibly. Only download content you have rights to. Respect IA Terms.
 */
class WaybackDownloader
{
    private string $outDir;
    private int $maxPages;
    private int $perRequestDelayMs; // politeness delay between requests
    private int $timeoutSeconds;
    private string $userAgent;

    public function __construct(
        string $outDir = __DIR__ . '/dist/site',
        int $maxPages = 150,
        int $perRequestDelayMs = 200,
        int $timeoutSeconds = 30,
        string $userAgent = 'WaybackDownloader/1.0 (+https://example.org)'
    ) {
        $this->outDir = rtrim($outDir, "/");
        $this->maxPages = $maxPages;
        $this->perRequestDelayMs = $perRequestDelayMs;
        $this->timeoutSeconds = $timeoutSeconds;
        $this->userAgent = $userAgent;
    }
    /**
     * @param string $input  Either a main URL (https://example.com) or a Wayback snapshot URL
     * @param bool   $latest When true and $input is a main URL, pick the newest snapshot
     * @return array{outDir:string, zipPath:string}
     * @throws RuntimeException
     */
    public function download(string $input, bool $latest = true): array
    {
        $this->ensureDir($this->outDir);

        if ($this->isWaybackUrl($input)) {
            $domain = $this->domainFromWayback($input);
            $timestamp = $this->timestampFromWayback($input);
        } else {
            $domain = $this->normalizeDomain($input);
            if ($latest) {
                $timestamp = $this->latestTimestampFor($domain);
            } else {
                throw new RuntimeException(
                    "A normal URL was provided but latest=false and no timestamp supplied. " .
                    "Pass a Wayback URL or set latest=true."
                );
            }
        }

        // 1) List URLs from CDX
        $urls = $this->collectUrls($domain, $timestamp, $this->maxPages);

        // 2) Download all
        $count = 0;
        foreach ($urls as $archivedUrl) {
            $count++;
            $this->fetchOne($archivedUrl, $domain);
            if ($this->perRequestDelayMs > 0) {
                usleep($this->perRequestDelayMs * 1000);
            }
            // Optional: print progress to console
            if ($count % 10 === 0) {
                fwrite(STDERR, "Downloaded: $count / " . count($urls) . PHP_EOL);
            }
        }

        // 3) Zip output
        $zipPath = $this->zipOutput();

        return [
            'outDir' => $this->outDir,
            'zipPath' => $zipPath,
        ];
    }

    // --------------------- Core steps ---------------------

    private function collectUrls(string $domain, ?string $timestamp, int $limit): array
    {
        // CDX API (JSON):
        // - matchType=domain
        // - filter statuscode:200
        // - collapse by digest
        // - If timestamp provided, we scope to that day (YYYYMMDD)
        $params = [
            'url' => $domain . '/*',
            'matchType' => 'domain',
            'output' => 'json',
            'filter' => 'statuscode:200',
            'collapse' => 'digest',
            'limit' => (string) $limit,
        ];
        if ($timestamp && strlen($timestamp) >= 8) {
            $day = substr($timestamp, 0, 8);
            $params['from'] = $day;
            $params['to'] = $day;
        }
        $cdxUrl = 'https://web.archive.org/cdx/search/cdx?' . http_build_query($params);
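        // Illustrative request (query string shown unencoded for readability):
        //   https://web.archive.org/cdx/search/cdx?url=example.com/*&matchType=domain
        //     &output=json&filter=statuscode:200&collapse=digest&limit=150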
        $data = $this->httpJson($cdxUrl);
        if (!is_array($data) || count($data) < 2) {
            throw new RuntimeException("No CDX rows returned.");
        }
        $rows = array_slice($data, 1); // skip header
        $urls = [];
        foreach ($rows as $row) {
            // Rows follow the default CDX header order; safest to read by index:
            // [0]=urlkey, [1]=timestamp, [2]=original, [3]=mimetype, [4]=statuscode, [5]=digest, [6]=length
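            // e.g. ["com,example)/about", "20240101000000", "http://example.com/about",
            //       "text/html", "200", "SHA1DIGESTXYZ", "5123"]  (values illustrative)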
            $ts = $row[1] ?? null;
            $orig = $row[2] ?? null;
            if (!$ts || !$orig) continue;
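            // The "id_" flag asks Wayback for the raw, unmodified capture
            // (no toolbar injection or archival link rewriting).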
            $archived = "https://web.archive.org/web/{$ts}id_/{$orig}";
            $urls[] = $archived;
        }
        return $urls;
    }

    private function fetchOne(string $archivedUrl, string $domain): void
    {
        [$data, $contentType] = $this->httpGet($archivedUrl);
        if ($data === null) {
            fwrite(STDERR, "Skip (null) $archivedUrl\n");
            return;
        }
        $path = $this->localPathFor($archivedUrl);
        $this->ensureDir(dirname($path));
        if ($this->isHtml($contentType)) {
            $html = $this->stripWaybackToolbar($data);
            $html = $this->rewriteHtml($html, $domain);
            file_put_contents($path, $html);
        } else {
            file_put_contents($path, $data);
        }
    }

    private function zipOutput(): string
    {
        $zipPath = dirname($this->outDir) . '/site.zip';
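        // e.g. outDir = /app/dist/site → the zip lands at /app/dist/site.zip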
        $zip = new ZipArchive();
        if ($zip->open($zipPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== true) {
            throw new RuntimeException("Unable to open zip: $zipPath");
        }
        // Resolve the output dir the same way getRealPath() resolves entries below,
        // so the relative entry names are computed against a matching prefix.
        $base = realpath($this->outDir) ?: $this->outDir;
        $dirIt = new RecursiveDirectoryIterator($this->outDir, RecursiveDirectoryIterator::SKIP_DOTS);
        $it = new RecursiveIteratorIterator($dirIt, RecursiveIteratorIterator::SELF_FIRST);
        foreach ($it as $file) {
            /** @var SplFileInfo $file */
            $real = $file->getRealPath();
            if ($real === false) continue;
            $rel = substr($real, strlen($base) + 1);
            if ($file->isDir()) {
                $zip->addEmptyDir($rel);
            } else {
                $zip->addFile($real, $rel);
            }
        }
        $zip->close();
        return $zipPath;
    }

    // --------------------- Helpers ---------------------

    private function isWaybackUrl(string $url): bool
    {
        return str_starts_with($url, 'https://web.archive.org/web/')
            || str_starts_with($url, 'http://web.archive.org/web/');
    }

    private function domainFromWayback(string $snapshotUrl): string
    {
        $m = [];
        // Accept an optional rewrite flag (id_, if_, im_, ...) after the timestamp.
        if (!preg_match('#/web/\d{8,14}(?:[a-z]{2}_)?/(.*)$#i', $snapshotUrl, $m)) {
            throw new RuntimeException("Invalid Wayback snapshot URL");
        }
        $rest = $m[1];
        if (!str_contains($rest, '://')) {
            $rest = 'http://' . $rest;
        }
        $parts = parse_url($rest);
        return $parts['host'] ?? throw new RuntimeException("Cannot parse domain from snapshot URL");
    }

    private function timestampFromWayback(string $snapshotUrl): string
    {
        $m = [];
        if (!preg_match('#/web/(\d{8,14})(?:[a-z]{2}_)?/#i', $snapshotUrl, $m)) {
            throw new RuntimeException("No timestamp found in snapshot URL");
        }
        return $m[1];
    }

    private function normalizeDomain(string $value): string
    {
        if (!str_contains($value, '://')) {
            $value = 'http://' . $value;
        }
        $parts = parse_url($value);
        return $parts['host'] ?? throw new RuntimeException("Invalid URL for domain");
    }

    private function latestTimestampFor(string $domain): string
    {
        $params = [
            'url' => $domain . '/*',
            'matchType' => 'domain',
            'output' => 'json',
            'filter' => 'statuscode:200',
            'collapse' => 'digest',
            'fl' => 'timestamp,original',
            // A negative limit asks the CDX server for the *last* rows, i.e.
            // the newest captures ("sort=desc" is not a documented CDX option).
            'limit' => '-1',
        ];
        $url = 'https://web.archive.org/cdx/search/cdx?' . http_build_query($params);
        $data = $this->httpJson($url);
        if (!is_array($data) || count($data) < 2) {
            throw new RuntimeException("No captures found for $domain");
        }
        return $data[1][0]; // first row after header → timestamp
    }

    private function localPathFor(string $archivedUrl): string
    {
        // Map https://web.archive.org/web/<ts>id_/http://example.com/path
        // → outDir/path (append index.html for trailing slash)
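        // e.g. https://web.archive.org/web/20240101000000id_/http://example.com/blog/
        //      → <outDir>/blog/index.html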
        $m = [];
        if (!preg_match('#https?://web\.archive\.org/web/\d+id?_/((?:https?:)?//)?(.+)$#i', $archivedUrl, $m)) {
            // fallback: sanitize whole URL as filename
            $safe = preg_replace('/[^a-zA-Z0-9_.-]/', '_', $archivedUrl);
            return $this->outDir . '/' . $safe;
        }
        $hostAndPath = $m[2];
        if (str_starts_with($hostAndPath, 'http://') || str_starts_with($hostAndPath, 'https://')) {
            $parts = parse_url($hostAndPath);
            $rel = ltrim($parts['path'] ?? '', '/');
        } else {
            $slash = strpos($hostAndPath, '/');
            $rel = ($slash === false) ? 'index.html' : substr($hostAndPath, $slash + 1);
        }
        if ($rel === '' || str_ends_with($rel, '/')) {
            $rel .= 'index.html';
        }
        return $this->outDir . '/' . $rel;
    }

    private function isHtml(?string $contentType): bool
    {
        return $contentType !== null && stripos($contentType, 'text/html') !== false;
    }

    private function stripWaybackToolbar(string $html): string
    {
        // Remove toolbar comments and scripts
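        // Captures fetched with the id_ flag are usually toolbar-free already;
        // this is defensive cleanup for any that still carry it.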
        $html = preg_replace(
            '#<!-- BEGIN WAYBACK TOOLBAR INSERT.*?END WAYBACK TOOLBAR INSERT -->#is',
            '',
            $html
        );
        $html = preg_replace(
            '#<script[^>]*src="https?://web\.archive\.org/web/[^"]+/static/js/wbhack\.js"[^>]*></script>#i',
            '',
            $html
        );
        return $html;
    }

    private function rewriteHtml(string $html, string $domain): string
    {
        // Use DOMDocument to adjust href/src; keep it tolerant
        $prev = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->preserveWhiteSpace = false;
        $doc->formatOutput = false;
        $encoding = mb_detect_encoding($html, ['UTF-8', 'ISO-8859-1', 'Windows-1252'], true) ?: 'UTF-8';
        if ($encoding !== 'UTF-8') {
            // 'HTML-ENTITIES' as a target encoding is deprecated since PHP 8.2;
            // encode non-ASCII code points as numeric entities instead.
            $html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x10FFFF], $encoding);
        }
        $doc->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
        $xpath = new DOMXPath($doc);
        $nodes = $xpath->query('//@href | //@src');
        foreach ($nodes as $attr) {
            /** @var DOMAttr $attr */
            $v = $attr->value;
            if ($v === '') continue;
            // Strip any Wayback prefix (absolute or site-relative, with or without
            // a rewrite flag such as id_/im_/cs_/js_), leaving the original URL
            // for the domain check below.
            $v = preg_replace('#^(?:https?://web\.archive\.org)?/web/\d+(?:[a-z]{2}_)?/#i', '', $v);
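            // e.g. https://web.archive.org/web/20240101000000id_/https://example.com/css/app.css
            //      → https://example.com/css/app.css, which the domain check below
            //        reduces to /css/app.css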
            // If it's a full URL to our domain, make it path-based
            if (preg_match('#^https?://#i', $v)) {
                $parts = parse_url($v);
                $host = $parts['host'] ?? '';
                if ($host && str_contains($host, $domain)) {
                    $v = $parts['path'] ?? '/';
                    if (!empty($parts['query'])) {
                        $v .= '?' . $parts['query'];
                    }
                }
            }
            // No further normalization here; rewritten values are stored as-is.
            $attr->value = $v;
        }

        // Fix inline styles: url(https://web.archive.org/web/...) → url(...)
        $styleNodes = $xpath->query('//@style');
        foreach ($styleNodes as $attr) {
            $attr->value = preg_replace(
                '#url\(((?:https?://web\.archive\.org)?/web/\d+(?:[a-z]{2}_)?/)+#i',
                'url(',
                $attr->value
            );
        }
        $out = $doc->saveHTML();
        libxml_use_internal_errors($prev);
        return $out;
    }

    // --------------------- HTTP ---------------------

    private function httpJson(string $url): mixed
    {
        [$data, $ct] = $this->httpGet($url);
        if ($data === null) {
            throw new RuntimeException("HTTP failed: $url");
        }
        $json = json_decode($data, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new RuntimeException("Invalid JSON from $url: " . json_last_error_msg());
        }
        return $json;
    }

    /**
     * @return array{0:?string,1:?string} [body, content-type]
     */
    private function httpGet(string $url): array
    {
        $ch = curl_init($url);
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            CURLOPT_TIMEOUT => $this->timeoutSeconds,
            CURLOPT_USERAGENT => $this->userAgent,
            CURLOPT_HEADER => true,
        ]);
        $resp = curl_exec($ch);
        if ($resp === false) {
            $err = curl_error($ch);
            curl_close($ch);
            fwrite(STDERR, "curl error: $err\n");
            return [null, null];
        }
        $status = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
        $hdrSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $headersRaw = substr($resp, 0, $hdrSize);
        $body = substr($resp, $hdrSize);
        curl_close($ch);
        if ($status < 200 || $status >= 400) {
            fwrite(STDERR, "HTTP $status: $url\n");
            return [null, null];
        }
        // Parse Content-Type from the *last* header block: with FOLLOWLOCATION,
        // $headersRaw contains one block per redirect hop.
        $blocks = explode("\r\n\r\n", trim($headersRaw));
        $ct = null;
        foreach (explode("\r\n", (string) end($blocks)) as $line) {
            if (stripos($line, 'Content-Type:') === 0) {
                $parts = explode(':', $line, 2);
                $ct = trim($parts[1] ?? '');
                break;
            }
        }
        return [$body, $ct];
    }

    // --------------------- FS ---------------------

    private function ensureDir(string $dir): void
    {
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            throw new RuntimeException("Cannot create dir: $dir");
        }
    }
}
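
// --------------------- Example usage ---------------------
// Minimal CLI sketch (the guard, filename, and argument handling are
// illustrative, not part of the class above):
//
//   php wayback_downloader.php https://example.com
//
if (PHP_SAPI === 'cli' && isset($argv[1])) {
    $downloader = new WaybackDownloader(
        outDir: __DIR__ . '/dist/site',
        maxPages: 50,             // keep a first run small
        perRequestDelayMs: 500    // extra politeness toward archive.org
    );
    $result = $downloader->download($argv[1]);
    fwrite(STDERR, "Done. Files: {$result['outDir']}, zip: {$result['zipPath']}\n");
}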