<?php
/**
 * WaybackDownloader
 *
 * Single-file PHP class that:
 * - Accepts a main URL or a Wayback snapshot URL
 * - Auto-finds the latest snapshot if you supply a main URL
 * - Queries CDX for a deduped list of 200-OK captures (up to maxPages)
 * - Downloads HTML + assets, strips the Wayback toolbar, rewrites links to local paths
 * - Writes a static folder and a site.zip
 *
 * Requirements: PHP 8+, curl, ZipArchive, DOMDocument enabled
 *
 * ⚖️ Use responsibly. Only download content you have rights to. Respect IA Terms.
 */
class WaybackDownloader
{
    private string $outDir;
    private int $maxPages;
    private int $perRequestDelayMs; // politeness delay between requests
    private int $timeoutSeconds;
    private string $userAgent;

    public function __construct(
        string $outDir = __DIR__ . '/dist/site',
        int $maxPages = 150,
        int $perRequestDelayMs = 200,
        int $timeoutSeconds = 30,
        string $userAgent = 'WaybackDownloader/1.0 (+https://example.org)'
    ) {
        $this->outDir = rtrim($outDir, "/");
        $this->maxPages = $maxPages;
        $this->perRequestDelayMs = $perRequestDelayMs;
        $this->timeoutSeconds = $timeoutSeconds;
        $this->userAgent = $userAgent;
    }
    /**
     * @param string $input Either a main URL (https://example.com) or a Wayback snapshot URL
     * @param bool $latest When true and $input is a main URL, pick the newest snapshot
     * @return array{outDir:string, zipPath:string}
     * @throws RuntimeException
     */
    public function download(string $input, bool $latest = true): array
    {
        $this->ensureDir($this->outDir);

        if ($this->isWaybackUrl($input)) {
            $domain = $this->domainFromWayback($input);
            $timestamp = $this->timestampFromWayback($input);
        } else {
            $domain = $this->normalizeDomain($input);
            if ($latest) {
                $timestamp = $this->latestTimestampFor($domain);
            } else {
                throw new RuntimeException(
                    "A normal URL was provided but latest=false and no timestamp supplied. " .
                    "Pass a Wayback URL or set latest=true."
                );
            }
        }

        // 1) List URLs from CDX
        $urls = $this->collectUrls($domain, $timestamp, $this->maxPages);

        // 2) Download all
        $count = 0;
        foreach ($urls as $archivedUrl) {
            $count++;
            $this->fetchOne($archivedUrl, $domain);
            if ($this->perRequestDelayMs > 0) {
                usleep($this->perRequestDelayMs * 1000);
            }
            // Optional: print progress to console
            if ($count % 10 === 0) {
                fwrite(STDERR, "Downloaded: $count / " . count($urls) . PHP_EOL);
            }
        }

        // 3) Zip output
        $zipPath = $this->zipOutput();

        return [
            'outDir' => $this->outDir,
            'zipPath' => $zipPath,
        ];
    }
    // --------------------- Core steps ---------------------
    private function collectUrls(string $domain, ?string $timestamp, int $limit): array
    {
        // CDX API (JSON)
        // - matchType=domain
        // - filter statuscode:200
        // - collapse by digest
        // - If timestamp provided, we scope to that day (YYYYMMDD)
        $params = [
            'url' => $domain . '/*',
            'matchType' => 'domain',
            'output' => 'json',
            'filter' => 'statuscode:200',
            'collapse' => 'digest',
            'limit' => (string) $limit,
        ];
        if ($timestamp && strlen($timestamp) >= 8) {
            $day = substr($timestamp, 0, 8);
            $params['from'] = $day;
            $params['to'] = $day;
        }
        $cdxUrl = 'https://web.archive.org/cdx/search/cdx?' . http_build_query($params);
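
        // Illustrative CDX exchange (values made up): with output=json the
        // server returns a JSON array whose first row is a header, followed by
        // one row per capture, which is what the parsing below expects:
        //   [["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
        //    ["com,example)/","20240101000000","http://example.com/","text/html","200","G7H2K...","5123"]]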
        $data = $this->httpJson($cdxUrl);
        if (!is_array($data) || count($data) < 2) {
            throw new RuntimeException("No CDX rows returned.");
        }
        $rows = array_slice($data, 1); // skip header

        $urls = [];
        foreach ($rows as $row) {
            // Rows use the default CDX field order; read by index:
            // [0]=urlkey, [1]=timestamp, [2]=original, [3]=mimetype, [4]=statuscode, [5]=digest, [6]=length
            $ts = $row[1] ?? null;
            $orig = $row[2] ?? null;
            if (!$ts || !$orig) continue;
            // The id_ modifier asks Wayback for the raw capture bytes,
            // without the replay toolbar or rewritten links injected.
            $archived = "https://web.archive.org/web/{$ts}id_/{$orig}";
            $urls[] = $archived;
        }
        return $urls;
    }
    private function fetchOne(string $archivedUrl, string $domain): void
    {
        [$data, $contentType] = $this->httpGet($archivedUrl);
        if ($data === null) {
            fwrite(STDERR, "Skip (null) $archivedUrl\n");
            return;
        }
        $path = $this->localPathFor($archivedUrl);
        $this->ensureDir(dirname($path));
        if ($this->isHtml($contentType)) {
            $html = $this->stripWaybackToolbar($data);
            $html = $this->rewriteHtml($html, $domain);
            file_put_contents($path, $html);
        } else {
            file_put_contents($path, $data);
        }
    }
    private function zipOutput(): string
    {
        $zipPath = dirname($this->outDir) . '/site.zip';
        $zip = new ZipArchive();
        if ($zip->open($zipPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== true) {
            throw new RuntimeException("Unable to open zip: $zipPath");
        }

        // Resolve the base once so relative entry names are computed against
        // the same canonical form that getRealPath() returns below.
        $base = realpath($this->outDir);
        if ($base === false) {
            throw new RuntimeException("Output dir not found: {$this->outDir}");
        }

        $dirIt = new RecursiveDirectoryIterator($this->outDir, RecursiveDirectoryIterator::SKIP_DOTS);
        $it = new RecursiveIteratorIterator($dirIt, RecursiveIteratorIterator::SELF_FIRST);
        foreach ($it as $file) {
            /** @var SplFileInfo $file */
            $real = $file->getRealPath();
            if ($real === false) continue;
            $rel = substr($real, strlen($base) + 1);
            if ($file->isDir()) {
                $zip->addEmptyDir($rel);
            } else {
                $zip->addFile($real, $rel);
            }
        }
        $zip->close();
        return $zipPath;
    }
    // --------------------- Helpers ---------------------
    private function isWaybackUrl(string $url): bool
    {
        return str_starts_with($url, 'https://web.archive.org/web/')
            || str_starts_with($url, 'http://web.archive.org/web/');
    }

    private function domainFromWayback(string $snapshotUrl): string
    {
        $m = [];
        // Accept an optional modifier flag (id_, im_, ...) after the timestamp
        if (!preg_match('#/web/\d{8,14}(?:[a-z]{2}_)?/(.*)$#', $snapshotUrl, $m)) {
            throw new RuntimeException("Invalid Wayback snapshot URL");
        }
        $rest = $m[1];
        if (!str_contains($rest, '://')) {
            $rest = 'http://' . $rest;
        }
        $parts = parse_url($rest);
        return $parts['host'] ?? throw new RuntimeException("Cannot parse domain from snapshot URL");
    }

    private function timestampFromWayback(string $snapshotUrl): string
    {
        $m = [];
        if (!preg_match('#/web/(\d{8,14})(?:[a-z]{2}_)?/#', $snapshotUrl, $m)) {
            throw new RuntimeException("No timestamp found in snapshot URL");
        }
        return $m[1];
    }

    private function normalizeDomain(string $value): string
    {
        if (!str_contains($value, '://')) {
            $value = 'http://' . $value;
        }
        $parts = parse_url($value);
        return $parts['host'] ?? throw new RuntimeException("Invalid URL for domain");
    }
    private function latestTimestampFor(string $domain): string
    {
        $params = [
            'url' => $domain . '/*',
            'matchType' => 'domain',
            'output' => 'json',
            'filter' => 'statuscode:200',
            'collapse' => 'digest',
            'fl' => 'timestamp,original',
            // CDX rows come back oldest-first and the API has no sort
            // parameter; a negative limit returns the last (newest) matches.
            'limit' => '-1',
        ];
        $url = 'https://web.archive.org/cdx/search/cdx?' . http_build_query($params);
        $data = $this->httpJson($url);
        if (!is_array($data) || count($data) < 2) {
            throw new RuntimeException("No captures found for $domain");
        }
        return $data[1][0]; // first row after header → timestamp
    }
    private function localPathFor(string $archivedUrl): string
    {
        // Map https://web.archive.org/web/<ts>id_/http://example.com/path
        // → outDir/path (append index.html for a trailing slash), e.g.
        //   .../web/20240101000000id_/http://example.com/blog/ → outDir/blog/index.html
        // (?:[a-z]{2}_)? accepts any Wayback modifier flag (id_, im_, ...) or none.
        $m = [];
        if (!preg_match('#https?://web\.archive\.org/web/\d+(?:[a-z]{2}_)?/((?:https?:)?//)?(.+)$#i', $archivedUrl, $m)) {
            // fallback: sanitize whole URL as filename
            $safe = preg_replace('/[^a-zA-Z0-9_.-]/', '_', $archivedUrl);
            return $this->outDir . '/' . $safe;
        }
        $hostAndPath = $m[2];
        if (str_starts_with($hostAndPath, 'http://') || str_starts_with($hostAndPath, 'https://')) {
            $parts = parse_url($hostAndPath);
            $rel = ltrim($parts['path'] ?? '', '/');
        } else {
            $slash = strpos($hostAndPath, '/');
            $rel = ($slash === false) ? 'index.html' : substr($hostAndPath, $slash + 1);
        }
        if ($rel === '' || str_ends_with($rel, '/')) {
            $rel .= 'index.html';
        }
        return $this->outDir . '/' . $rel;
    }
    private function isHtml(?string $contentType): bool
    {
        return $contentType !== null && stripos($contentType, 'text/html') !== false;
    }

    private function stripWaybackToolbar(string $html): string
    {
        // Remove toolbar comments and scripts
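        // In replayed pages the injected toolbar sits between comment markers,
        // roughly like this (the div id is illustrative; the regex below keys
        // only on the BEGIN/END comment markers):
        //   <!-- BEGIN WAYBACK TOOLBAR INSERT -->
        //   <div id="wm-ipp-base"> ... </div>
        //   <!-- END WAYBACK TOOLBAR INSERT -->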
        $html = preg_replace(
            '#<!-- BEGIN WAYBACK TOOLBAR INSERT.*?END WAYBACK TOOLBAR INSERT -->#is',
            '',
            $html
        );
        $html = preg_replace(
            '#<script[^>]*src="https?://web\.archive\.org/web/[^"]+/static/js/wbhack\.js"[^>]*></script>#i',
            '',
            $html
        );
        return $html;
    }
    private function rewriteHtml(string $html, string $domain): string
    {
        // Use DOMDocument to adjust href/src; keep it tolerant
        $prev = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->preserveWhiteSpace = false;
        $doc->formatOutput = false;

        $encoding = mb_detect_encoding($html, ['UTF-8', 'ISO-8859-1', 'Windows-1252'], true) ?: 'UTF-8';
        if ($encoding !== 'UTF-8') {
            $html = mb_convert_encoding($html, 'UTF-8', $encoding);
        }
        // mb_convert_encoding(..., 'HTML-ENTITIES') is deprecated since PHP 8.2;
        // encode non-ASCII code points as numeric entities instead so loadHTML
        // (which assumes ISO-8859-1 without a charset hint) cannot mangle them.
        $html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, ~0], 'UTF-8');

        $doc->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
        $xpath = new DOMXPath($doc);

        $nodes = $xpath->query('//@href | //@src');
        foreach ($nodes as $attr) {
            /** @var DOMAttr $attr */
            $v = $attr->value;
            if ($v === '') continue;

            // Strip a Wayback prefix (absolute or root-relative) so the
            // original URL is left behind, e.g.
            //   https://web.archive.org/web/20240101id_/https://example.com/a.css → https://example.com/a.css
            $v = preg_replace('#^(?:https?://web\.archive\.org)?/web/\d+(?:[a-z]{2}_)?/#i', '', $v);

            // If it's a full URL to our domain, make it path-based
            if (preg_match('#^https?://#i', $v)) {
                $parts = parse_url($v);
                $host = $parts['host'] ?? '';
                if ($host === $domain || str_ends_with($host, '.' . $domain)) {
                    $v = $parts['path'] ?? '/';
                    if (!empty($parts['query'])) {
                        $v .= '?' . $parts['query'];
                    }
                }
            }
            $attr->value = $v;
        }

        // Fix inline styles: url(https://web.archive.org/web/<ts>id_/...) → url(...)
        $styleNodes = $xpath->query('//@style');
        foreach ($styleNodes as $attr) {
            $attr->value = preg_replace(
                '#url\((?:https?://web\.archive\.org)?/web/\d+(?:[a-z]{2}_)?/#i',
                'url(',
                $attr->value
            );
        }

        $out = $doc->saveHTML();
        libxml_clear_errors();
        libxml_use_internal_errors($prev);
        return $out;
    }
    // --------------------- HTTP ---------------------
    private function httpJson(string $url): mixed
    {
        [$data, $ct] = $this->httpGet($url);
        if ($data === null) {
            throw new RuntimeException("HTTP failed: $url");
        }
        $json = json_decode($data, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new RuntimeException("Invalid JSON from $url: " . json_last_error_msg());
        }
        return $json;
    }
    /**
     * @return array{0:?string,1:?string} body, content-type
     */
    private function httpGet(string $url): array
    {
        $ch = curl_init($url);
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 5,
            CURLOPT_TIMEOUT => $this->timeoutSeconds,
            CURLOPT_USERAGENT => $this->userAgent,
            CURLOPT_HEADER => true,
        ]);
        $resp = curl_exec($ch);
        if ($resp === false) {
            $err = curl_error($ch);
            curl_close($ch);
            fwrite(STDERR, "curl error: $err\n");
            return [null, null];
        }
        $status = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
        // With FOLLOWLOCATION, CURLINFO_HEADER_SIZE covers the headers of every
        // response in the redirect chain, so the body offset is still correct.
        $hdrSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $headersRaw = substr($resp, 0, $hdrSize);
        $body = substr($resp, $hdrSize);
        curl_close($ch);

        if ($status < 200 || $status >= 400) {
            fwrite(STDERR, "HTTP $status: $url\n");
            return [null, null];
        }

        // Parse content-type; keep scanning so the final response in a
        // redirect chain wins, not the first.
        $ct = null;
        foreach (explode("\r\n", $headersRaw) as $line) {
            if (stripos($line, 'Content-Type:') === 0) {
                $parts = explode(':', $line, 2);
                $ct = trim($parts[1] ?? '');
            }
        }
        return [$body, $ct];
    }
    // --------------------- FS ---------------------
    private function ensureDir(string $dir): void
    {
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            throw new RuntimeException("Cannot create dir: $dir");
        }
    }
}
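
// --------------------- Example usage ---------------------
// A minimal CLI sketch, not part of the class API; the argument handling and
// paths below are illustrative assumptions. Run e.g.:
//   php WaybackDownloader.php example.com
//   php WaybackDownloader.php "https://web.archive.org/web/20240101000000/https://example.com/"
if (PHP_SAPI === 'cli' && isset($argv[1])) {
    $downloader = new WaybackDownloader(
        outDir: __DIR__ . '/dist/site',
        maxPages: 150,
        perRequestDelayMs: 200
    );
    $result = $downloader->download($argv[1], latest: true);
    fwrite(STDERR, "Site folder: {$result['outDir']}\n");
    fwrite(STDERR, "Zip archive: {$result['zipPath']}\n");
}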