Skip to content

Instantly share code, notes, and snippets.

@natzir
Last active March 21, 2025 11:45
Show Gist options
  • Save natzir/c36e770205858a2f6922ed90d1226a7c to your computer and use it in GitHub Desktop.
Save natzir/c36e770205858a2f6922ed90d1226a7c to your computer and use it in GitHub Desktop.
javascript:(function () {
  /*
   * Wayback Machine URL extractor bookmarklet.
   *
   * Queries the Wayback Machine timemap (CDX) API for every archived HTML
   * page (HTTP 200) under the current origin, going through a list of public
   * CORS proxies with retries, and downloads the result as a CSV file with
   * the columns Domain, URL, MIME Type, Timestamp and Group Count.
   *
   * Authoring note: this source is meant to be pasted into a bookmark URL
   * field, where line breaks are removed. For that reason it uses block
   * comments only (a line comment would swallow everything after it) and
   * every statement ends with an explicit semicolon.
   */

  /* Bail out quietly when there is no page to attach the status box to. */
  if (typeof window === "undefined" || typeof document === "undefined" || !document.body) {
    return;
  }

  /* On file:, about: and similar pages location.origin is the string "null",
     which would make new URL(...) throw before any feedback is shown. */
  if (window.location.protocol !== "http:" && window.location.protocol !== "https:") {
    alert("Error: this bookmarklet only works on http(s) pages.");
    return;
  }

  const MAX_RETRIES = 3;      /* full passes over the proxy list */
  const RETRY_DELAY = 2e3;    /* ms to wait after a failed proxy */
  const TIMEOUT_MS = 3e5;     /* ms allowed for one request, body included */

  const url = window.location.origin;
  const domain = new URL(url).hostname;

  /* Each CDX filter needs its own "filter=" key; the MIME filter is now
     applied server-side as well as in the client-side check below. */
  const archiveAPI =
    "https://web.archive.org/web/timemap/json?url=" + encodeURIComponent(url) +
    "&matchType=prefix&collapse=urlkey&output=json" +
    "&fl=original,mimetype,timestamp,groupcount" +
    "&filter=statuscode:200&filter=mimetype:text/html";

  const encodedAPI = encodeURIComponent(archiveAPI);
  const proxyOptions = [
    "https://api.allorigins.win/get?disableCache=true&url=" + encodedAPI,
    "https://corsproxy.io/?" + encodedAPI,
    "https://cors-proxy.taskcluster.net/" + encodedAPI
  ];

  const loadingMessage = document.createElement("div");

  /* Replaces the status box content. Only static markup built in this script
     is ever passed in, so assigning innerHTML is safe here. */
  function updateLoadingMessage(html) {
    loadingMessage.innerHTML = html;
  }

  /* Detaches the status box if it is still attached to the page. */
  function removeLoadingMessage() {
    if (document.body.contains(loadingMessage)) {
      document.body.removeChild(loadingMessage);
    }
  }

  /*
   * Fetches `resource` and parses the body as JSON, aborting the whole
   * exchange (headers and body) after `timeoutMs`.
   * `onResponse` is called once a successful status line has arrived.
   * Rejects with an Error on timeout, non-2xx status or invalid JSON.
   */
  async function fetchJsonWithTimeout(resource, onResponse, timeoutMs = TIMEOUT_MS) {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), timeoutMs);
    try {
      const response = await fetch(resource, { signal: controller.signal });
      if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`);
      }
      onResponse();
      return await response.json();
    } catch (err) {
      /* Report an abort caused by our own timer with a readable message. */
      if (controller.signal.aborted) {
        throw new Error("Request timeout - Wayback Machine might be busy");
      }
      throw err;
    } finally {
      clearTimeout(timer);
    }
  }

  /*
   * Tries every proxy in order, up to MAX_RETRIES passes over the list,
   * starting at pass `attempt` and proxy `proxyIndex`.
   * Resolves with the decoded timemap payload; rejects once all passes
   * have failed.
   */
  async function tryProxies(attempt = 0, proxyIndex = 0) {
    const baseMessage =
      "Please wait: Contacting Wayback Machine...<br><br>" +
      "This might take several minutes. The Wayback Machine servers can be slow at times.<br><br>" +
      "Don't close this tab.";

    for (let round = attempt; round < MAX_RETRIES; round += 1) {
      const firstIndex = round === attempt ? proxyIndex : 0;
      for (let idx = firstIndex; idx < proxyOptions.length; idx += 1) {
        const proxyUrl = proxyOptions[idx];
        const attemptInfo =
          `<br><small>Attempt ${round + 1}/${MAX_RETRIES} using proxy ${idx + 1}/${proxyOptions.length}</small>`;

        updateLoadingMessage(
          baseMessage + "<br><small>Sending request and waiting for response...</small>" + attemptInfo
        );

        try {
          const payload = await fetchJsonWithTimeout(proxyUrl, () => {
            updateLoadingMessage(
              baseMessage + "<br><small>Processing response data...</small>" + attemptInfo
            );
          });

          /* allorigins wraps the upstream body as a string in "contents". */
          if (proxyUrl.includes("allorigins.win")) {
            if (!payload.contents) {
              throw new Error("Empty response from proxy");
            }
            return JSON.parse(payload.contents);
          }
          return payload;
        } catch (err) {
          console.warn(`Error with proxy ${idx + 1}:`, err);
          updateLoadingMessage(
            baseMessage + "<br><small>Connection issue. Waiting to retry...</small>" + attemptInfo
          );
          await new Promise((resolve) => setTimeout(resolve, RETRY_DELAY));
        }
      }
    }

    throw new Error(
      "Maximum number of retries exceeded. The Wayback Machine may be experiencing high traffic."
    );
  }

  /* Quotes one CSV field, doubling embedded double quotes so the field
     cannot break out of its column. */
  function csvField(value) {
    const text = value == null ? "" : String(value);
    return '"' + text.replace(/"/g, '""') + '"';
  }

  loadingMessage.innerText =
    "Please wait: Connecting to Wayback Machine...\nThis might take a few minutes. Don't close this tab.";
  Object.assign(loadingMessage.style, {
    position: "fixed",
    top: "10px",
    left: "50%",
    transform: "translateX(-50%)",
    backgroundColor: "#000",
    color: "#fff",
    padding: "15px 20px",
    borderRadius: "5px",
    zIndex: "9999",
    textAlign: "center",
    maxWidth: "400px",
    lineHeight: "1.5",
    fontFamily: "Arial, sans-serif"
  });
  document.body.appendChild(loadingMessage);

  tryProxies()
    .then((rows) => {
      /* The first row of the timemap payload is the column header. */
      if (!Array.isArray(rows) || rows.length < 2) {
        removeLoadingMessage();
        alert(
          "No archived URLs found on Archive.org for this domain. The site might not be archived or might be using robots.txt to prevent archiving."
        );
        return;
      }

      /* Keep the status box on screen while the CSV is being built. */
      updateLoadingMessage("Success! Preparing CSV file with archived URLs...");

      const lines = ["Domain,URL,MIME Type,Timestamp,Group Count"];
      rows.slice(1).forEach((row) => {
        if (row[1] === "text/html") {
          lines.push([domain, row[0], row[1], row[2], row[3]].map(csvField).join(","));
        }
      });
      const count = lines.length - 1;

      const blob = new Blob([lines.join("\n") + "\n"], { type: "text/csv" });
      const href = URL.createObjectURL(blob);
      const link = document.createElement("a");
      link.href = href;
      link.download = "archive_urls_" + domain + ".csv";
      document.body.appendChild(link);
      link.click();
      document.body.removeChild(link);
      /* Release the blob once the browser has had time to start the download. */
      setTimeout(() => URL.revokeObjectURL(href), 1e3);

      removeLoadingMessage();
      alert(
        `Success! Downloaded ${count} archived URLs for ${domain}.\n\nThe CSV file contains all HTML pages that Wayback Machine has archived for this domain.`
      );
    })
    .catch((err) => {
      removeLoadingMessage();
      console.error("Error fetching data from Archive.org:", err);
      alert(
        "Error: " + err.message +
        "\n\nYou can try accessing the Wayback Machine directly at:\nhttps://web.archive.org/web/*/" + url
      );
    });
})();
@natzir
Copy link
Author

natzir commented Mar 19, 2025

Wayback Machine URL Extractor Bookmarklet

This bookmarklet extracts archived URLs from the Wayback Machine for any website you're visiting.

What It Does

  • Fetches all unique archived URLs (HTTP 200 status only) for the current domain
  • Filters for HTML pages only
  • Generates a CSV with domain, URL, MIME type, timestamp, and group count
  • Uses multiple CORS proxies with automatic retries

How to Use

  1. Copy the bookmarklet code
  2. Create a new bookmark in your browser
  3. Paste the code in the URL field
  4. Visit any website (e.g., https://natzir.com)
  5. Click the bookmark
  6. Wait for processing (can take several minutes)
  7. A CSV file will download automatically

Tips

  • Be patient! The Wayback Machine API can be slow
  • Don't close the tab while running
  • Works best on domains with established archive history

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment