Skip to content

Instantly share code, notes, and snippets.

@ortsed
Created November 8, 2017 21:55
Show Gist options
  • Save ortsed/a044a21688459548dc098df65f94e50a to your computer and use it in GitHub Desktop.
Save ortsed/a044a21688459548dc098df65f94e50a to your computer and use it in GitHub Desktop.
Get all versions from archive.org
# 1. Set URL below to the address of the site whose archived versions you want.
# 2. Run this script and redirect its output to a text file, e.g. `python script.py > FILENAME`.
# 3. Download the listed snapshots with `wget -i FILENAME`.
import json
from urllib.parse import urlencode, quote_plus
URL = "https://www.url.com"
# Year whose captures are listed; the original script hard-coded 2017.
YEAR = 2017


def build_calendar_url(url, year=2017):
    """Return the archive.org calendar-captures endpoint URL for *url*.

    Args:
        url: Address of the site whose captures should be listed.
        year: Value sent as the ``selected_year`` query parameter.

    Returns:
        The request URL, with *url* percent-encoded via ``quote_plus``.
    """
    query = urlencode({"url": url}, quote_via=quote_plus)
    return "https://web.archive.org/__wb/calendarcaptures?%s&selected_year=%s" % (
        query,
        year,
    )


archive_org_json_url = build_calendar_url(URL, YEAR)


def get_json(url):
    """Fetch *url* and decode the response body as JSON.

    Args:
        url: Address to request through ``get_url``.

    Returns:
        The decoded JSON value, or ``False`` (after printing "Invalid Json")
        when no body was received or the body is not valid UTF-8 JSON.
    """
    response = get_url(url)
    # get_url returns the SSLError object itself on a TLS failure; that
    # object has no ``data`` attribute, so treat it like an unusable body.
    body = getattr(response, "data", None)
    if body is None:
        print("Invalid Json")
        return False
    try:
        # UnicodeDecodeError and json.JSONDecodeError both derive from
        # ValueError, so one handler covers a bad encoding and bad JSON.
        return json.loads(body.decode('utf8'))
    except ValueError:
        print("Invalid Json")
        return False


def get_url(url):
    """Issue a GET request for *url* with certificate verification enabled.

    Args:
        url: Address to request.

    Returns:
        The urllib3 response on success. On an SSL error, "SSL Error" is
        printed and the caught exception object is returned instead of
        being raised, so callers must check what they received.
    """
    # Imported here rather than at module level so the rest of the script
    # can be imported without these third-party packages installed.
    import urllib3
    import certifi
    import urllib3.contrib.pyopenssl

    # Patch urllib3 to use the pyOpenSSL-backed TLS implementation.
    urllib3.contrib.pyopenssl.inject_into_urllib3()
    urllib3.disable_warnings()
    http = urllib3.PoolManager(
        cert_reqs='CERT_REQUIRED',  # Force certificate check.
        ca_certs=certifi.where(),  # Path to the Certifi bundle.
    )
    try:
        return http.request('GET', url)
    except urllib3.exceptions.SSLError as e:
        print("SSL Error")
        return e


def iter_capture_urls(calendar, url):
    """Yield a Wayback Machine snapshot URL for every timestamp in *calendar*.

    Args:
        calendar: Decoded calendar-captures response: three levels of nested
            lists (presumably months, weeks and days -- confirm against the
            API response) whose innermost entries are either empty or a
            mapping with a "ts" list of capture timestamps. Falsy values at
            any level, including ``False`` from a failed fetch, are skipped.
        url: Original site address appended after each timestamp.

    Yields:
        Strings of the form ``https://web.archive.org/web/<timestamp>/<url>``.
    """
    for outer in calendar or ():
        for middle in outer or ():
            for entry in middle or ():
                if entry and "ts" in entry:
                    for timestamp in entry["ts"]:
                        yield "https://web.archive.org/web/%s/%s" % (timestamp, url)


def main():
    """Print one snapshot URL per line for every capture of URL in YEAR."""
    # Fetch only after get_json/get_url are defined; calling get_json at the
    # top of the file raised NameError before anything else could run.
    archive_org_json = get_json(archive_org_json_url)
    if archive_org_json is False:
        # SystemExit with a message writes it to stderr and exits with
        # status 1, keeping the explanation out of the piped URL list.
        raise SystemExit("Could not retrieve the capture calendar from archive.org")
    for capture_url in iter_capture_urls(archive_org_json, URL):
        print(capture_url)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment