@bitsgalore
Last active April 22, 2021 22:01
Saves URLs (from either a list or a root URL) to the Internet Archive's Wayback Machine
#! /usr/bin/env python3
#
"""
Save web pages to the Wayback Machine. The urlsIn argument can be either
a text file with URLs (one URL per line) or a single URL. In the first
case (input file) the script simply saves each URL. In the second case
(input URL) it extracts all links from that page and saves them along
with the root URL (useful for saving a page with all of its direct
references). The optional --extensions argument limits the linked
resources that are saved to one or more specific file extensions.
E.g. the following only saves the root URL and any linked PDF and
docx resources:

saveToWayback.py --extensions pdf,docx https://whatever.com/reports/ output.csv

Requirements:
- waybackpy https://akamhy.github.io/waybackpy/
- beautifulsoup4 https://www.crummy.com/software/BeautifulSoup/
"""
import os
import sys
import csv
import re
import argparse
import requests
import waybackpy
from bs4 import BeautifulSoup

# Create parser
parser = argparse.ArgumentParser(
    description="Save URLs to the Wayback Machine")


def parseCommandLine():
    """Parse command line"""
    # Add arguments
    parser.add_argument('urlsIn',
                        action="store",
                        type=str,
                        help="either a file with URLs, or a single URL")
    parser.add_argument('fileOut',
                        action="store",
                        type=str,
                        help="output file")
    parser.add_argument('--extensions', '-e',
                        type=str,
                        help="comma-separated list of file extensions; only "
                             "linked resources with these extensions are saved",
                        action='store',
                        dest='extString',
                        default="")
    parser.add_argument('--maxtries', '-t',
                        type=int,
                        help="maximum number of tries to save each URL",
                        action='store',
                        dest='maxTries',
                        default=2)
    # Parse arguments
    args = parser.parse_args()
    return args

def errorExit(msg):
    """Print error message to stderr and exit with non-zero status"""
    msgString = ("Error: " + msg + "\n")
    sys.stderr.write(msgString)
    sys.exit(1)

def isURL(url):
    """
    Check if string is a URL
    """
    # Source: https://stackoverflow.com/a/7160778/1209004
    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return re.match(regex, url) is not None

def saveURL(url):
    """
    Save one URL; return archived URL, success flag and error message
    (in case of errors)
    """
    user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
    errorMsg = ""
    wayback = waybackpy.Url(url, user_agent)
    try:
        archive = wayback.save()
        url_archived = archive.archive_url
        success = True
    except waybackpy.exceptions.WaybackError as error:
        success = False
        url_archived = ""
        errorMsg = str(error)
    return url_archived, success, errorMsg

def urlsFromFile(inputFile):
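    """Read URLs from a text file (one URL per line), return de-duplicated list"""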
    urls = []
    with open(inputFile) as fIn:
        for line in fIn:
            line = line.strip()
            if line != "":
                urls.append(line)
    return list(set(urls))

def urlsFromPage(inputURL, extensions):
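    """
    Fetch the input URL, extract all links whose href contains one of the
    given extensions (the default empty extension matches every link), and
    return them together with the input URL as a de-duplicated list
    """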
    urls = [inputURL]
    reqs = requests.get(inputURL)
    soup = BeautifulSoup(reqs.text, 'html.parser')
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is not None:
            if any(substring in href for substring in extensions):
                urls.append(href)
    return list(set(urls))

def main():
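    """Collect input URLs, save each one to the Wayback Machine,
    and write the results to a CSV file"""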
    args = parseCommandLine()
    urlsIn = args.urlsIn
    fileOut = args.fileOut
    extString = args.extString
    extensions = extString.split(",")
    maxTries = args.maxTries
    # Get URLs from input page or file
    if isURL(urlsIn):
        urls = urlsFromPage(urlsIn, extensions)
    elif os.path.isfile(urlsIn):
        urls = urlsFromFile(urlsIn)
    else:
        errorExit("urlsIn is neither a file nor a URL")
    # Open output file in write mode
    of = open(fileOut, "w", encoding="utf-8")
    # Create CSV writer object
    csvOut = csv.writer(of, lineterminator='\n')
    # Write header row to output file
    csvOut.writerow(["url", "url_archived", "success", "errorMsg"])
    for url in urls:
        print("Processing URL: " + url)
        success = False
        tries = 0
        while not success and tries < maxTries:
            url_archived, success, errorMsg = saveURL(url)
            tries += 1
        print("Success: " + str(success))
        csvOut.writerow([url, url_archived, str(success), errorMsg])
    of.close()

if __name__ == "__main__":
    main()
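
Example usage (the file and host names below are illustrative):

saveToWayback.py urls.txt results.csv
saveToWayback.py --extensions pdf,docx https://example.com/reports/ results.csv

The first call saves every URL listed in urls.txt; the second saves
https://example.com/reports/ plus any linked PDF and docx resources. In both
cases results.csv gets one row per URL, with the columns url, url_archived,
success and errorMsg.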