Saves URLs (from either a list or a root URL) to the Internet Archive's Wayback Machine
#! /usr/bin/env python3
#
"""
Save web pages to the Wayback Machine. The urlsIn argument can either be
a text file with URLs (one URL per line), or a single URL. In the first
(input file) case the script simply saves each URL. In the latter case
(input URL) it extracts all links from that page and saves those as well
as the root URL (useful for saving a page with all of its direct
references). The optional --extensions argument can be used to limit
this to one or more specific file extensions. E.g. the following will
only save the root URL and any linked PDF and docx resources:

saveToWayback.py --extensions pdf,docx https://whatever.com/reports/ output.csv

Requirements:

- Waybackpy https://akamhy.github.io/waybackpy/
- beautifulsoup4 https://www.crummy.com/software/BeautifulSoup/
"""

import os
import sys
import csv
import re
import argparse
import requests
import waybackpy
from bs4 import BeautifulSoup

# Create parser
parser = argparse.ArgumentParser(
    description="Save URLs to the Wayback Machine")


def parseCommandLine():
    """Parse command line"""
    # Add arguments
    parser.add_argument('urlsIn',
                        action="store",
                        type=str,
                        help="either a file with URLs, or a single URL")
    parser.add_argument('fileOut',
                        action="store",
                        type=str,
                        help="output file")
    parser.add_argument('--extensions', '-e',
                        type=str,
                        help="comma-separated list of file extensions; if the input "
                             "is a URL, only linked resources with these extensions "
                             "are saved",
                        action='store',
                        dest='extString',
                        default="")
    parser.add_argument('--maxtries', '-t',
                        type=int,
                        help="maximum number of tries to save each URL",
                        action='store',
                        dest='maxTries',
                        default=2)
    # Parse arguments
    args = parser.parse_args()
    return args

def errorExit(msg):
    """Print error message to stderr and exit"""
    sys.stderr.write("Error: " + msg + "\n")
    sys.exit(1)

def isURL(url):
    """
    Check if string is a URL
    """
    # Source: https://stackoverflow.com/a/7160778/1209004
    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return re.match(regex, url) is not None
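
# Illustrative examples (hypothetical inputs): the regex above only accepts
# strings with an explicit http(s)/ftp(s) scheme:
#
#   isURL("https://example.com/reports/")  -> True
#   isURL("example.com/reports/")          -> False (no scheme)
#   isURL("urls.txt")                      -> False (main() then treats it as an input file)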

def saveURL(url):
    """
    Save one URL; return archived URL, success flag and error message
    (the latter only in case of errors)
    """
    user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
    errorMsg = ""
    wayback = waybackpy.Url(url, user_agent)
    try:
        archive = wayback.save()
        url_archived = archive.archive_url
        success = True
    except waybackpy.exceptions.WaybackError as error:
        success = False
        url_archived = ""
        errorMsg = str(error)
    return url_archived, success, errorMsg

def urlsFromFile(inputFile):
    """Read URLs from a text file (one URL per line), return as de-duplicated list"""
    urls = []
    with open(inputFile) as fIn:
        for line in fIn:
            line = line.strip()
            if line != "":
                urls.append(line)
    return list(set(urls))

def urlsFromPage(inputURL, extensions):
    """Return de-duplicated list with the input URL and all links on that page
    whose href value contains any of the strings in extensions"""
    urls = [inputURL]
    reqs = requests.get(inputURL)
    soup = BeautifulSoup(reqs.text, 'html.parser')
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is not None:
            if any(substring in href for substring in extensions):
                urls.append(href)
    return list(set(urls))

def main():
    args = parseCommandLine()
    urlsIn = args.urlsIn
    fileOut = args.fileOut
    extString = args.extString
    # If no --extensions value was given, extString is empty and extensions
    # becomes [""], which matches any href (i.e. all links are saved)
    extensions = extString.split(",")
    maxTries = args.maxTries

    # Get URLs from input page or file
    if isURL(urlsIn):
        urls = urlsFromPage(urlsIn, extensions)
    elif os.path.isfile(urlsIn):
        urls = urlsFromFile(urlsIn)
    else:
        errorExit("urlsIn is neither a file nor a URL")

    # Open output file in write mode
    of = open(fileOut, "w", encoding="utf-8")
    # Create CSV writer object
    csvOut = csv.writer(of, lineterminator='\n')
    # Write header row to output file
    csvOut.writerow(["url", "url_archived", "success", "errorMsg"])

    for url in urls:
        print("Processing URL: " + url)
        success = False
        tries = 0
        # Retry until the URL was saved successfully or maxTries is reached
        while not success and tries < maxTries:
            url_archived, success, errorMsg = saveURL(url)
            tries += 1
        print("Success: " + str(success))
        csvOut.writerow([url, url_archived, str(success), errorMsg])

    of.close()


if __name__ == "__main__":
    main()
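
Usage sketch (the file names below are hypothetical). To save every URL listed in a text file, one URL per line:

saveToWayback.py urls.txt results.csv

Or to save a page together with all of its linked PDF and docx resources:

saveToWayback.py --extensions pdf,docx https://example.com/reports/ results.csv

In both cases the script writes a CSV file (results.csv above) with the columns url, url_archived, success and errorMsg, one row per processed URL.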