Clean an HTTrack archive so that it can be used as a static site. This particular script was used to convert a WordPress site into a static site. Note that it assumes links to external sites are kept as absolute URLs. I've included some sample HTTrack options for example.com that work with this script; they appear after the script, along with a short usage sketch.
import argparse
import glob
import os
import re
import shutil
import typing
import urllib.parse
import urllib.request
HTTRACK_TAG = '<!-- Added by HTTrack --><meta http-equiv="content-type" content="text/html;charset=UTF-8" /><!-- /Added by HTTrack -->'

# files that HTTrack may miss
EXTRA_FILES = ['favicon.ico', 'wp-includes/js/wp-emoji-release.min.js']


class HttrackIndexException(Exception):
    pass


class IndexNotFound(Exception):
    pass
def get_redirect_html(base_path: str = "/") -> str:
    return f'''<!DOCTYPE HTML>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<meta http-equiv="refresh" content="0; url={base_path}">
<script type="text/javascript">
window.location.href = "{base_path}"
</script>
<title>Page Redirection</title>
</head>
<body>
If you are not redirected automatically, follow this <a href='{base_path}'>link to the main site</a>.
</body>
</html>'''
def get_srcset_paths(srcset: str) -> typing.List[str]:
    final_paths = []
    raw_paths = srcset.split(",")
    for raw_path in raw_paths:
        path_parts = raw_path.split(" ")
        # drop the trailing width/density descriptor (e.g. "300w" or "2x"), if present
        path = " ".join(path_parts[:-1]) if len(path_parts) > 1 else path_parts[0]
        final_paths.append(path.strip())
    return final_paths
def get_extra_files(site_url: str, dest_dir: str, paths: typing.List[str]) -> None:
    for path in paths:
        if path.startswith("/"):
            path = path[1:]
        dest_path = os.path.join(dest_dir, path)
        if os.path.exists(dest_path):
            continue
        dirname = os.path.dirname(dest_path)
        os.makedirs(dirname, exist_ok=True)
        source_url = f"{site_url}/{path}"
        try:
            print(f"Downloading {source_url} ...", end=" ")
            urllib.request.urlretrieve(source_url, dest_path)
            print("DONE")
        except Exception:
            print("FAIL")
def get_httrack_source_dir(source_dir: str, netloc: str) -> str:
    dirs = os.listdir(source_dir)
    if 'index.html' in dirs:
        # having an 'index.html' file is a good sign, but the HTTrack index doesn't count
        with open(os.path.join(source_dir, 'index.html'), 'rt', encoding='utf-8') as f:
            contents = f.read()
        if contents.find("<title>List of available projects - HTTrack Website Copier</title>") > -1:
            raise HttrackIndexException
        return source_dir
    # maybe this is the HTTrack project directory...
    if netloc in dirs:
        # recurse so that we ensure that this directory has an index file
        return get_httrack_source_dir(os.path.join(source_dir, netloc), netloc)
    raise IndexNotFound
def main(site_url: str, source_dir: str, dest_dir: str, base_path: typing.Optional[str] = None, clobber_dest: bool = False) -> None:
    # check that the URL has the correct format
    parsed_url = urllib.parse.urlparse(site_url)
    if parsed_url.scheme == '':
        raise SystemExit("Scheme (e.g. 'https://') must be included in the given URL!")
    if site_url.endswith("/"):
        site_url = site_url[:-1]
    # check that the source path is a valid HTTrack archive
    try:
        source_dir = get_httrack_source_dir(source_dir, parsed_url.netloc)
    except IndexNotFound:
        raise SystemExit("Source HTTrack directory doesn't have an index file!")
    except HttrackIndexException:
        raise SystemExit("The source folder looks like the HTTrack index! This should point to a specific project directory instead.")
    # check that the destination directory doesn't exist
    if os.path.exists(dest_dir):
        if not clobber_dest:
            raise SystemExit("The destination directory already exists!")
        shutil.rmtree(dest_dir)
    # ensure that the base path is in the correct format
    if base_path is None:
        base_path = "/"
    if not base_path.startswith("/"):
        base_path = "/" + base_path
    if len(base_path) > 1 and base_path.endswith("/"):
        base_path = base_path[:-1]
    # copy the site to its destination (shutil.copytree replaces
    # distutils.dir_util.copy_tree; distutils was removed in Python 3.12)
    shutil.copytree(source_dir, dest_dir)
    # remove extra indices
    for f in glob.glob(os.path.join(dest_dir, 'index[a-z0-9]*.html')):
        os.remove(f)
    # remove all PHP files at any depth
    for f in glob.glob(os.path.join(dest_dir, '**', '*.php'), recursive=True):
        os.remove(f)
    # remove extra feed folders at any depth; the isdir check also skips
    # nested feed folders that were already removed with a parent
    for f in glob.glob(os.path.join(dest_dir, '**', 'feed'), recursive=True):
        if os.path.isdir(f):
            shutil.rmtree(f)
    # add obfuscated or missing files
    get_extra_files(site_url, dest_dir, EXTRA_FILES)
    # create a 404 page if it doesn't already exist
    redirect_html = get_redirect_html(base_path)
    path_404 = os.path.join(dest_dir, '404.html')
    if not os.path.exists(path_404):
        with open(path_404, 'wt', encoding='utf-8') as f:
            f.write(redirect_html)
    # walk the entire directory tree
    for root, _, files in os.walk(dest_dir):
        # perform operations on all HTML files
        html_files = glob.glob(os.path.join(root, '*.html'))
        for html_file in html_files:
            with open(html_file, 'rt', encoding='utf-8') as f:
                contents = f.read()
            # remove the HTTrack tag
            contents = contents.replace(HTTRACK_TAG, '')
            # convert relative index.html links to point at their directory,
            # e.g. href="foo/index.html" becomes href="foo/"
            contents = re.sub(r"href=(['\"])(?!http)(?:([^'\"]*)\/+)?index\.html(['\"])", r"href=\1\2/\3", contents, flags=re.MULTILINE)
            # convert references to the site domain to an absolute URI
            replacement_base_path = "" if base_path == "/" else base_path
            contents = contents.replace(site_url, replacement_base_path)
            # convert escaped references to the site domain (e.g. inside JSON,
            # where "/" appears as "\/") to an escaped absolute URI
            escaped_site_url = re.sub(r'([/])', r'\\\1', site_url)
            escaped_base_path = re.sub(r'([/])', r'\\\1', replacement_base_path)
            contents = contents.replace(escaped_site_url, escaped_base_path)
            # retrieve `srcset` images that HTTrack missed
            srcsets = re.findall(r"srcset=['\"](.*?)\d[xw]['\"]", contents, flags=re.MULTILINE)
            for srcset in srcsets:
                paths = get_srcset_paths(srcset)
                get_extra_files(site_url, dest_dir, paths)
            with open(html_file, 'wt', encoding='utf-8') as f:
                f.write(contents)
        # add index.html files to all directories that don't contain one
        if 'index.html' not in files:
            with open(os.path.join(root, 'index.html'), 'wt', encoding='utf-8') as f:
                f.write(redirect_html)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Clean an HTTrack archive to use as a static site")
    parser.add_argument('site_url', help="URL of the archived site")
    parser.add_argument('source_dir', help="HTTrack source directory")
    parser.add_argument('dest_dir', help="New directory for the static site")
    parser.add_argument('--base', default=None, help="The base path that the static site will be served from")
    parser.add_argument('--clobber', default=False, action='store_true', help="Overwrite the destination directory")
    args = parser.parse_args()
    main(args.site_url, args.source_dir, args.dest_dir, base_path=args.base, clobber_dest=args.clobber)
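
A minimal usage sketch. The filename clean_httrack.py and the paths below are assumptions for illustration; the gist's actual filename isn't shown here. With base_path left at its default of "/", references to the old domain are rewritten to root-relative paths.

# From the shell (script name is an assumption):
#   python clean_httrack.py https://example.com ./example-archive ./public --base /blog --clobber
# The equivalent call from Python, importing main() directly:
from clean_httrack import main  # hypothetical module name

main(
    "https://example.com",   # site_url: the scheme is required
    "./example-archive",     # source_dir: the HTTrack project directory
    "./public",              # dest_dir: where the static site is written
    base_path="/blog",       # serve the static site from /blog instead of /
    clobber_dest=True,       # overwrite ./public if it already exists
)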
Sample HTTrack options for example.com:
Near=0
Test=0
ParseAll=1
HTMLFirst=1
Cache=1
NoRecatch=0
Dos=0
Index=0
WordIndex=0
MailIndex=0
Log=1
RemoveTimeout=0
RemoveRateout=0
KeepAlive=1
FollowRobotsTxt=2
NoErrorPages=0
NoExternalPages=0
NoPwdInPages=0
NoQueryStrings=0
NoPurgeOldFiles=0
Cookies=1
CheckType=1
ParseJava=1
HTTP10=0
TolerantRequests=0
UpdateHack=1
URLHack=1
StoreAllInCache=0
LogType=0
UseHTTPProxyForFTP=1
Build=0
PrimaryScan=3
Travel=1
GlobalTravel=0
RewriteLinks=0
BuildString=%%h%%p/%%n%%q.%%t
Category=
MaxHtml=
MaxOther=
MaxAll=
MaxWait=
Sockets=
Retry=2
MaxTime=
TimeOut=6
RateOut=
UserID=Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)
Footer=(none)
AcceptLanguage=en, *
OtherHeaders=
DefaultReferer=
MaxRate=
WildCardFilters=+*.css +*.js -ad.doubleclick.net/* -mime:application/foobar%0d%0a+*.gif +*.jpg +*.jpeg +*.png +*.tif +*.bmp%0d%0a+*.zip +*.tar +*.tgz +*.gz +*.rar +*.z +*.exe%0d%0a+*.mov +*.mpg +*.mpeg +*.avi +*.asf +*.mp3 +*.mp2 +*.rm +*.wav +*.vob +*.qt +*.vid +*.ac3 +*.wma +*.wmv
Proxy=
Port=
Depth=
ExtDepth=0
MaxConn=
MaxLinks=
MIMEDefsExt1=
MIMEDefsExt2=
MIMEDefsExt3=
MIMEDefsExt4=
MIMEDefsExt5=
MIMEDefsExt6=
MIMEDefsExt7=
MIMEDefsExt8=
MIMEDefsMime1=
MIMEDefsMime2=
MIMEDefsMime3=
MIMEDefsMime4=
MIMEDefsMime5=
MIMEDefsMime6=
MIMEDefsMime7=
MIMEDefsMime8=
CurrentUrl=example.com
CurrentAction=0
CurrentURLList=
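
The WildCardFilters value above stores one filter rule per line, with the line breaks URL-encoded as %0d%0a (CRLF). A small sketch for decoding a value like it back into individual rules; the raw string here is a shortened example, not the full value from the profile:

import urllib.parse

# %0d%0a decodes to "\r\n", so unquote + splitlines recovers the rules
raw = "+*.css +*.js -ad.doubleclick.net/*%0d%0a+*.gif +*.jpg +*.png"
for rule in urllib.parse.unquote(raw).splitlines():
    print(rule)
# prints:
#   +*.css +*.js -ad.doubleclick.net/*
#   +*.gif +*.jpg +*.png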