Clean an HTTrack archive so that it can be used as a static site. This particular script was used to convert a WordPress site into a static site. Note that it assumes links to external sites are kept as absolute URLs. I've included some sample HTTrack options for example.com that work with this script; they follow the code, after a short usage sketch.
import argparse
import glob
import os
import re
import shutil
import typing
import urllib.parse
import urllib.request

HTTRACK_TAG = '<!-- Added by HTTrack --><meta http-equiv="content-type" content="text/html;charset=UTF-8" /><!-- /Added by HTTrack -->'
# files that HTTrack may miss
EXTRA_FILES = ['favicon.ico', 'wp-includes/js/wp-emoji-release.min.js']
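# Note (added, not from the original): HTTrack only mirrors files it sees
# referenced in the crawled markup. favicon.ico is often requested implicitly
# by browsers, and WordPress loads wp-emoji-release.min.js from an inline
# script, so both are easy for the crawler to miss.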
class HttrackIndexException(Exception):
    pass


class IndexNotFound(Exception):
    pass
def get_redirect_html(base_path: str = "/"):
    return f'''<!DOCTYPE HTML>
<html lang="en-US">
  <head>
    <meta charset="UTF-8">
    <meta http-equiv="refresh" content="0; url={base_path}">
    <script type="text/javascript">
      window.location.href = "{base_path}"
    </script>
    <title>Page Redirection</title>
  </head>
  <body>
    If you are not redirected automatically, follow this <a href='{base_path}'>link to the main site</a>.
  </body>
</html>'''
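# the page above redirects with both a <meta http-equiv="refresh"> tag and a
# JavaScript fallback, so the redirect still fires if JavaScript is disabled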
def get_srcset_paths(srcset: str) -> typing.List[str]:
    final_paths = []
    raw_paths = srcset.split(",")
    for raw_path in raw_paths:
        path_parts = raw_path.split(" ")
        path = " ".join(path_parts[:-1])
        final_paths.append(path.strip())
    return final_paths
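# Illustrative example (added, not from the original):
# get_srcset_paths('a-300.jpg 300w, a-600.jpg 600w') returns
# ['a-300.jpg', 'a-600.jpg'] -- the trailing width/density descriptor is
# dropped from each comma-separated entry.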
def get_extra_files(site_url: str, dest_dir: str, paths: typing.List[str]) -> None:
    for path in paths:
        if path.startswith("/"):
            path = path[1:]
        dest_path = os.path.join(dest_dir, path)
        if os.path.exists(dest_path):
            continue
        dirname = os.path.dirname(dest_path)
        os.makedirs(dirname, exist_ok=True)
        source_url = f"{site_url}/{path}"
        try:
            print(f"Downloading {source_url} ...", end=" ")
            urllib.request.urlretrieve(source_url, dest_path)
            print("DONE")
        except Exception:
            print("FAIL")
def get_httrack_source_dir(source_dir: str, netloc: str) -> str:
    dirs = os.listdir(source_dir)
    if 'index.html' in dirs:
        # having an 'index.html' file is a good sign, but the HTTrack index doesn't count
        with open(os.path.join(source_dir, 'index.html'), 'rt', encoding='utf-8') as f:
            contents = f.read()
        if contents.find("<title>List of available projects - HTTrack Website Copier</title>") > -1:
            raise HttrackIndexException
        return source_dir
    # maybe this is the HTTrack project directory...
    if netloc in dirs:
        # recurse so that we ensure that this directory has an index file
        return get_httrack_source_dir(os.path.join(source_dir, netloc), netloc)
    raise IndexNotFound
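# Note (added, not from the original): HTTrack normally writes the mirror into
# <project dir>/<hostname>/ and puts its own project index at the top level,
# which is why the function above descends one level into a directory named
# after the host.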
def main(site_url: str, source_dir: str, dest_dir: str, base_path: typing.Optional[str] = None, clobber_dest: bool = False) -> None:
    # check that the URL has the correct format
    parsed_url = urllib.parse.urlparse(site_url)
    if parsed_url.scheme == '':
        raise SystemExit("Scheme (e.g. 'https://') must be included in the given URL!")
    if site_url.endswith("/"):
        site_url = site_url[:-1]
    # check that the source path is a valid HTTrack archive
    try:
        source_dir = get_httrack_source_dir(source_dir, parsed_url.netloc)
    except IndexNotFound:
        raise SystemExit("Source HTTrack directory doesn't have an index file!")
    except HttrackIndexException:
        raise SystemExit("The source folder looks like the HTTrack index! This should point to a specific project directory instead.")
    # check that the destination directory doesn't exist
    if os.path.exists(dest_dir):
        if not clobber_dest:
            raise SystemExit("The destination directory already exists!")
        else:
            shutil.rmtree(dest_dir)
    # ensure that the base path is in the correct format
    if base_path is None:
        base_path = "/"
    if not base_path.startswith("/"):
        base_path = "/" + base_path
    if len(base_path) > 1 and base_path.endswith("/"):
        base_path = base_path[:-1]
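    # (illustrative, added) 'blog/' and '/blog/' both normalize to '/blog';
    # a base path of None normalizes to '/'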
    # copy the site to its destination (shutil.copytree replaces
    # distutils.dir_util.copy_tree, since distutils was removed in Python 3.12;
    # the destination is known not to exist at this point)
    shutil.copytree(source_dir, dest_dir)
    # remove extra indices
    for f in glob.glob(os.path.join(dest_dir, 'index[a-z0-9]*.html')):
        os.remove(f)
    # remove all php files at any depth ('**' only matches nested directories
    # when recursive=True is set)
    for f in glob.glob(os.path.join(dest_dir, '**/*.php'), recursive=True):
        os.remove(f)
    # remove extra feed folders at any depth
    for f in glob.glob(os.path.join(dest_dir, '**/feed'), recursive=True):
        if os.path.isdir(f):
            shutil.rmtree(f, ignore_errors=True)
    # add obfuscated or missing files
    get_extra_files(site_url, dest_dir, EXTRA_FILES)
    # create a 404 page if it doesn't already exist
    redirect_html = get_redirect_html(base_path)
    path_404 = os.path.join(dest_dir, '404.html')
    if not os.path.exists(path_404):
        with open(path_404, 'wt', encoding='utf-8') as f:
            f.write(redirect_html)
    # walk the entire directory tree
    for root, _, files in os.walk(dest_dir):
        # perform operations on all HTML files
        html_files = glob.glob(os.path.join(root, '*.html'))
        for html_file in html_files:
            with open(html_file, 'rt', encoding='utf-8') as f:
                contents = f.read()
            # remove the HTTrack tag
            contents = contents.replace(HTTRACK_TAG, '')
            # convert relative index.html links to point to the parent
            # directory, e.g. href="blog/index.html" -> href="blog/"
            # (the dot in 'index.html' is escaped so only a literal dot matches)
            contents = re.sub(r"href=(['\"])(?!http)(?:(.*)\/+)?index\.html(['\"])", "href=\\1\\2/\\3", contents, flags=re.MULTILINE)
            # convert references to the site domain to an absolute URI
            replacement_base_path = "" if base_path == "/" else base_path
            contents = contents.replace(site_url, replacement_base_path)
            # convert escaped references to the site domain to an escaped absolute URI
            escaped_site_url = re.sub(r'([/])', r'\\\1', site_url)
            escaped_base_path = re.sub(r'([/])', r'\\\1', replacement_base_path)
            contents = contents.replace(escaped_site_url, escaped_base_path)
            # retrieve `srcset` images that HTTrack missed (the non-greedy '.*?'
            # keeps multiple srcset attributes on one line from being merged)
            srcsets = re.findall(r"srcset=['\"](.*?)\d[xw]['\"]", contents, flags=re.MULTILINE)
            for srcset in srcsets:
                paths = get_srcset_paths(srcset)
                get_extra_files(site_url, dest_dir, paths)
            with open(html_file, 'wt', encoding='utf-8') as f:
                f.write(contents)
        # add an index.html to every directory that doesn't contain one
        if 'index.html' not in files:
            with open(os.path.join(root, 'index.html'), 'wt', encoding='utf-8') as f:
                f.write(redirect_html)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Clean an HTTrack archive to use as a static site")
    parser.add_argument('site_url', help="URL of the archived site")
    parser.add_argument('source_dir', help="HTTrack source directory")
    parser.add_argument('dest_dir', help="New directory for the static site")
    parser.add_argument('--base', default=None, help="The base path that the static site will be served from")
    parser.add_argument('--clobber', default=False, action='store_true', help="Overwrite the destination directory")
    args = parser.parse_args()
    main(args.site_url, args.source_dir, args.dest_dir, base_path=args.base, clobber_dest=args.clobber)
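For reference, here is a minimal sketch of how the script's entry point could be called directly from Python rather than from the command line. The URL and directory names are illustrative assumptions, not values from the original gist:

main(
    'https://example.com',       # site_url: URL of the archived site
    'websites/example.com',      # source_dir: HTTrack project directory (assumed layout)
    'static-site',               # dest_dir: output directory for the cleaned site
    base_path='/',               # base path the static site will be served from
    clobber_dest=False,          # don't overwrite an existing dest_dir
)

The sample HTTrack options for example.com follow.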
Near=0
Test=0
ParseAll=1
HTMLFirst=1
Cache=1
NoRecatch=0
Dos=0
Index=0
WordIndex=0
MailIndex=0
Log=1
RemoveTimeout=0
RemoveRateout=0
KeepAlive=1
FollowRobotsTxt=2
NoErrorPages=0
NoExternalPages=0
NoPwdInPages=0
NoQueryStrings=0
NoPurgeOldFiles=0
Cookies=1
CheckType=1
ParseJava=1
HTTP10=0
TolerantRequests=0
UpdateHack=1
URLHack=1
StoreAllInCache=0
LogType=0
UseHTTPProxyForFTP=1
Build=0
PrimaryScan=3
Travel=1
GlobalTravel=0
RewriteLinks=0
BuildString=%%h%%p/%%n%%q.%%t
Category=
MaxHtml=
MaxOther=
MaxAll=
MaxWait=
Sockets=
Retry=2
MaxTime=
TimeOut=6
RateOut=
UserID=Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)
Footer=(none)
AcceptLanguage=en, *
OtherHeaders=
DefaultReferer=
MaxRate=
WildCardFilters=+*.css +*.js -ad.doubleclick.net/* -mime:application/foobar%0d%0a+*.gif +*.jpg +*.jpeg +*.png +*.tif +*.bmp%0d%0a+*.zip +*.tar +*.tgz +*.gz +*.rar +*.z +*.exe%0d%0a+*.mov +*.mpg +*.mpeg +*.avi +*.asf +*.mp3 +*.mp2 +*.rm +*.wav +*.vob +*.qt +*.vid +*.ac3 +*.wma +*.wmv
Proxy=
Port=
Depth=
ExtDepth=0
MaxConn=
MaxLinks=
MIMEDefsExt1=
MIMEDefsExt2=
MIMEDefsExt3=
MIMEDefsExt4=
MIMEDefsExt5=
MIMEDefsExt6=
MIMEDefsExt7=
MIMEDefsExt8=
MIMEDefsMime1=
MIMEDefsMime2=
MIMEDefsMime3=
MIMEDefsMime4=
MIMEDefsMime5=
MIMEDefsMime6=
MIMEDefsMime7=
MIMEDefsMime8=
CurrentUrl=example.com
CurrentAction=0
CurrentURLList=