Clean an HTTrack archive so that it can be used as a static site. This particular script was used to convert a WordPress site into a static site. Note that it assumes links to external sites are kept as absolute URLs. I've included some sample HTTrack options for example.com that work with this script; they follow the code, after a short usage sketch.
import argparse
import glob
import os
import re
import shutil
import typing
import urllib.parse
import urllib.request

HTTRACK_TAG = '<!-- Added by HTTrack --><meta http-equiv="content-type" content="text/html;charset=UTF-8" /><!-- /Added by HTTrack -->'
# files that HTTrack may miss
EXTRA_FILES = ['favicon.ico', 'wp-includes/js/wp-emoji-release.min.js']
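# Note (added, not from the original): HTTrack only mirrors files it sees
# referenced in the crawled markup. favicon.ico is often requested implicitly
# by browsers, and WordPress loads wp-emoji-release.min.js from an inline
# script, so both are easy for the crawler to miss.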
class HttrackIndexException(Exception):
    pass


class IndexNotFound(Exception):
    pass
def get_redirect_html(base_path: str = "/"):
    return f'''<!DOCTYPE HTML>
<html lang="en-US">
  <head>
    <meta charset="UTF-8">
    <meta http-equiv="refresh" content="0; url={base_path}">
    <script type="text/javascript">
      window.location.href = "{base_path}"
    </script>
    <title>Page Redirection</title>
  </head>
  <body>
    If you are not redirected automatically, follow this <a href='{base_path}'>link to the main site</a>.
  </body>
</html>'''
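# the page above redirects with both a <meta http-equiv="refresh"> tag and a
# JavaScript fallback, so the redirect still fires if JavaScript is disabled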
def get_srcset_paths(srcset: str) -> typing.List[str]:
    final_paths = []
    raw_paths = srcset.split(",")
    for raw_path in raw_paths:
        path_parts = raw_path.split(" ")
        path = " ".join(path_parts[:-1])
        final_paths.append(path.strip())
    return final_paths
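# Illustrative example (added, not from the original):
# get_srcset_paths('a-300.jpg 300w, a-600.jpg 600w') returns
# ['a-300.jpg', 'a-600.jpg'] -- the trailing width/density descriptor is
# dropped from each comma-separated entry.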
def get_extra_files(site_url: str, dest_dir: str, paths: typing.List[str]) -> None:
    for path in paths:
        if path.startswith("/"):
            path = path[1:]
        dest_path = os.path.join(dest_dir, path)
        if os.path.exists(dest_path):
            continue
        dirname = os.path.dirname(dest_path)
        os.makedirs(dirname, exist_ok=True)
        source_url = f"{site_url}/{path}"
        try:
            print(f"Downloading {source_url} ...", end=" ")
            urllib.request.urlretrieve(source_url, dest_path)
            print("DONE")
        except Exception:
            print("FAIL")
def get_httrack_source_dir(source_dir: str, netloc: str) -> str:
    dirs = os.listdir(source_dir)
    if 'index.html' in dirs:
        # having an 'index.html' file is a good sign, but the HTTrack index doesn't count
        with open(os.path.join(source_dir, 'index.html'), 'rt', encoding='utf-8') as f:
            contents = f.read()
        if contents.find("<title>List of available projects - HTTrack Website Copier</title>") > -1:
            raise HttrackIndexException
        return source_dir
    # maybe this is the HTTrack project directory...
    if netloc in dirs:
        # recurse so that we ensure that this directory has an index file
        return get_httrack_source_dir(os.path.join(source_dir, netloc), netloc)
    raise IndexNotFound
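# Note (added, not from the original): HTTrack normally writes the mirror into
# <project dir>/<hostname>/ and puts its own project index at the top level,
# which is why the function above descends one level into a directory named
# after the host.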
def main(site_url: str, source_dir: str, dest_dir: str, base_path: typing.Optional[str] = None, clobber_dest: bool = False) -> None:
    # check that the URL has the correct format
    parsed_url = urllib.parse.urlparse(site_url)
    if parsed_url.scheme == '':
        raise SystemExit("Scheme (e.g. 'https://') must be included in the given URL!")
    if site_url.endswith("/"):
        site_url = site_url[:-1]
    # check that the source path is a valid HTTrack archive
    try:
        source_dir = get_httrack_source_dir(source_dir, parsed_url.netloc)
    except IndexNotFound:
        raise SystemExit("Source HTTrack directory doesn't have an index file!")
    except HttrackIndexException:
        raise SystemExit("The source folder looks like the HTTrack index! This should point to a specific project directory instead.")
    # check that the destination directory doesn't exist
    if os.path.exists(dest_dir):
        if not clobber_dest:
            raise SystemExit("The destination directory already exists!")
        else:
            shutil.rmtree(dest_dir)
    # ensure that the base path is in the correct format
    if base_path is None:
        base_path = "/"
    if not base_path.startswith("/"):
        base_path = "/" + base_path
    if len(base_path) > 1 and base_path.endswith("/"):
        base_path = base_path[:-1]
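    # (illustrative, added) 'blog/' and '/blog/' both normalize to '/blog';
    # a base path of None normalizes to '/'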
    # copy the site to its destination (shutil.copytree replaces
    # distutils.dir_util.copy_tree, since distutils was removed in Python 3.12;
    # the destination is known not to exist at this point)
    shutil.copytree(source_dir, dest_dir)
    # remove extra indices
    for f in glob.glob(os.path.join(dest_dir, 'index[a-z0-9]*.html')):
        os.remove(f)
    # remove all php files at any depth ('**' only matches nested directories
    # when recursive=True is set)
    for f in glob.glob(os.path.join(dest_dir, '**/*.php'), recursive=True):
        os.remove(f)
    # remove extra feed folders at any depth
    for f in glob.glob(os.path.join(dest_dir, '**/feed'), recursive=True):
        if os.path.isdir(f):
            shutil.rmtree(f, ignore_errors=True)
    # add obfuscated or missing files
    get_extra_files(site_url, dest_dir, EXTRA_FILES)
    # create a 404 page if it doesn't already exist
    redirect_html = get_redirect_html(base_path)
    path_404 = os.path.join(dest_dir, '404.html')
    if not os.path.exists(path_404):
        with open(path_404, 'wt', encoding='utf-8') as f:
            f.write(redirect_html)
    # walk the entire directory tree
    for root, _, files in os.walk(dest_dir):
        # perform operations on all HTML files
        html_files = glob.glob(os.path.join(root, '*.html'))
        for html_file in html_files:
            with open(html_file, 'rt', encoding='utf-8') as f:
                contents = f.read()
            # remove the HTTrack tag
            contents = contents.replace(HTTRACK_TAG, '')
            # convert relative index.html links to point to the parent
            # directory, e.g. href="blog/index.html" -> href="blog/"
            # (the dot in 'index.html' is escaped so only a literal dot matches)
            contents = re.sub(r"href=(['\"])(?!http)(?:(.*)\/+)?index\.html(['\"])", "href=\\1\\2/\\3", contents, flags=re.MULTILINE)
            # convert references to the site domain to an absolute URI
            replacement_base_path = "" if base_path == "/" else base_path
            contents = contents.replace(site_url, replacement_base_path)
            # convert escaped references to the site domain to an escaped absolute URI
            escaped_site_url = re.sub(r'([/])', r'\\\1', site_url)
            escaped_base_path = re.sub(r'([/])', r'\\\1', replacement_base_path)
            contents = contents.replace(escaped_site_url, escaped_base_path)
            # retrieve `srcset` images that HTTrack missed (the non-greedy '.*?'
            # keeps multiple srcset attributes on one line from being merged)
            srcsets = re.findall(r"srcset=['\"](.*?)\d[xw]['\"]", contents, flags=re.MULTILINE)
            for srcset in srcsets:
                paths = get_srcset_paths(srcset)
                get_extra_files(site_url, dest_dir, paths)
            with open(html_file, 'wt', encoding='utf-8') as f:
                f.write(contents)
        # add an index.html to every directory that doesn't contain one
        if 'index.html' not in files:
            with open(os.path.join(root, 'index.html'), 'wt', encoding='utf-8') as f:
                f.write(redirect_html)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Clean an HTTrack archive to use as a static site")
    parser.add_argument('site_url', help="URL of the archived site")
    parser.add_argument('source_dir', help="HTTrack source directory")
    parser.add_argument('dest_dir', help="New directory for the static site")
    parser.add_argument('--base', default=None, help="The base path that the static site will be served from")
    parser.add_argument('--clobber', default=False, action='store_true', help="Overwrite the destination directory")
    args = parser.parse_args()
    main(args.site_url, args.source_dir, args.dest_dir, base_path=args.base, clobber_dest=args.clobber)
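For reference, here is a minimal sketch of how the script's entry point could be called directly from Python rather than from the command line. The URL and directory names are illustrative assumptions, not values from the original gist:

main(
    'https://example.com',       # site_url: URL of the archived site
    'websites/example.com',      # source_dir: HTTrack project directory (assumed layout)
    'static-site',               # dest_dir: output directory for the cleaned site
    base_path='/',               # base path the static site will be served from
    clobber_dest=False,          # don't overwrite an existing dest_dir
)

The sample HTTrack options for example.com follow.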
Near=0
Test=0
ParseAll=1
HTMLFirst=1
Cache=1
NoRecatch=0
Dos=0
Index=0
WordIndex=0
MailIndex=0
Log=1
RemoveTimeout=0
RemoveRateout=0
KeepAlive=1
FollowRobotsTxt=2
NoErrorPages=0
NoExternalPages=0
NoPwdInPages=0
NoQueryStrings=0
NoPurgeOldFiles=0
Cookies=1
CheckType=1
ParseJava=1
HTTP10=0
TolerantRequests=0
UpdateHack=1
URLHack=1
StoreAllInCache=0
LogType=0
UseHTTPProxyForFTP=1
Build=0
PrimaryScan=3
Travel=1
GlobalTravel=0
RewriteLinks=0
BuildString=%%h%%p/%%n%%q.%%t
Category=
MaxHtml=
MaxOther=
MaxAll=
MaxWait=
Sockets=
Retry=2
MaxTime=
TimeOut=6
RateOut=
UserID=Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)
Footer=(none)
AcceptLanguage=en, *
OtherHeaders=
DefaultReferer=
MaxRate=
WildCardFilters=+*.css +*.js -ad.doubleclick.net/* -mime:application/foobar%0d%0a+*.gif +*.jpg +*.jpeg +*.png +*.tif +*.bmp%0d%0a+*.zip +*.tar +*.tgz +*.gz +*.rar +*.z +*.exe%0d%0a+*.mov +*.mpg +*.mpeg +*.avi +*.asf +*.mp3 +*.mp2 +*.rm +*.wav +*.vob +*.qt +*.vid +*.ac3 +*.wma +*.wmv
Proxy=
Port=
Depth=
ExtDepth=0
MaxConn=
MaxLinks=
MIMEDefsExt1=
MIMEDefsExt2=
MIMEDefsExt3=
MIMEDefsExt4=
MIMEDefsExt5=
MIMEDefsExt6=
MIMEDefsExt7=
MIMEDefsExt8=
MIMEDefsMime1=
MIMEDefsMime2=
MIMEDefsMime3=
MIMEDefsMime4=
MIMEDefsMime5=
MIMEDefsMime6=
MIMEDefsMime7=
MIMEDefsMime8=
CurrentUrl=example.com
CurrentAction=0
CurrentURLList=