Skip to content

Instantly share code, notes, and snippets.

@philgyford
Last active September 13, 2025 13:36
Show Gist options
  • Save philgyford/3cd3c7051b09c97958cde91f7d19b9fc to your computer and use it in GitHub Desktop.
Some Python scripts for fixing up things in an archive of a TypePad site created using SiteSucker. Run `pip install curl-cffi` first. See https://www.gyford.com/phil/writing/2025/09/13/typepad/ for more info.
import argparse
import os
import logging
import re
from curl_cffi import requests
import time
# A script to download missing images used in href and src attributes
#
# If you have a directory that's an archive of your TypePad site at
# typepad/mysite.typepad.com/myblog/
#
# If you run
# $ ./scripts/fix_href_src_images.py --source=typepad/mysite.typepad.com/myblog/ --original_domain=mysite.typepad.com
#
# It will find attributes something like this:
# src="https://mysite.typepad.com/.a/6a00d83451d49569e2010535f284ca970b-800wi"
# href="https://mysite.typepad.com/.a/6a00d83451d49569e2010535c4c058970c-pi"
# and download the images, saving them to typepad/mysite.typepad.com/a/ with file extensions.
# and change the links accordingly.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


def main(source_dir, original_domain):
    """Download TypePad ``/.a/`` images used in src/href attributes.

    Walks ``source_dir`` for ``.html`` files, finds attributes like
    ``src="https://{original_domain}/.a/<hash>-800wi"``, downloads each image
    into ``<source_dir>/../a/`` (named from the Content-Disposition header,
    which includes a file extension) and rewrites the attributes to point at
    the local copies via a relative path.

    source_dir: path to the archive directory, relative to the CWD.
    original_domain: the original TypePad domain, e.g. mysite.typepad.com.
    """
    source_dir = f"{os.getcwd()}{os.sep}{source_dir}"
    if not os.path.isdir(source_dir):
        logger.error(
            "The source, {}, does not exist or is not a directory".format(source_dir)
        )
        exit()

    total_html_files_count = 0
    modified_html_files_count = 0
    images_success_count = 0
    images_failure_count = 0
    # Maps an HTTP status code (or "error" for non-HTTP failures) to the
    # list of image URLs that failed with it.
    image_errors = {}

    for dirpath, dirs, files in os.walk(source_dir):
        for file in files:
            if not file.endswith(".html"):
                continue
            total_html_files_count += 1
            path = os.path.join(dirpath, file)
            # How many levels below source_dir this file is; used to build
            # the "../" prefix for the rewritten relative links.
            depth = dirpath.count(os.sep) - source_dir.count(os.sep) + 1
            with open(path, "r+") as f:
                content = f.read()
                matches = re.findall(
                    rf' (?:src|href)="https?\:\/\/{original_domain}\/\.a\/([a-f0-9]+?-(?:\d\d0wi|pi))"',
                    content,
                )
                if len(matches) > 0:
                    logger.info(f"---\nGetting images from {path}")
                    # Will contain tuples like:
                    # ("original_filename-600wi", "new_filename-600wi.jpg")
                    filename_replacements = []
                    # There will probably be duplicates, so set() to remove, so
                    # we don't download things twice:
                    for match in set(matches):
                        # Not including filename, because the downloaded file
                        # will have an extension:
                        image_path = f"{source_dir}{os.sep}..{os.sep}a{os.sep}"
                        image_url = f"https://{original_domain}/.a/{match}"
                        try:
                            r = requests.get(
                                image_url, timeout=30, impersonate="chrome"
                            )
                            r.raise_for_status()
                        except requests.exceptions.HTTPError as e:
                            # We have a response here, so its status is usable.
                            logger.error(f"Error fetching {image_url}: {e}")
                            image_errors.setdefault(str(r.status_code), []).append(
                                image_url
                            )
                            images_failure_count += 1
                            continue
                        except requests.exceptions.RequestException as e:
                            # Bug fix: on connection errors etc. no response
                            # object exists, so `r` may be unbound here — using
                            # r.status_code raised NameError. Group these
                            # failures under a fixed "error" key instead.
                            logger.error(f"Exception fetching {image_url}: {e}")
                            image_errors.setdefault("error", []).append(image_url)
                            images_failure_count += 1
                            continue
                        cd = r.headers.get("content-disposition", None)
                        if cd:
                            # Get the actual filename of the downloaded file,
                            # which will include the extension:
                            filename_matches = re.findall("filename=(.+)", cd)
                            if len(filename_matches) > 0:
                                new_filename = filename_matches[0]
                                images_success_count += 1
                                # Save the image to the correct location
                                folder_path = os.path.dirname(image_path)
                                os.makedirs(folder_path, exist_ok=True)
                                image_path += new_filename
                                if not os.path.isfile(image_path):
                                    # Don't already have this file on disk
                                    with open(image_path, "wb") as image_file:
                                        logger.info(
                                            f"Fetched {image_url} and saved to {image_path}"
                                        )
                                        image_file.write(r.content)
                                filename_replacements.append((match, new_filename))
                        # Be polite to the server between downloads.
                        time.sleep(1.0)
                    if len(filename_replacements) > 0:
                        # Rewrite the HTML file to replace the paths
                        new_path = f"..{os.sep}" * (depth)
                        for replacement in filename_replacements:
                            content = re.sub(
                                rf' (src|href)="https?\:\/\/{original_domain}\/\.a\/{replacement[0]}"',
                                rf' \1="{new_path}a/{replacement[1]}"',
                                content,
                            )
                        logger.info(f"Rewriting {path}")
                        f.seek(0)
                        f.write(content)
                        f.truncate()
                        modified_html_files_count += 1

    logger.info(f"""---
{total_html_files_count} total HTML files found
{modified_html_files_count} HTML files modified
{images_success_count} images fetched and saved
{images_failure_count} images failed to download
---
""")
    if len(image_errors):
        logger.info("Images that could not be fetched, by status code:")
        for status_code, urls in image_errors.items():
            logger.info(status_code)
            for url in urls:
                logger.info(url)
if __name__ == "__main__":
    # Bug fix: the description previously said "Finds og:image and
    # twitter:image meta tags" — copy-pasted from the meta-images script.
    # This script fixes images in href and src attributes.
    parser = argparse.ArgumentParser(
        description="""Finds images used in href and src attributes,
downloads the images and fixes the paths.
"""
    )
    parser.add_argument(
        "--source", help="Path to the parent directory to crawl", required=True
    )
    parser.add_argument(
        "--original_domain",
        help="TypePad original_domain name e.g. joedoe.typepad.com",
        required=True,
    )
    args = parser.parse_args()
    logger.info(
        "Getting paths from '{}' for original_domain '{}'".format(
            args.source, args.original_domain
        )
    )
    main(args.source, args.original_domain)
import argparse
import os
import logging
import re
from curl_cffi import requests
import time
# A script to download missing images used in meta tags.
#
# If you have a directory that's an archive of your TypePad site at
# typepad/mysite.typepad.com/myblog/
#
# If you run
# $ ./scripts/fix_meta_images.py --source=typepad/mysite.typepad.com/myblog/ --original_domain=mysite.typepad.com
#
# It will find <meta> tags like:
# <meta property="og:image" content="https://www.mynewsite.com/blog/typepad/mysite.typepad.com/.a/6a00d83451d49569e2010535c4c058970c-600wi" />
# <meta name="twitter:image" content="https://www.mynewsite.com/blog/typepad/mysite.typepad.com/.a/6a00d83451d49569e2010535c4c058970c-600wi" />
# and download the image, saving it to typepad/mysite.typepad.com/a/6a00d83451d49569e2010535c4c058970c-600w.<ext>
# and change the links accordingly.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


def main(source_dir, original_domain):
    """Download TypePad images used in og:image / twitter:image meta tags.

    Walks ``source_dir`` for ``.html`` files, finds meta tags whose content
    URL points at ``/{original_domain}/.a/<name>-600wi``, downloads each
    image into ``<source_dir>/../a/`` (named from the Content-Disposition
    header, which includes a file extension) and rewrites the meta tags to
    point at the local copies.

    source_dir: path to the archive directory, relative to the CWD.
    original_domain: the original TypePad domain, e.g. mysite.typepad.com.
    """
    source_dir = f"{os.getcwd()}{os.sep}{source_dir}"
    if not os.path.isdir(source_dir):
        logger.error(
            "The source, {}, does not exist or is not a directory".format(source_dir)
        )
        exit()

    total_html_files_count = 0
    modified_html_files_count = 0
    images_success_count = 0
    images_failure_count = 0
    # Maps an HTTP status code (or "error" for non-HTTP failures) to the
    # list of image URLs that failed with it.
    image_errors = {}

    for dirpath, dirs, files in os.walk(source_dir):
        for file in files:
            if not file.endswith(".html"):
                continue
            total_html_files_count += 1
            path = os.path.join(dirpath, file)
            with open(path, "r+") as f:
                content = f.read()
                matches = re.findall(
                    rf':image" content\="https?\:\/\/.*?\/{original_domain}\/\.a\/(.*?-600wi)"',
                    content,
                )
                if len(matches) > 0:
                    logger.info(f"---\nGetting images from {path}")
                    # Will contain tuples like:
                    # ("original_filename-600wi", "new_filename-600wi.jpg")
                    filename_replacements = []
                    # There will probably be duplicates, so set() to remove, so
                    # we don't download things twice:
                    for match in set(matches):
                        # Not including filename, because the downloaded file
                        # will have an extension:
                        image_path = f"{source_dir}{os.sep}..{os.sep}a{os.sep}"
                        image_url = f"https://{original_domain}/.a/{match}"
                        # Bug fix: the old code tested os.path.isfile() on this
                        # directory prefix (always False, since it ends in a
                        # separator). The real filename is only known after the
                        # download, so the existence check now happens just
                        # before writing, below — as in fix_href_src_images.
                        try:
                            r = requests.get(
                                image_url, timeout=30, impersonate="chrome"
                            )
                            r.raise_for_status()
                        except requests.exceptions.HTTPError as e:
                            # We have a response here, so its status is usable.
                            logger.error(f"Error fetching {image_url}: {e}")
                            image_errors.setdefault(str(r.status_code), []).append(
                                image_url
                            )
                            images_failure_count += 1
                            continue
                        except requests.exceptions.RequestException as e:
                            # Bug fix: on connection errors etc. no response
                            # object exists, so `r` may be unbound here — using
                            # r.status_code raised NameError. Group these
                            # failures under a fixed "error" key instead.
                            logger.error(f"Exception fetching {image_url}: {e}")
                            image_errors.setdefault("error", []).append(image_url)
                            images_failure_count += 1
                            continue
                        cd = r.headers.get("content-disposition", None)
                        if cd:
                            # Get the actual filename of the downloaded file,
                            # which will include the extension:
                            filename_matches = re.findall("filename=(.+)", cd)
                            if len(filename_matches) > 0:
                                new_filename = filename_matches[0]
                                images_success_count += 1
                                # Save the image to the correct location
                                folder_path = os.path.dirname(image_path)
                                os.makedirs(folder_path, exist_ok=True)
                                image_path += new_filename
                                if not os.path.isfile(image_path):
                                    # Don't already have this file on disk
                                    with open(image_path, "wb") as image_file:
                                        logger.info(
                                            f"Fetched {image_url} and saved to {image_path}"
                                        )
                                        image_file.write(r.content)
                                filename_replacements.append((match, new_filename))
                        # Be polite to the server between downloads.
                        time.sleep(1.0)
                    if len(filename_replacements) > 0:
                        # Rewrite the HTML file to replace the paths
                        for replacement in filename_replacements:
                            content = re.sub(
                                rf':image" content\="https?\:\/\/(.*?\/{original_domain})\/\.a\/{replacement[0]}"',
                                rf':image" content="https://\1/a/{replacement[1]}"',
                                content,
                            )
                        logger.info(f"Rewriting {path}")
                        f.seek(0)
                        f.write(content)
                        f.truncate()
                        modified_html_files_count += 1

    logger.info(f"""---
{total_html_files_count} total HTML files found
{modified_html_files_count} HTML files modified
{images_success_count} images fetched and saved
{images_failure_count} images failed to download
---
""")
    if len(image_errors):
        logger.info("Images that could not be fetched, by status code:")
        for status_code, urls in image_errors.items():
            logger.info(status_code)
            for url in urls:
                logger.info(url)
if __name__ == "__main__":
    # Build the command-line interface.
    arg_parser = argparse.ArgumentParser(
        description="""Finds og:image and twitter:image meta tags,
downloads the image and fixes the paths.
"""
    )
    arg_parser.add_argument(
        "--source", help="Path to the parent directory to crawl", required=True
    )
    arg_parser.add_argument(
        "--original_domain",
        help="TypePad original_domain name e.g. joedoe.typepad.com",
        required=True,
    )
    options = arg_parser.parse_args()
    # Announce what we're about to process before doing any work.
    logger.info(
        "Getting paths from '{}' for original_domain '{}'".format(
            options.source, options.original_domain
        )
    )
    main(options.source, options.original_domain)
import argparse
import os
import logging
import re
from curl_cffi import requests
import time
# A script to fix one kind of pop-up images in a directory of TypePad HTML files
#
# If you have a directory that's an archive of your TypePad site at
# typepad/mysite.typepad.com/myblog/
#
# Then you would run
# $ ./scripts/fix_popup_images.py --source=typepad/mysite.typepad.com/myblog/ --original_domain=mysite.typepad.com
#
# It would find any instances of links like this:
# href="https://mysite.typepad.com/.shared/image.html?/photos/uncategorized/an-image.jpg"
# and download the image, saving it to typepad/mysite.typepad.com/photos/uncategorized/an-image.jpg
# and change the link to something like:
# href="../../photos/uncategorized/an-image.jpg"
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


def main(source_dir, original_domain):
    """Fix TypePad ``/.shared/image.html`` pop-up image links.

    Walks ``source_dir`` for ``.html`` files, finds links like
    ``href="https://{original_domain}/.shared/image.html?/photos/.../pic.jpg"``,
    downloads the referenced image to the matching local path under
    ``<source_dir>/..`` and rewrites the link to a relative local path.

    source_dir: path to the archive directory, relative to the CWD.
    original_domain: the original TypePad domain, e.g. mysite.typepad.com.
    """
    source_dir = f"{os.getcwd()}{os.sep}{source_dir}"
    if not os.path.isdir(source_dir):
        logger.error(
            "The source, {}, does not exist or is not a directory".format(source_dir)
        )
        exit()

    total_html_files_count = 0
    modified_html_files_count = 0
    images_success_count = 0
    images_failure_count = 0
    # Maps an HTTP status code (or "error" for non-HTTP failures) to the
    # list of image URLs that failed with it.
    image_errors = {}

    for dirpath, dirs, files in os.walk(source_dir):
        for file in files:
            if not file.endswith(".html"):
                continue
            total_html_files_count += 1
            path = os.path.join(dirpath, file)
            # How many levels below source_dir this file is; used to build
            # the "../" prefix for the rewritten relative links.
            depth = dirpath.count(os.sep) - source_dir.count(os.sep) + 1
            with open(path, "r+") as f:
                content = f.read()
                # Bug fix: the "." in "image.html" is now escaped in both this
                # pattern and the rewrite pattern below, so it can't match an
                # arbitrary character there.
                matches = re.findall(
                    rf'href\="https?\:\/\/{original_domain}\/\.shared\/image\.html\?(.*?)"',
                    content,
                )
                if len(matches) > 0:
                    logger.info(f"---\nGetting images from {path}")
                    for match in matches:
                        # match is the image's site-relative path, e.g.
                        # "/photos/uncategorized/an-image.jpg"
                        image_path = f"{source_dir}{os.sep}..{match}"
                        if not os.path.isfile(image_path):
                            # Don't already have this file on disk
                            image_url = f"https://{original_domain}{match}"
                            try:
                                r = requests.get(
                                    image_url, timeout=30, impersonate="chrome"
                                )
                                r.raise_for_status()
                            except requests.exceptions.HTTPError as e:
                                # We have a response, so its status is usable.
                                logger.error(f"Error fetching {image_url}: {e}")
                                image_errors.setdefault(str(r.status_code), []).append(
                                    image_url
                                )
                                images_failure_count += 1
                                continue
                            except requests.exceptions.RequestException as e:
                                # Bug fix: on connection errors etc. no
                                # response object exists, so `r` may be unbound
                                # here — using r.status_code raised NameError.
                                # Group these under a fixed "error" key.
                                logger.error(f"Exception fetching {image_url}: {e}")
                                image_errors.setdefault("error", []).append(image_url)
                                images_failure_count += 1
                                continue
                            images_success_count += 1
                            # Save the image to the correct location
                            folder_path = os.path.dirname(image_path)
                            os.makedirs(folder_path, exist_ok=True)
                            with open(image_path, "wb") as image_file:
                                logger.info(
                                    f"Fetched {image_url} and saved to {image_path}"
                                )
                                image_file.write(r.content)
                            # Be polite to the server between downloads.
                            time.sleep(0.5)
                    # Rewrite the HTML file to replace the paths
                    new_path = f"..{os.sep}" * (depth)
                    new_path = new_path[:-1]  # Remove final slash
                    new_content = re.sub(
                        rf'href\="https?\:\/\/{original_domain}\/\.shared\/image\.html\?',
                        f'href="{new_path}',
                        content,
                    )
                    logger.info(f"Rewriting {path}")
                    f.seek(0)
                    f.write(new_content)
                    f.truncate()
                    modified_html_files_count += 1

    logger.info(f"""---
{total_html_files_count} total HTML files found
{modified_html_files_count} HTML files modified
{images_success_count} images fetched and saved
{images_failure_count} images failed to download
---
""")
    if len(image_errors):
        logger.info("Images that could not be fetched, by status code:")
        for status_code, urls in image_errors.items():
            logger.info(status_code)
            for url in urls:
                logger.info(url)
if __name__ == "__main__":
    # Bug fix: "upates" -> "updates" in the user-facing description.
    parser = argparse.ArgumentParser(
        description="""Finds instances of image.html redirects in a directory
of HTML files, downloads the linked image, and updates the link
"""
    )
    parser.add_argument(
        "--source", help="Path to the parent directory to crawl", required=True
    )
    parser.add_argument(
        "--original_domain",
        help="TypePad domain name e.g. joedoe.typepad.com",
        required=True,
    )
    args = parser.parse_args()
    logger.info(
        "Getting paths from '{}' for domain '{}'".format(
            args.source, args.original_domain
        )
    )
    main(args.source, args.original_domain)
import argparse
import os
import logging
import re
# A script to fix paths when JavaScript opens in a new window.
#
# If you have a directory that's an archive of your TypePad site at
# typepad/mysite.typepad.com/myblog/
#
# Then you would run
# $ ./scripts/fix_popup_pages.py --source=typepad/mysite.typepad.com/myblog/ --original_domain=mysite.typepad.com
#
# It would find any instances of this:
# <a href="../../blog_post.html" onclick="window.open('http://mysite.typepad.com/myblog/blog_post.html'
# and replace it with:
# <a href="../../blog_post.html" onclick="window.open('../../blog_post.html'
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


def main(source_dir, original_domain):
    """Fix absolute URLs inside JavaScript window.open() pop-up links.

    Walks ``source_dir`` for ``.html`` files and, wherever an ``<a>`` tag has
    both a relative ``href`` and an ``onclick="window.open('http://...')"``
    pointing at ``original_domain``, replaces the absolute URL inside
    window.open() with the tag's own relative href.

    source_dir: path to the archive directory, relative to the CWD.
    original_domain: the original TypePad domain, e.g. mysite.typepad.com.
    """
    source_dir = f"{os.getcwd()}{os.sep}{source_dir}"
    if not os.path.isdir(source_dir):
        logger.error(
            "The source, {}, does not exist or is not a directory".format(source_dir)
        )
        exit()

    total_html_files_count = 0
    modified_html_files_count = 0

    for dirpath, dirs, files in os.walk(source_dir):
        for file in files:
            if not file.endswith(".html"):
                continue
            total_html_files_count += 1
            path = os.path.join(dirpath, file)
            with open(path, "r+") as f:
                content = f.read()
                # Form 1 — href first, then onclick, e.g.:
                # <a href="../images/IMG_1499.html" onclick="window.open('http://mysite.typepad.com/myblog/images/IMG_1499.html'
                # Bug fix: the "." in "window.open" is now escaped in both
                # patterns, so it can't match an arbitrary character there.
                search_one = rf'<a href\="(.*?)" onclick\="window\.open\(\'https?\://{original_domain}.*?\''
                # Form 2 — onclick first, then href, e.g.:
                # <a onclick="window.open('http://mysite.typepad.com/myblog/images/IMG_1577.html','popup','width=2272,height=1704,scrollbars=no,resizable=no,toolbar=no,directories=no,location=no,menubar=no,status=no,left=0,top=0'); return false" href="../images/IMG_1577.html">
                search_two = rf'<a onclick\="window\.open\(\'https?\://{original_domain}[^"]*?\'([^>]*?)href\="(.*?)">'
                if re.search(search_one, content) or re.search(search_two, content):
                    # At least one instance to change: reuse the tag's own
                    # relative href as the window.open() argument.
                    new_content = re.sub(
                        search_one,
                        '<a href="\\1" onclick="window.open(\'\\1\'',
                        content,
                    )
                    new_content = re.sub(
                        search_two,
                        '<a onclick="window.open(\'\\2\'\\1href="\\2">',
                        new_content,
                    )
                    logger.info(f"Rewriting {path}")
                    f.seek(0)
                    f.write(new_content)
                    f.truncate()
                    modified_html_files_count += 1

    logger.info(f"""---
{total_html_files_count} total HTML files found
{modified_html_files_count} HTML files modified
---
""")
if __name__ == "__main__":
    # Build the command-line interface.
    arg_parser = argparse.ArgumentParser(
        description="""Fixes paths for JavaScript opening new windows
"""
    )
    arg_parser.add_argument(
        "--source", help="Path to the parent directory to crawl", required=True
    )
    arg_parser.add_argument(
        "--original_domain",
        help="TypePad domain name e.g. joedoe.typepad.com",
        required=True,
    )
    options = arg_parser.parse_args()
    # Announce what we're about to process before doing any work.
    logger.info(
        "Getting paths from '{}' for domain '{}'".format(
            options.source, options.original_domain
        )
    )
    main(options.source, options.original_domain)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment