Some Python scripts for fixing up things in an archive of a TypePad site created using SiteSucker. Run `pip install curl-cffi` first. See https://www.gyford.com/phil/writing/2025/09/13/typepad/ for more info.
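Before running any of the scripts, it can be worth checking that curl-cffi installed correctly and that its browser impersonation works. A minimal sketch, assuming nothing beyond what the scripts themselves use; the URL is just a placeholder:

# Sanity check that curl-cffi is installed and can impersonate Chrome.
# The URL is only a placeholder; any site will do.
from curl_cffi import requests

r = requests.get("https://example.com", impersonate="chrome", timeout=30)
print(r.status_code)  # 200 if the request succeeded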
fix_href_src_images.py

import argparse
import logging
import os
import re
import time

from curl_cffi import requests

# A script to download missing images used in href and src attributes.
#
# If you have a directory that's an archive of your TypePad site at
#   typepad/mysite.typepad.com/myblog/
#
# then you would run
#   $ ./scripts/fix_href_src_images.py --source=typepad/mysite.typepad.com/myblog/ --original_domain=mysite.typepad.com
#
# It will find attributes something like this:
#   src="https://mysite.typepad.com/.a/6a00d83451d49569e2010535f284ca970b-800wi"
#   href="https://mysite.typepad.com/.a/6a00d83451d49569e2010535c4c058970c-pi"
# download the images, saving them to typepad/mysite.typepad.com/a/ with file
# extensions, and change the links accordingly.

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


def main(source_dir, original_domain):
    source_dir = f"{os.getcwd()}{os.sep}{source_dir}"
    if not os.path.isdir(source_dir):
        logger.error(
            "The source, {}, does not exist or is not a directory".format(source_dir)
        )
        exit()

    total_html_files_count = 0
    modified_html_files_count = 0
    images_success_count = 0
    images_failure_count = 0
    image_errors = {}

    for dirpath, dirs, files in os.walk(source_dir):
        for file in files:
            if not file.endswith(".html"):
                continue
            total_html_files_count += 1
            path = os.path.join(dirpath, file)
            # How many directories deep this file is below source_dir:
            depth = dirpath.count(os.sep) - source_dir.count(os.sep) + 1
            with open(path, "r+") as f:
                content = f.read()
                matches = re.findall(
                    rf' (?:src|href)="https?://{original_domain}/\.a/([a-f0-9]+?-(?:\d\d0wi|pi))"',
                    content,
                )
                if len(matches) > 0:
                    logger.info(f"---\nGetting images from {path}")
                    # Will contain tuples like:
                    # ("original_filename-600wi", "new_filename-600wi.jpg")
                    filename_replacements = []
                    # There will probably be duplicates, so use set() to remove
                    # them, so we don't download anything twice:
                    for match in set(matches):
                        # Not including the filename yet, because the downloaded
                        # file will have an extension:
                        image_path = f"{source_dir}{os.sep}..{os.sep}a{os.sep}"
                        image_url = f"https://{original_domain}/.a/{match}"
                        try:
                            r = requests.get(image_url, timeout=30, impersonate="chrome")
                            r.raise_for_status()
                        except requests.exceptions.HTTPError as e:
                            logger.error(f"Error fetching {image_url}: {e}")
                            image_errors.setdefault(str(r.status_code), []).append(
                                image_url
                            )
                            images_failure_count += 1
                            continue
                        except requests.exceptions.RequestException as e:
                            # There's no response here (e.g. a timeout), so we
                            # can't key these errors by status code:
                            logger.error(f"Exception fetching {image_url}: {e}")
                            image_errors.setdefault("Request error", []).append(
                                image_url
                            )
                            images_failure_count += 1
                            continue
                        cd = r.headers.get("content-disposition", None)
                        if cd:
                            # Get the actual filename of the downloaded file,
                            # which will include the extension:
                            filename_matches = re.findall("filename=(.+)", cd)
                            if len(filename_matches) > 0:
                                new_filename = filename_matches[0]
                                images_success_count += 1
                                # Save the image to the correct location
                                folder_path = os.path.dirname(image_path)
                                os.makedirs(folder_path, exist_ok=True)
                                image_path += new_filename
                                if not os.path.isfile(image_path):
                                    # We don't already have this file on disk
                                    with open(image_path, "wb") as image_file:
                                        logger.info(
                                            f"Fetched {image_url} and saved to {image_path}"
                                        )
                                        image_file.write(r.content)
                                filename_replacements.append((match, new_filename))
                        time.sleep(1.0)
                    if len(filename_replacements) > 0:
                        # Rewrite the HTML file to replace the paths
                        new_path = f"..{os.sep}" * depth
                        for replacement in filename_replacements:
                            content = re.sub(
                                rf' (src|href)="https?://{original_domain}/\.a/{replacement[0]}"',
                                rf' \1="{new_path}a/{replacement[1]}"',
                                content,
                            )
                        logger.info(f"Rewriting {path}")
                        f.seek(0)
                        f.write(content)
                        f.truncate()
                        modified_html_files_count += 1

    logger.info(f"""---
{total_html_files_count} total HTML files found
{modified_html_files_count} HTML files modified
{images_success_count} images fetched and saved
{images_failure_count} images failed to download
---
""")
    if len(image_errors):
        logger.info("Images that could not be fetched, by status code:")
        for status_code, urls in image_errors.items():
            logger.info(status_code)
            for url in urls:
                logger.info(url)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Finds src and href attributes that point at TypePad
        image URLs, downloads the images and fixes the paths.
        """
    )
    parser.add_argument(
        "--source", help="Path to the parent directory to crawl", required=True
    )
    parser.add_argument(
        "--original_domain",
        help="TypePad domain name e.g. joedoe.typepad.com",
        required=True,
    )
    args = parser.parse_args()
    logger.info(
        "Getting paths from '{}' for domain '{}'".format(
            args.source, args.original_domain
        )
    )
    main(args.source, args.original_domain)
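To see what that script's main pattern picks up, here is a minimal sketch run against a fabricated HTML snippet; the domain and asset IDs are made-up examples taken from the comments above:

# A small demonstration of the src/href pattern used above; the HTML and
# domain are made-up examples.
import re

original_domain = "mysite.typepad.com"
html = (
    ' <img src="https://mysite.typepad.com/.a/6a00d83451d49569e2010535f284ca970b-800wi">'
    ' <a href="https://mysite.typepad.com/.a/6a00d83451d49569e2010535c4c058970c-pi">'
)
matches = re.findall(
    rf' (?:src|href)="https?://{original_domain}/\.a/([a-f0-9]+?-(?:\d\d0wi|pi))"',
    html,
)
print(matches)
# ['6a00d83451d49569e2010535f284ca970b-800wi', '6a00d83451d49569e2010535c4c058970c-pi']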
fix_meta_images.py

import argparse
import logging
import os
import re
import time

from curl_cffi import requests

# A script to download missing images used in meta tags.
#
# If you have a directory that's an archive of your TypePad site at
#   typepad/mysite.typepad.com/myblog/
#
# then you would run
#   $ ./scripts/fix_meta_images.py --source=typepad/mysite.typepad.com/myblog/ --original_domain=mysite.typepad.com
#
# It will find <meta> tags like:
#   <meta property="og:image" content="https://www.mynewsite.com/blog/typepad/mysite.typepad.com/.a/6a00d83451d49569e2010535c4c058970c-600wi" />
#   <meta name="twitter:image" content="https://www.mynewsite.com/blog/typepad/mysite.typepad.com/.a/6a00d83451d49569e2010535c4c058970c-600wi" />
# download the image, saving it to typepad/mysite.typepad.com/a/6a00d83451d49569e2010535c4c058970c-600wi.<ext>,
# and change the links accordingly.

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


def main(source_dir, original_domain):
    source_dir = f"{os.getcwd()}{os.sep}{source_dir}"
    if not os.path.isdir(source_dir):
        logger.error(
            "The source, {}, does not exist or is not a directory".format(source_dir)
        )
        exit()

    total_html_files_count = 0
    modified_html_files_count = 0
    images_success_count = 0
    images_failure_count = 0
    image_errors = {}

    for dirpath, dirs, files in os.walk(source_dir):
        for file in files:
            if not file.endswith(".html"):
                continue
            total_html_files_count += 1
            path = os.path.join(dirpath, file)
            with open(path, "r+") as f:
                content = f.read()
                matches = re.findall(
                    rf':image" content="https?://.*?/{original_domain}/\.a/(.*?-600wi)"',
                    content,
                )
                if len(matches) > 0:
                    logger.info(f"---\nGetting images from {path}")
                    # Will contain tuples like:
                    # ("original_filename-600wi", "new_filename-600wi.jpg")
                    filename_replacements = []
                    # There will probably be duplicates, so use set() to remove
                    # them, so we don't download anything twice:
                    for match in set(matches):
                        # Not including the filename yet, because the downloaded
                        # file will have an extension:
                        image_path = f"{source_dir}{os.sep}..{os.sep}a{os.sep}"
                        image_url = f"https://{original_domain}/.a/{match}"
                        try:
                            r = requests.get(image_url, timeout=30, impersonate="chrome")
                            r.raise_for_status()
                        except requests.exceptions.HTTPError as e:
                            logger.error(f"Error fetching {image_url}: {e}")
                            image_errors.setdefault(str(r.status_code), []).append(
                                image_url
                            )
                            images_failure_count += 1
                            continue
                        except requests.exceptions.RequestException as e:
                            # There's no response here (e.g. a timeout), so we
                            # can't key these errors by status code:
                            logger.error(f"Exception fetching {image_url}: {e}")
                            image_errors.setdefault("Request error", []).append(
                                image_url
                            )
                            images_failure_count += 1
                            continue
                        cd = r.headers.get("content-disposition", None)
                        if cd:
                            # Get the actual filename of the downloaded file,
                            # which will include the extension:
                            filename_matches = re.findall("filename=(.+)", cd)
                            if len(filename_matches) > 0:
                                new_filename = filename_matches[0]
                                images_success_count += 1
                                # Save the image to the correct location
                                folder_path = os.path.dirname(image_path)
                                os.makedirs(folder_path, exist_ok=True)
                                image_path += new_filename
                                if not os.path.isfile(image_path):
                                    # We don't already have this file on disk
                                    with open(image_path, "wb") as image_file:
                                        logger.info(
                                            f"Fetched {image_url} and saved to {image_path}"
                                        )
                                        image_file.write(r.content)
                                filename_replacements.append((match, new_filename))
                        time.sleep(1.0)
                    if len(filename_replacements) > 0:
                        # Rewrite the HTML file to replace the paths
                        for replacement in filename_replacements:
                            content = re.sub(
                                rf':image" content="https?://(.*?/{original_domain})/\.a/{replacement[0]}"',
                                rf':image" content="https://\1/a/{replacement[1]}"',
                                content,
                            )
                        logger.info(f"Rewriting {path}")
                        f.seek(0)
                        f.write(content)
                        f.truncate()
                        modified_html_files_count += 1

    logger.info(f"""---
{total_html_files_count} total HTML files found
{modified_html_files_count} HTML files modified
{images_success_count} images fetched and saved
{images_failure_count} images failed to download
---
""")
    if len(image_errors):
        logger.info("Images that could not be fetched, by status code:")
        for status_code, urls in image_errors.items():
            logger.info(status_code)
            for url in urls:
                logger.info(url)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Finds og:image and twitter:image meta tags,
        downloads the images and fixes the paths.
        """
    )
    parser.add_argument(
        "--source", help="Path to the parent directory to crawl", required=True
    )
    parser.add_argument(
        "--original_domain",
        help="TypePad domain name e.g. joedoe.typepad.com",
        required=True,
    )
    args = parser.parse_args()
    logger.info(
        "Getting paths from '{}' for domain '{}'".format(
            args.source, args.original_domain
        )
    )
    main(args.source, args.original_domain)
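Both download scripts work out each image's file extension from the Content-Disposition response header. A minimal sketch of that step in isolation; the header value is a made-up example:

# Extracting a filename (with extension) from a Content-Disposition header,
# as the scripts above do; the header value is a made-up example.
import re

cd = "attachment; filename=6a00d83451d49569e2010535c4c058970c-600wi.jpg"
filename_matches = re.findall("filename=(.+)", cd)
if len(filename_matches) > 0:
    print(filename_matches[0])  # 6a00d83451d49569e2010535c4c058970c-600wi.jpg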
fix_popup_images.py

import argparse
import logging
import os
import re
import time

from curl_cffi import requests

# A script to fix one kind of pop-up image link in a directory of TypePad
# HTML files.
#
# If you have a directory that's an archive of your TypePad site at
#   typepad/mysite.typepad.com/myblog/
#
# then you would run
#   $ ./scripts/fix_popup_images.py --source=typepad/mysite.typepad.com/myblog/ --original_domain=mysite.typepad.com
#
# It will find any instances of links like this:
#   href="https://mysite.typepad.com/.shared/image.html?/photos/uncategorized/an-image.jpg"
# download the image, saving it to typepad/mysite.typepad.com/photos/uncategorized/an-image.jpg,
# and change the link to something like:
#   href="../../photos/uncategorized/an-image.jpg"

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


def main(source_dir, original_domain):
    source_dir = f"{os.getcwd()}{os.sep}{source_dir}"
    if not os.path.isdir(source_dir):
        logger.error(
            "The source, {}, does not exist or is not a directory".format(source_dir)
        )
        exit()

    total_html_files_count = 0
    modified_html_files_count = 0
    images_success_count = 0
    images_failure_count = 0
    image_errors = {}

    for dirpath, dirs, files in os.walk(source_dir):
        for file in files:
            if not file.endswith(".html"):
                continue
            total_html_files_count += 1
            path = os.path.join(dirpath, file)
            # How many directories deep this file is below source_dir:
            depth = dirpath.count(os.sep) - source_dir.count(os.sep) + 1
            with open(path, "r+") as f:
                content = f.read()
                matches = re.findall(
                    rf'href="https?://{original_domain}/\.shared/image\.html\?(.*?)"',
                    content,
                )
                if len(matches) > 0:
                    logger.info(f"---\nGetting images from {path}")
                    for match in matches:
                        image_path = f"{source_dir}{os.sep}..{match}"
                        if not os.path.isfile(image_path):
                            # We don't already have this file on disk
                            image_url = f"https://{original_domain}{match}"
                            try:
                                r = requests.get(image_url, timeout=30, impersonate="chrome")
                                r.raise_for_status()
                            except requests.exceptions.HTTPError as e:
                                logger.error(f"Error fetching {image_url}: {e}")
                                image_errors.setdefault(str(r.status_code), []).append(
                                    image_url
                                )
                                images_failure_count += 1
                                continue
                            except requests.exceptions.RequestException as e:
                                # There's no response here (e.g. a timeout), so
                                # we can't key these errors by status code:
                                logger.error(f"Exception fetching {image_url}: {e}")
                                image_errors.setdefault("Request error", []).append(
                                    image_url
                                )
                                images_failure_count += 1
                                continue
                            images_success_count += 1
                            # Save the image to the correct location
                            folder_path = os.path.dirname(image_path)
                            os.makedirs(folder_path, exist_ok=True)
                            with open(image_path, "wb") as image_file:
                                logger.info(
                                    f"Fetched {image_url} and saved to {image_path}"
                                )
                                image_file.write(r.content)
                            time.sleep(0.5)
                    # Rewrite the HTML file to replace the paths
                    new_path = f"..{os.sep}" * depth
                    new_path = new_path[:-1]  # Remove the final separator
                    new_content = re.sub(
                        rf'href="https?://{original_domain}/\.shared/image\.html\?',
                        f'href="{new_path}',
                        content,
                    )
                    logger.info(f"Rewriting {path}")
                    f.seek(0)
                    f.write(new_content)
                    f.truncate()
                    modified_html_files_count += 1

    logger.info(f"""---
{total_html_files_count} total HTML files found
{modified_html_files_count} HTML files modified
{images_success_count} images fetched and saved
{images_failure_count} images failed to download
---
""")
    if len(image_errors):
        logger.info("Images that could not be fetched, by status code:")
        for status_code, urls in image_errors.items():
            logger.info(status_code)
            for url in urls:
                logger.info(url)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Finds instances of image.html redirects in a directory
        of HTML files, downloads the linked images, and updates the links.
        """
    )
    parser.add_argument(
        "--source", help="Path to the parent directory to crawl", required=True
    )
    parser.add_argument(
        "--original_domain",
        help="TypePad domain name e.g. joedoe.typepad.com",
        required=True,
    )
    args = parser.parse_args()
    logger.info(
        "Getting paths from '{}' for domain '{}'".format(
            args.source, args.original_domain
        )
    )
    main(args.source, args.original_domain)
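The trickiest part of the rewriting is turning each HTML file's depth below the source directory into a relative ../ prefix. A minimal sketch of that arithmetic; the paths are made-up examples:

# How depth below source_dir becomes a relative path prefix, as in the
# script above; paths are made-up examples (assuming a POSIX os.sep).
import os

source_dir = "/home/me/typepad/mysite.typepad.com/myblog"
dirpath = "/home/me/typepad/mysite.typepad.com/myblog/2009/03"

depth = dirpath.count(os.sep) - source_dir.count(os.sep) + 1  # 3
new_path = (f"..{os.sep}" * depth)[:-1]  # "../../..", no trailing separator

match = "/photos/uncategorized/an-image.jpg"
print(f'href="{new_path}{match}"')
# href="../../../photos/uncategorized/an-image.jpg"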
fix_popup_pages.py

import argparse
import logging
import os
import re

# A script to fix paths where JavaScript opens a page in a new window.
#
# If you have a directory that's an archive of your TypePad site at
#   typepad/mysite.typepad.com/myblog/
#
# then you would run
#   $ ./scripts/fix_popup_pages.py --source=typepad/mysite.typepad.com/myblog/ --original_domain=mysite.typepad.com
#
# It will find any instances of this:
#   <a href="../../blog_post.html" onclick="window.open('http://mysite.typepad.com/myblog/blog_post.html'
# and replace them with:
#   <a href="../../blog_post.html" onclick="window.open('../../blog_post.html'

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


def main(source_dir, original_domain):
    source_dir = f"{os.getcwd()}{os.sep}{source_dir}"
    if not os.path.isdir(source_dir):
        logger.error(
            "The source, {}, does not exist or is not a directory".format(source_dir)
        )
        exit()

    total_html_files_count = 0
    modified_html_files_count = 0

    for dirpath, dirs, files in os.walk(source_dir):
        for file in files:
            if not file.endswith(".html"):
                continue
            total_html_files_count += 1
            path = os.path.join(dirpath, file)
            with open(path, "r+") as f:
                content = f.read()
                # Matches links like:
                # <a href="../images/IMG_1499.html" onclick="window.open('http://mysite.typepad.com/myblog/images/IMG_1499.html'
                search_one = rf'<a href="(.*?)" onclick="window\.open\(\'https?://{original_domain}.*?\''
                # Matches links like:
                # <a onclick="window.open('http://mysite.typepad.com/myblog/images/IMG_1577.html','popup','width=2272,height=1704,scrollbars=no,resizable=no,toolbar=no,directories=no,location=no,menubar=no,status=no,left=0,top=0'); return false" href="../images/IMG_1577.html">
                search_two = rf'<a onclick="window\.open\(\'https?://{original_domain}[^"]*?\'([^>]*?)href="(.*?)">'
                if re.search(search_one, content) or re.search(search_two, content):
                    # At least one instance to change
                    new_content = re.sub(
                        search_one,
                        '<a href="\\1" onclick="window.open(\'\\1\'',
                        content,
                    )
                    new_content = re.sub(
                        search_two,
                        '<a onclick="window.open(\'\\2\'\\1href="\\2">',
                        new_content,
                    )
                    logger.info(f"Rewriting {path}")
                    f.seek(0)
                    f.write(new_content)
                    f.truncate()
                    modified_html_files_count += 1

    logger.info(f"""---
{total_html_files_count} total HTML files found
{modified_html_files_count} HTML files modified
---
""")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Fixes paths where JavaScript opens pages in new windows.
        """
    )
    parser.add_argument(
        "--source", help="Path to the parent directory to crawl", required=True
    )
    parser.add_argument(
        "--original_domain",
        help="TypePad domain name e.g. joedoe.typepad.com",
        required=True,
    )
    args = parser.parse_args()
    logger.info(
        "Getting paths from '{}' for domain '{}'".format(
            args.source, args.original_domain
        )
    )
    main(args.source, args.original_domain)
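A minimal sketch of the first substitution in action, run against a fabricated anchor tag like the one in the comments above:

# Demonstrating search_one's substitution; the tag and domain are made-up
# examples.
import re

original_domain = "mysite.typepad.com"
html = '<a href="../images/IMG_1499.html" onclick="window.open(\'http://mysite.typepad.com/myblog/images/IMG_1499.html\''
search_one = rf'<a href="(.*?)" onclick="window\.open\(\'https?://{original_domain}.*?\''
print(re.sub(search_one, '<a href="\\1" onclick="window.open(\'\\1\'', html))
# <a href="../images/IMG_1499.html" onclick="window.open('../images/IMG_1499.html'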