Skip to content

Instantly share code, notes, and snippets.

@philgyford
Last active September 13, 2025 13:36
Show Gist options
  • Save philgyford/3cd3c7051b09c97958cde91f7d19b9fc to your computer and use it in GitHub Desktop.
Some Python scripts for fixing up things in an archive of a TypePad site created using SiteSucker. Run `pip install curl-cffi` first. See https://www.gyford.com/phil/writing/2025/09/13/typepad/ for more info.
import argparse
import os
import logging
import re
from curl_cffi import requests
import time
# A script to download missing images used in href and src attributes
#
# If you have a directory that's an archive of your TypePad site at
# typepad/mysite.typepad.com/myblog/
#
# If you run
# $ ./scripts/fix_href_src_images.py --source=typepad/mysite.typepad.com/myblog/ --original_domain=mysite.typepad.com
#
# It will find attributes something like this:
# src="https://mysite.typepad.com/.a/6a00d83451d49569e2010535f284ca970b-800wi"
# href="https://mysite.typepad.com/.a/6a00d83451d49569e2010535c4c058970c-pi"
# and download the images, saving them to typepad/mysite.typepad.com/a/ with file extensions.
# and change the links accordingly.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


def main(source_dir, original_domain):
    """Download TypePad ``/.a/`` images used in src/href attributes.

    Walks ``source_dir`` for ``.html`` files, finds attributes like
    ``src="https://{original_domain}/.a/<hash>-800wi"``, downloads each image
    into ``<source_dir>/../a/`` (named from the Content-Disposition header,
    which includes a file extension) and rewrites the attributes to point at
    the local copies via a relative path.

    source_dir: path to the archive directory, relative to the CWD.
    original_domain: the original TypePad domain, e.g. mysite.typepad.com.
    """
    source_dir = f"{os.getcwd()}{os.sep}{source_dir}"
    if not os.path.isdir(source_dir):
        logger.error(
            "The source, {}, does not exist or is not a directory".format(source_dir)
        )
        exit()

    total_html_files_count = 0
    modified_html_files_count = 0
    images_success_count = 0
    images_failure_count = 0
    # Maps an HTTP status code (or "error" for non-HTTP failures) to the
    # list of image URLs that failed with it.
    image_errors = {}

    for dirpath, dirs, files in os.walk(source_dir):
        for file in files:
            if not file.endswith(".html"):
                continue
            total_html_files_count += 1
            path = os.path.join(dirpath, file)
            # How many levels below source_dir this file is; used to build
            # the "../" prefix for the rewritten relative links.
            depth = dirpath.count(os.sep) - source_dir.count(os.sep) + 1
            with open(path, "r+") as f:
                content = f.read()
                matches = re.findall(
                    rf' (?:src|href)="https?\:\/\/{original_domain}\/\.a\/([a-f0-9]+?-(?:\d\d0wi|pi))"',
                    content,
                )
                if len(matches) > 0:
                    logger.info(f"---\nGetting images from {path}")
                    # Will contain tuples like:
                    # ("original_filename-600wi", "new_filename-600wi.jpg")
                    filename_replacements = []
                    # There will probably be duplicates, so set() to remove, so
                    # we don't download things twice:
                    for match in set(matches):
                        # Not including filename, because the downloaded file
                        # will have an extension:
                        image_path = f"{source_dir}{os.sep}..{os.sep}a{os.sep}"
                        image_url = f"https://{original_domain}/.a/{match}"
                        try:
                            r = requests.get(
                                image_url, timeout=30, impersonate="chrome"
                            )
                            r.raise_for_status()
                        except requests.exceptions.HTTPError as e:
                            # We have a response here, so its status is usable.
                            logger.error(f"Error fetching {image_url}: {e}")
                            image_errors.setdefault(str(r.status_code), []).append(
                                image_url
                            )
                            images_failure_count += 1
                            continue
                        except requests.exceptions.RequestException as e:
                            # Bug fix: on connection errors etc. no response
                            # object exists, so `r` may be unbound here — using
                            # r.status_code raised NameError. Group these
                            # failures under a fixed "error" key instead.
                            logger.error(f"Exception fetching {image_url}: {e}")
                            image_errors.setdefault("error", []).append(image_url)
                            images_failure_count += 1
                            continue
                        cd = r.headers.get("content-disposition", None)
                        if cd:
                            # Get the actual filename of the downloaded file,
                            # which will include the extension:
                            filename_matches = re.findall("filename=(.+)", cd)
                            if len(filename_matches) > 0:
                                new_filename = filename_matches[0]
                                images_success_count += 1
                                # Save the image to the correct location
                                folder_path = os.path.dirname(image_path)
                                os.makedirs(folder_path, exist_ok=True)
                                image_path += new_filename
                                if not os.path.isfile(image_path):
                                    # Don't already have this file on disk
                                    with open(image_path, "wb") as image_file:
                                        logger.info(
                                            f"Fetched {image_url} and saved to {image_path}"
                                        )
                                        image_file.write(r.content)
                                filename_replacements.append((match, new_filename))
                        # Be polite to the server between downloads.
                        time.sleep(1.0)
                    if len(filename_replacements) > 0:
                        # Rewrite the HTML file to replace the paths
                        new_path = f"..{os.sep}" * (depth)
                        for replacement in filename_replacements:
                            content = re.sub(
                                rf' (src|href)="https?\:\/\/{original_domain}\/\.a\/{replacement[0]}"',
                                rf' \1="{new_path}a/{replacement[1]}"',
                                content,
                            )
                        logger.info(f"Rewriting {path}")
                        f.seek(0)
                        f.write(content)
                        f.truncate()
                        modified_html_files_count += 1

    logger.info(f"""---
{total_html_files_count} total HTML files found
{modified_html_files_count} HTML files modified
{images_success_count} images fetched and saved
{images_failure_count} images failed to download
---
""")
    if len(image_errors):
        logger.info("Images that could not be fetched, by status code:")
        for status_code, urls in image_errors.items():
            logger.info(status_code)
            for url in urls:
                logger.info(url)
if __name__ == "__main__":
    # Bug fix: the description previously said "Finds og:image and
    # twitter:image meta tags" — copy-pasted from the meta-images script.
    # This script fixes images in href and src attributes.
    parser = argparse.ArgumentParser(
        description="""Finds images used in href and src attributes,
downloads the images and fixes the paths.
"""
    )
    parser.add_argument(
        "--source", help="Path to the parent directory to crawl", required=True
    )
    parser.add_argument(
        "--original_domain",
        help="TypePad original_domain name e.g. joedoe.typepad.com",
        required=True,
    )
    args = parser.parse_args()
    logger.info(
        "Getting paths from '{}' for original_domain '{}'".format(
            args.source, args.original_domain
        )
    )
    main(args.source, args.original_domain)
import argparse
import os
import logging
import re
from curl_cffi import requests
import time
# A script to download missing images used in meta tags.
#
# If you have a directory that's an archive of your TypePad site at
# typepad/mysite.typepad.com/myblog/
#
# If you run
# $ ./scripts/fix_meta_images.py --source=typepad/mysite.typepad.com/myblog/ --original_domain=mysite.typepad.com
#
# It will find <meta> tags like:
# <meta property="og:image" content="https://www.mynewsite.com/blog/typepad/mysite.typepad.com/.a/6a00d83451d49569e2010535c4c058970c-600wi" />
# <meta name="twitter:image" content="https://www.mynewsite.com/blog/typepad/mysite.typepad.com/.a/6a00d83451d49569e2010535c4c058970c-600wi" />
# and download the image, saving it to typepad/mysite.typepad.com/a/6a00d83451d49569e2010535c4c058970c-600w.<ext>
# and change the links accordingly.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


def main(source_dir, original_domain):
    """Download TypePad images used in og:image / twitter:image meta tags.

    Walks ``source_dir`` for ``.html`` files, finds meta tags whose content
    URL points at ``/{original_domain}/.a/<name>-600wi``, downloads each
    image into ``<source_dir>/../a/`` (named from the Content-Disposition
    header, which includes a file extension) and rewrites the meta tags to
    point at the local copies.

    source_dir: path to the archive directory, relative to the CWD.
    original_domain: the original TypePad domain, e.g. mysite.typepad.com.
    """
    source_dir = f"{os.getcwd()}{os.sep}{source_dir}"
    if not os.path.isdir(source_dir):
        logger.error(
            "The source, {}, does not exist or is not a directory".format(source_dir)
        )
        exit()

    total_html_files_count = 0
    modified_html_files_count = 0
    images_success_count = 0
    images_failure_count = 0
    # Maps an HTTP status code (or "error" for non-HTTP failures) to the
    # list of image URLs that failed with it.
    image_errors = {}

    for dirpath, dirs, files in os.walk(source_dir):
        for file in files:
            if not file.endswith(".html"):
                continue
            total_html_files_count += 1
            path = os.path.join(dirpath, file)
            with open(path, "r+") as f:
                content = f.read()
                matches = re.findall(
                    rf':image" content\="https?\:\/\/.*?\/{original_domain}\/\.a\/(.*?-600wi)"',
                    content,
                )
                if len(matches) > 0:
                    logger.info(f"---\nGetting images from {path}")
                    # Will contain tuples like:
                    # ("original_filename-600wi", "new_filename-600wi.jpg")
                    filename_replacements = []
                    # There will probably be duplicates, so set() to remove, so
                    # we don't download things twice:
                    for match in set(matches):
                        # Not including filename, because the downloaded file
                        # will have an extension:
                        image_path = f"{source_dir}{os.sep}..{os.sep}a{os.sep}"
                        image_url = f"https://{original_domain}/.a/{match}"
                        # Bug fix: the old code tested os.path.isfile() on this
                        # directory prefix (always False, since it ends in a
                        # separator). The real filename is only known after the
                        # download, so the existence check now happens just
                        # before writing, below — as in fix_href_src_images.
                        try:
                            r = requests.get(
                                image_url, timeout=30, impersonate="chrome"
                            )
                            r.raise_for_status()
                        except requests.exceptions.HTTPError as e:
                            # We have a response here, so its status is usable.
                            logger.error(f"Error fetching {image_url}: {e}")
                            image_errors.setdefault(str(r.status_code), []).append(
                                image_url
                            )
                            images_failure_count += 1
                            continue
                        except requests.exceptions.RequestException as e:
                            # Bug fix: on connection errors etc. no response
                            # object exists, so `r` may be unbound here — using
                            # r.status_code raised NameError. Group these
                            # failures under a fixed "error" key instead.
                            logger.error(f"Exception fetching {image_url}: {e}")
                            image_errors.setdefault("error", []).append(image_url)
                            images_failure_count += 1
                            continue
                        cd = r.headers.get("content-disposition", None)
                        if cd:
                            # Get the actual filename of the downloaded file,
                            # which will include the extension:
                            filename_matches = re.findall("filename=(.+)", cd)
                            if len(filename_matches) > 0:
                                new_filename = filename_matches[0]
                                images_success_count += 1
                                # Save the image to the correct location
                                folder_path = os.path.dirname(image_path)
                                os.makedirs(folder_path, exist_ok=True)
                                image_path += new_filename
                                if not os.path.isfile(image_path):
                                    # Don't already have this file on disk
                                    with open(image_path, "wb") as image_file:
                                        logger.info(
                                            f"Fetched {image_url} and saved to {image_path}"
                                        )
                                        image_file.write(r.content)
                                filename_replacements.append((match, new_filename))
                        # Be polite to the server between downloads.
                        time.sleep(1.0)
                    if len(filename_replacements) > 0:
                        # Rewrite the HTML file to replace the paths
                        for replacement in filename_replacements:
                            content = re.sub(
                                rf':image" content\="https?\:\/\/(.*?\/{original_domain})\/\.a\/{replacement[0]}"',
                                rf':image" content="https://\1/a/{replacement[1]}"',
                                content,
                            )
                        logger.info(f"Rewriting {path}")
                        f.seek(0)
                        f.write(content)
                        f.truncate()
                        modified_html_files_count += 1

    logger.info(f"""---
{total_html_files_count} total HTML files found
{modified_html_files_count} HTML files modified
{images_success_count} images fetched and saved
{images_failure_count} images failed to download
---
""")
    if len(image_errors):
        logger.info("Images that could not be fetched, by status code:")
        for status_code, urls in image_errors.items():
            logger.info(status_code)
            for url in urls:
                logger.info(url)
if __name__ == "__main__":
    # Build the command-line interface.
    arg_parser = argparse.ArgumentParser(
        description="""Finds og:image and twitter:image meta tags,
downloads the image and fixes the paths.
"""
    )
    arg_parser.add_argument(
        "--source", help="Path to the parent directory to crawl", required=True
    )
    arg_parser.add_argument(
        "--original_domain",
        help="TypePad original_domain name e.g. joedoe.typepad.com",
        required=True,
    )
    options = arg_parser.parse_args()
    # Announce what we're about to process before doing any work.
    logger.info(
        "Getting paths from '{}' for original_domain '{}'".format(
            options.source, options.original_domain
        )
    )
    main(options.source, options.original_domain)
import argparse
import os
import logging
import re
from curl_cffi import requests
import time
# A script to fix one kind of pop-up images in a directory of TypePad HTML files
#
# If you have a directory that's an archive of your TypePad site at
# typepad/mysite.typepad.com/myblog/
#
# Then you would run
# $ ./scripts/fix_popup_images.py --source=typepad/mysite.typepad.com/myblog/ --original_domain=mysite.typepad.com
#
# It would find any instances of links like this:
# href="https://mysite.typepad.com/.shared/image.html?/photos/uncategorized/an-image.jpg"
# and download the image, saving it to typepad/mysite.typepad.com/photos/uncategorized/an-image.jpg
# and change the link to something like:
# href="../../photos/uncategorized/an-image.jpg"
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


def main(source_dir, original_domain):
    """Fix TypePad ``/.shared/image.html`` pop-up image links.

    Walks ``source_dir`` for ``.html`` files, finds links like
    ``href="https://{original_domain}/.shared/image.html?/photos/.../pic.jpg"``,
    downloads the referenced image to the matching local path under
    ``<source_dir>/..`` and rewrites the link to a relative local path.

    source_dir: path to the archive directory, relative to the CWD.
    original_domain: the original TypePad domain, e.g. mysite.typepad.com.
    """
    source_dir = f"{os.getcwd()}{os.sep}{source_dir}"
    if not os.path.isdir(source_dir):
        logger.error(
            "The source, {}, does not exist or is not a directory".format(source_dir)
        )
        exit()

    total_html_files_count = 0
    modified_html_files_count = 0
    images_success_count = 0
    images_failure_count = 0
    # Maps an HTTP status code (or "error" for non-HTTP failures) to the
    # list of image URLs that failed with it.
    image_errors = {}

    for dirpath, dirs, files in os.walk(source_dir):
        for file in files:
            if not file.endswith(".html"):
                continue
            total_html_files_count += 1
            path = os.path.join(dirpath, file)
            # How many levels below source_dir this file is; used to build
            # the "../" prefix for the rewritten relative links.
            depth = dirpath.count(os.sep) - source_dir.count(os.sep) + 1
            with open(path, "r+") as f:
                content = f.read()
                # Bug fix: the "." in "image.html" is now escaped in both this
                # pattern and the rewrite pattern below, so it can't match an
                # arbitrary character there.
                matches = re.findall(
                    rf'href\="https?\:\/\/{original_domain}\/\.shared\/image\.html\?(.*?)"',
                    content,
                )
                if len(matches) > 0:
                    logger.info(f"---\nGetting images from {path}")
                    for match in matches:
                        # match is the image's site-relative path, e.g.
                        # "/photos/uncategorized/an-image.jpg"
                        image_path = f"{source_dir}{os.sep}..{match}"
                        if not os.path.isfile(image_path):
                            # Don't already have this file on disk
                            image_url = f"https://{original_domain}{match}"
                            try:
                                r = requests.get(
                                    image_url, timeout=30, impersonate="chrome"
                                )
                                r.raise_for_status()
                            except requests.exceptions.HTTPError as e:
                                # We have a response, so its status is usable.
                                logger.error(f"Error fetching {image_url}: {e}")
                                image_errors.setdefault(str(r.status_code), []).append(
                                    image_url
                                )
                                images_failure_count += 1
                                continue
                            except requests.exceptions.RequestException as e:
                                # Bug fix: on connection errors etc. no
                                # response object exists, so `r` may be unbound
                                # here — using r.status_code raised NameError.
                                # Group these under a fixed "error" key.
                                logger.error(f"Exception fetching {image_url}: {e}")
                                image_errors.setdefault("error", []).append(image_url)
                                images_failure_count += 1
                                continue
                            images_success_count += 1
                            # Save the image to the correct location
                            folder_path = os.path.dirname(image_path)
                            os.makedirs(folder_path, exist_ok=True)
                            with open(image_path, "wb") as image_file:
                                logger.info(
                                    f"Fetched {image_url} and saved to {image_path}"
                                )
                                image_file.write(r.content)
                            # Be polite to the server between downloads.
                            time.sleep(0.5)
                    # Rewrite the HTML file to replace the paths
                    new_path = f"..{os.sep}" * (depth)
                    new_path = new_path[:-1]  # Remove final slash
                    new_content = re.sub(
                        rf'href\="https?\:\/\/{original_domain}\/\.shared\/image\.html\?',
                        f'href="{new_path}',
                        content,
                    )
                    logger.info(f"Rewriting {path}")
                    f.seek(0)
                    f.write(new_content)
                    f.truncate()
                    modified_html_files_count += 1

    logger.info(f"""---
{total_html_files_count} total HTML files found
{modified_html_files_count} HTML files modified
{images_success_count} images fetched and saved
{images_failure_count} images failed to download
---
""")
    if len(image_errors):
        logger.info("Images that could not be fetched, by status code:")
        for status_code, urls in image_errors.items():
            logger.info(status_code)
            for url in urls:
                logger.info(url)
if __name__ == "__main__":
    # Bug fix: "upates" -> "updates" in the user-facing description.
    parser = argparse.ArgumentParser(
        description="""Finds instances of image.html redirects in a directory
of HTML files, downloads the linked image, and updates the link
"""
    )
    parser.add_argument(
        "--source", help="Path to the parent directory to crawl", required=True
    )
    parser.add_argument(
        "--original_domain",
        help="TypePad domain name e.g. joedoe.typepad.com",
        required=True,
    )
    args = parser.parse_args()
    logger.info(
        "Getting paths from '{}' for domain '{}'".format(
            args.source, args.original_domain
        )
    )
    main(args.source, args.original_domain)
import argparse
import os
import logging
import re
# A script to fix paths when JavaScript opens in a new window.
#
# If you have a directory that's an archive of your TypePad site at
# typepad/mysite.typepad.com/myblog/
#
# Then you would run
# $ ./scripts/fix_popup_pages.py --source=typepad/mysite.typepad.com/myblog/ --original_domain=mysite.typepad.com
#
# It would find any instances of this:
# <a href="../../blog_post.html" onclick="window.open('http://mysite.typepad.com/myblog/blog_post.html'
# and replace it with:
# <a href="../../blog_post.html" onclick="window.open('../../blog_post.html'
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)


def main(source_dir, original_domain):
    """Fix absolute URLs inside JavaScript window.open() pop-up links.

    Walks ``source_dir`` for ``.html`` files and, wherever an ``<a>`` tag has
    both a relative ``href`` and an ``onclick="window.open('http://...')"``
    pointing at ``original_domain``, replaces the absolute URL inside
    window.open() with the tag's own relative href.

    source_dir: path to the archive directory, relative to the CWD.
    original_domain: the original TypePad domain, e.g. mysite.typepad.com.
    """
    source_dir = f"{os.getcwd()}{os.sep}{source_dir}"
    if not os.path.isdir(source_dir):
        logger.error(
            "The source, {}, does not exist or is not a directory".format(source_dir)
        )
        exit()

    total_html_files_count = 0
    modified_html_files_count = 0

    for dirpath, dirs, files in os.walk(source_dir):
        for file in files:
            if not file.endswith(".html"):
                continue
            total_html_files_count += 1
            path = os.path.join(dirpath, file)
            with open(path, "r+") as f:
                content = f.read()
                # Form 1 — href first, then onclick, e.g.:
                # <a href="../images/IMG_1499.html" onclick="window.open('http://mysite.typepad.com/myblog/images/IMG_1499.html'
                # Bug fix: the "." in "window.open" is now escaped in both
                # patterns, so it can't match an arbitrary character there.
                search_one = rf'<a href\="(.*?)" onclick\="window\.open\(\'https?\://{original_domain}.*?\''
                # Form 2 — onclick first, then href, e.g.:
                # <a onclick="window.open('http://mysite.typepad.com/myblog/images/IMG_1577.html','popup','width=2272,height=1704,scrollbars=no,resizable=no,toolbar=no,directories=no,location=no,menubar=no,status=no,left=0,top=0'); return false" href="../images/IMG_1577.html">
                search_two = rf'<a onclick\="window\.open\(\'https?\://{original_domain}[^"]*?\'([^>]*?)href\="(.*?)">'
                if re.search(search_one, content) or re.search(search_two, content):
                    # At least one instance to change: reuse the tag's own
                    # relative href as the window.open() argument.
                    new_content = re.sub(
                        search_one,
                        '<a href="\\1" onclick="window.open(\'\\1\'',
                        content,
                    )
                    new_content = re.sub(
                        search_two,
                        '<a onclick="window.open(\'\\2\'\\1href="\\2">',
                        new_content,
                    )
                    logger.info(f"Rewriting {path}")
                    f.seek(0)
                    f.write(new_content)
                    f.truncate()
                    modified_html_files_count += 1

    logger.info(f"""---
{total_html_files_count} total HTML files found
{modified_html_files_count} HTML files modified
---
""")
if __name__ == "__main__":
    # Build the command-line interface.
    arg_parser = argparse.ArgumentParser(
        description="""Fixes paths for JavaScript opening new windows
"""
    )
    arg_parser.add_argument(
        "--source", help="Path to the parent directory to crawl", required=True
    )
    arg_parser.add_argument(
        "--original_domain",
        help="TypePad domain name e.g. joedoe.typepad.com",
        required=True,
    )
    options = arg_parser.parse_args()
    # Announce what we're about to process before doing any work.
    logger.info(
        "Getting paths from '{}' for domain '{}'".format(
            options.source, options.original_domain
        )
    )
    main(options.source, options.original_domain)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment