Created
February 10, 2025 00:43
-
-
Save MattGrayYes/940efeeacaf7f0a813d873a9ac0c2db2 to your computer and use it in GitHub Desktop.
Substack data downloads don't include the post images, only the HTML. This script reads all the HTML files in the current folder and downloads the image at each URL listed in every `srcset` attribute into the ./images folder.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Download every image referenced by a srcset attribute in the HTML files of
# the current folder into ./images.
#
# Output files are named images/<html-name>_<n>_<descriptor>.<extension>:
#   <html-name>   HTML file name without the .html suffix
#   <n>           1-based index of the srcset attribute within that file
#   <descriptor>  width/density descriptor of the candidate (e.g. 424w, 2x)
#
# Requires wget, which doesn't come as standard on a Mac. It can be installed
# via Homebrew.
# Written with help from ChatGPT-o3-mini-high

# Trap SIGINT (Ctrl+C) so the whole script exits when interrupted, instead of
# only the wget that happens to be running.
trap 'exit 130' INT

# Matches "URL <descriptor>" where the descriptor is a width (424w) or a pixel
# density (2x, 1.5x). Group 1 is the URL, group 2 the descriptor.
SRCSET_CANDIDATE_RE='^(.*[^[:space:]])[[:space:]]+([0-9]+(\.[0-9]+)?[wx])$'

# Print $1 with leading and trailing whitespace removed.
trim() {
  local text=$1
  text="${text#"${text%%[![:space:]]*}"}"
  text="${text%"${text##*[![:space:]]}"}"
  printf '%s\n' "$text"
}

# Print the URL part of one srcset candidate ($1), without its descriptor.
candidate_url() {
  local candidate
  candidate=$(trim "$1")
  if [[ "$candidate" =~ $SRCSET_CANDIDATE_RE ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  else
    printf '%s\n' "$candidate"
  fi
}

# Print the descriptor of one srcset candidate ($1). A candidate without a
# descriptor is reported as "1x" so the output file name stays well-formed.
candidate_descriptor() {
  local candidate
  candidate=$(trim "$1")
  if [[ "$candidate" =~ $SRCSET_CANDIDATE_RE ]]; then
    printf '%s\n' "${BASH_REMATCH[2]}"
  else
    printf '1x\n'
  fi
}

# Print the file extension of the URL in $1, ignoring any query string or
# fragment. Falls back to "bin" when the last path segment has no usable
# extension, so the result never contains a slash.
url_extension() {
  local path=${1%%[?#]*}
  local name=${path##*/}
  local ext=${name##*.}
  if [[ "$name" == *.* && "$ext" =~ ^[A-Za-z0-9]+$ ]]; then
    printf '%s\n' "$ext"
  else
    printf 'bin\n'
  fi
}

# Download all srcset images of all *.html files in the current folder.
# Returns 0 when every download succeeded (or there was nothing to do),
# 1 when wget is missing, ./images cannot be created, or a download failed.
main() {
  local file base srcset candidate descriptor url extension outfile
  local image_index
  local failures=0
  local -a html_files=()

  mkdir -p images || return 1

  # Collect real files only: without a match the glob stays literal.
  for file in *.html; do
    [[ -f "$file" ]] && html_files+=("$file")
  done
  if (( ${#html_files[@]} == 0 )); then
    printf 'No .html files found in the current folder.\n' >&2
    return 0
  fi

  if ! command -v wget >/dev/null 2>&1; then
    printf 'wget is required but was not found in PATH.\n' >&2
    return 1
  fi

  for file in "${html_files[@]}"; do
    base="${file%.html}"
    image_index=1
    # Redirect into the loops instead of piping so they run in this shell and
    # image_index/failures keep their values.
    while IFS= read -r srcset; do
      while IFS= read -r candidate; do
        candidate=$(trim "$candidate")
        [[ -n "$candidate" ]] || continue

        descriptor=$(candidate_descriptor "$candidate")
        url=$(candidate_url "$candidate")
        extension=$(url_extension "$url")
        outfile="images/${base}_${image_index}_${descriptor}.${extension}"

        printf 'Downloading %s image from %s\n' "$descriptor" "$base"
        printf 'URL: %s\n' "$url"
        printf 'Outfile: %s\n' "$outfile"
        printf 'SRC: %s\n' "$candidate"
        printf ' \n'
        # stdin is detached so wget can never consume the loop's input.
        if wget -q -O "$outfile" "$url" </dev/null; then
          printf 'wget done\n'
        else
          printf 'wget failed for %s\n' "$url" >&2
          # wget -O creates the file before downloading; drop the empty stub.
          rm -f -- "$outfile"
          failures=$((failures + 1))
        fi
        printf ' \n'
      # Split candidates on ", " in the shell itself: "\n" in a sed
      # replacement is a GNU extension and does not work with macOS sed.
      done <<< "${srcset//, /$'\n'}"
      image_index=$((image_index + 1))
    done < <(grep -oE -- 'srcset="[^"]+"' "$file" | sed 's/srcset="//; s/"$//')
  done

  if (( failures > 0 )); then
    return 1
  fi
  return 0
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment