A quick 'n dirty website mirror script
#!/bin/bash
# A quick 'n dirty website mirror script
# by @singe
# Ideally, wget -r should mirror a site, but modern websites are complex; this
# tries to fill the gaps for what typically gets missed.
# It's been tested on 3 or 4 sites, and likely needs more tricks added.
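# Usage (assuming the script is saved as mirror.sh and made executable), e.g.:
#   ./mirror.sh example.com 2
# The first argument is the domain to mirror, the second the recursion depth
# (defaults to 2 if omitted or less than 1).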
sourcedomain="$1"
depth="$2"
if [[ $depth -lt 1 ]]; then
  depth="2"
fi
# Do the first fetch of the main page
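# -r recurses, -N skips files that aren't newer than the local copy, -l caps
# the recursion depth, -k rewrites links for local browsing, and
# --no-remove-listing keeps any .listing files wget generates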
wget -r -N -l $depth -k --no-remove-listing https://$sourcedomain/
# Look for any pesky assets that weren't fetched
for x in $(find $sourcedomain -type f -exec grep -o "https:\/\/$sourcedomain\/[^\'\"\ ]*" {} \;); do
  echo Fetching $x
  wget -r -N -l $depth -k --no-remove-listing $x
done
# Replace any references to the sourcedomain that were missed
LC_ALL=C find $sourcedomain -type f -exec sed -i.bak "s/https:\/\/$sourcedomain\//\//g" {} \;
# Remove integrity hashes for modified includes
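# (browsers refuse to load an include whose subresource integrity hash no
# longer matches, and the rewrites above change the file contents)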
LC_ALL=C find $sourcedomain -type f -exec sed -i.bak "s/integrity=[\"\'][^\"\']*[\"\']//g" {} \;
# Delete backup files left over from sed
find $sourcedomain -type f -name "*.bak" -exec rm {} \;
# Remove parameters from file names
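# (this assumes GNU parallel; it strips everything from the first '?' onward,
# e.g. style.css?v=3 becomes style.css)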
find $sourcedomain -type f -iname "*\?*"|parallel 'a={}; b=$(echo $a|sed "s/^\([^\?]*\).*$/\1/"); mv $a $b'
# Run a webserver, fetch the mirror and see what assets we need to pull
# Run it more than once to get second order assets
for i in $(seq 1 "$depth"); do
  cd $sourcedomain
  # We can't do our grep|cut here so just dump it in a file
  python3 -m http.server 2> notfound &
  pid="$!"
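  # 'open' is macOS-specific; on Linux something like xdg-open should work instead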
  open http://localhost:8000/
  # Give time for the browser to open and fetch the page
  sleep 4
  kill $pid
  # Find what 404'ed
  grep -o "GET \/.* 404 -" notfound|cut -d' ' -f2|sort -u > fetch
  cd ..
  # Fetch the missing assets that 404'ed
  for x in $(cat $sourcedomain/fetch); do
    echo Fetching $x
    wget -r -N -l $depth -k --no-remove-listing https://$sourcedomain$x
  done
  # Same as the fixups above
  LC_ALL=C find $sourcedomain -type f -exec sed -i.bak "s/https:\/\/$sourcedomain\//\//g" {} \;
  LC_ALL=C find $sourcedomain -type f -exec sed -i.bak "s/integrity=[\"\'][^\"\']*[\"\']//g" {} \;
  find $sourcedomain -type f -name "*.bak" -exec rm {} \;
  find $sourcedomain -type f -iname "*\?*"|parallel 'a={}; b=$(echo $a|sed "s/^\([^\?]*\).*$/\1/"); mv $a $b'
done