# Mirror the City of Albuquerque website or its open-data site with wget.
# The mirror is written below the current directory; all wget output of the
# mirror run goes to ./wget.log.

# set site URL
# Enable exactly ONE site/url pair. Originally both pairs were assigned in
# sequence, so the second (city data) silently won; that effective choice
# is kept here and the overridden pair is commented out.
## city website
# site=www.cabq.gov
# url="https://${site}"
## city data
site=data.cabq.gov
url="http://${site}"

# get robots.txt
# site carries no trailing slash, so the request path is /robots.txt rather
# than //robots.txt.
wget -m "${url}/robots.txt"

# mirror website
## optional extra flag, disabled: --save-headers
# robots=off makes wget ignore robots.txt for the crawl itself;
# --random-wait varies the delay between requests.
wget \
  --mirror \
  --convert-links \
  --execute='robots=off' \
  --adjust-extension \
  --page-requisites \
  --no-parent \
  --reject="index.html*" \
  --random-wait \
  "${url}/" > wget.log 2>&1

Get HTTP HEAD information (response headers) for the URLs listed in the sitemap file

# set site URL
# site keeps its trailing slash (later commands build local paths as
# "${site}"/...), which means url already ends in "/".
site=www.cabq.gov/
url=https://"${site}"

# get robots.txt
# Append the file name directly: "${url}"/robots.txt would request
# https://www.cabq.gov//robots.txt (double slash).
wget -m "${url}robots.txt"

# get the sitemap as XML
# Step 1: pull every "Sitemap: <url>" entry out of the mirrored robots.txt
# and mirror those files too. tr turns CR and LF into single newlines so a
# CRLF robots.txt does not leave a stray \r on the URL.
# xargs -r: do not run wget at all when robots.txt lists no sitemap
# (GNU xargs would otherwise run it once with no URL).
sed -ne '/^Sitemap:/ { s/^[^:]*: //p }' "${site}"/robots.txt |
  tr -s '\r\n' '\n' |
  xargs -r wget -m

# Step 2: the mirrored sitemap.xml*.gz is read as an index; every <loc> it
# lists is downloaded and the downloads (expected to be gzip-compressed) are
# decompressed into one combined sitemap.xml.
# curl -f: emit nothing on HTTP errors instead of piping an HTML error page
#          into zcat; -L: follow redirects; -sS: quiet, but still report errors.
mkdir -p sitemap."${site}"
zcat "${site}"/sitemap.xml*.gz |
  sed -nre '/<loc>/ { s/<.{3,8}>//g ; p }' |
  xargs -r -n 1 curl -fsSL |
  zcat > sitemap."${site}"/sitemap.xml


# create a list of URLs from the sitemap
# Output: sitemap.tsv with one line per <url> entry: "<lastmod>\t<loc>".
# Values are collected per <url> element and emitted at its closing tag, so
# an entry without <lastmod> yields an empty first column instead of shifting
# every following pair (which blind line pairing with `paste - -` would do).
# Like the sed-based version, this expects <loc>, <lastmod> and </url> on
# separate lines.
awk '
  /<loc>/     { loc = $0; gsub(/[ \t]*<[^>]*>/, "", loc) }   # strip tags + indent
  /<lastmod>/ { mod = $0; gsub(/[ \t]*<[^>]*>/, "", mod) }
  /<\/url>/   { if (loc != "") print mod "\t" loc; loc = ""; mod = "" }
' sitemap."${site}"/sitemap.xml > sitemap."${site}"/sitemap.tsv


# create folders
# For every URL in column 2 of sitemap.tsv, create the local directory that
# will hold its .HEAD file: drop "scheme://", drop the last path component,
# and prefix the result with "sitemap.".
# xargs -d '\n': treat each line as one path, so quotes, backslashes or
#                spaces in a URL are not reinterpreted;
# xargs -r:      skip mkdir entirely when the list is empty.
cut -f2 sitemap."${site}"/sitemap.tsv |
  cut -d/ -f3- |        # strip "scheme://"
  rev |
  cut -d/ -f2- |        # on the reversed line: strip the last path component
  rev |
  sed 's/^/sitemap./' |
  sort -u |
  xargs -r -d '\n' mkdir -p


# create commands to fetch HEAD info and run commands in parallel
# For every URL in column 2 of sitemap.tsv, run `curl -s -I <url>` and store
# the response headers in sitemap.<host>/<path>.HEAD, 10 requests at a time.
# The URL is passed to bash as positional parameter $1 rather than being
# pasted into the command string, so characters such as & ; ? $ ( ) in a URL
# are never interpreted by the shell.
# ${1#*://} strips whatever scheme the URL has, matching how the folder names
# are derived (not only https://).
# xargs -t traces each command on stderr, which ends up in ./site.error.
# The whole pipeline runs in the background (&).
cut -f2 sitemap."${site}"/sitemap.tsv |
  xargs -r -d '\n' -n 1 -P 10 -t \
    bash -c 'curl -s -I "$1" > "sitemap.${1#*://}.HEAD"' _ \
    > ./site.log 2> ./site.error &


# traverse sitemap tree
# Page through every saved .HEAD file; tail -n +1 prints each file in full,
# preceded by a "==> name <==" header when it is given more than one file.
# find -exec ... + hands the names to tail directly, so paths containing
# spaces or quotes are not split the way `find | xargs` would split them.
# The HEAD fetch is started in the background, so run this once it is done.
find sitemap."${site}" -type f -name '*.HEAD' -exec tail -n +1 {} + | more

rwcitek/cabq.data.mirror.md

Mirror the City of Albuquerque website or the city's open-data site

Get HTTP HEAD information (response headers) for the URLs listed in the sitemap file