# Mirror a City of Albuquerque website with wget.
#
# Output lands beneath the current directory: wget -m stores files under a
# folder named after the host, and the crawl log is written to ./wget.log.

# set site URL
# The host is kept WITHOUT a trailing slash so that joining it with a path
# does not yield a doubled slash (e.g. "//robots.txt").
## city website (alternative target; uncomment these two lines and comment
## out the "city data" pair below to mirror it instead)
# site=www.cabq.gov
# url="https://${site}"
## city data
site=data.cabq.gov
url="http://${site}"

# get robots.txt
wget -m "${url}/robots.txt"

# mirror website
## optional extra flag, currently disabled: --save-headers
# --execute='robots=off' tells wget not to honour robots.txt during the crawl.
# --reject="index.html*" discards files whose names match that pattern.
# Both stdout and stderr of the crawl are captured in wget.log.
wget \
  --mirror \
  --convert-links \
  --execute='robots=off' \
  --adjust-extension \
  --page-requisites \
  --no-parent \
  --reject="index.html*" \
  --random-wait \
  "${url}/" > wget.log 2>&1
# Last active: November 18, 2023 09:11
# -
# -
# Save rwcitek/e372169d2e2e96dac4b5b941431ffc55 to your computer and use it in GitHub Desktop.
# City of ABQ sites
# Collect the HTTP response headers of every page listed in a site's sitemap.
#
# Working files are created beneath the current directory:
#   ${site}/                      robots.txt and sitemap file(s) saved by wget -m
#   sitemap.${site}/sitemap.xml   concatenated, decompressed sitemaps
#   sitemap.${site}/sitemap.tsv   one "<lastmod> TAB <loc>" row per entry
#   sitemap.${site}/.../*.HEAD    output of `curl -I` for each listed URL
# Relies on GNU tools (sed -r, xargs -d/-r/-P, zcat).

# set site URL
# Host name only, without a trailing slash, so joined URLs and paths do not
# contain "//".
site=www.cabq.gov
url="https://${site}"
out="sitemap.${site}"

# get robots.txt
wget -m "${url}/robots.txt"

# get the sitemap as XML
# tr turns any CR (CRLF line endings) into a newline and squeezes repeats so
# xargs receives clean URLs; -r skips wget when no Sitemap: line exists.
sed -ne '/^Sitemap:/ { s/^[^:]*: //p }' "${site}/robots.txt" |
  tr -s '\r\n' '\n' |
  xargs -r wget -m

mkdir -p "${out}"

# The downloaded sitemap.xml*.gz is treated as an index: every <loc> in it is
# fetched, and the (gzipped) responses are decompressed into one combined file.
zcat "${site}"/sitemap.xml*.gz |
  sed -nre '/<loc>/ { s/<.{3,8}>//g ; p }' |
  xargs -r -n 1 curl -s |
  zcat > "${out}/sitemap.xml"

# create a list of URLs from the sitemap
# Rows are emitted per </url> instead of pairing consecutive lines, so an
# entry that lacks the optional <lastmod> gets an empty first column rather
# than shifting every following row out of alignment.
awk '
  # Text between <tag> and </tag> on the current line, or "" if absent.
  function element(tag,    re) {
    re = "<" tag ">[^<]*</" tag ">"
    if (!match($0, re)) {
      return ""
    }
    return substr($0, RSTART + length(tag) + 2, RLENGTH - 2 * length(tag) - 5)
  }
  /<loc>/     { loc = element("loc") }
  /<lastmod>/ { mod = element("lastmod") }
  /<\/url>/   { if (loc != "") print mod "\t" loc; loc = ""; mod = "" }
' "${out}/sitemap.xml" > "${out}/sitemap.tsv"

# create folders
# Replace the scheme ("https://", "http://", ...) with the "sitemap." prefix
# and drop the last path component, leaving the directory that will hold the
# .HEAD file for each URL.
cut -f2 "${out}/sitemap.tsv" |
  sed -e 's#^[^/]*//#sitemap.#' -e 's#/[^/]*$##' |
  sort -u |
  xargs -r -d '\n' mkdir -p --

# fetch HEAD info, 10 requests in parallel
# Each URL and its destination file are handed to the shell as positional
# arguments ($1, $2) instead of being pasted into a command string, so
# characters such as & ; ? or quotes inside a URL are never interpreted as
# shell syntax. xargs -t echoes each command to stderr (./site.error).
# This runs in the foreground so the listing below sees the complete result.
cut -f2 "${out}/sitemap.tsv" |
  awk '{ dest = $0; sub("^[^/]*//", "sitemap.", dest); print $0; print dest ".HEAD" }' |
  xargs -r -d '\n' -n 2 -P 10 -t sh -c 'curl -s -I "$1" > "$2"' _ \
    > ./site.log 2> ./site.error

# traverse sitemap tree
# tail -n +1 prints every file in full, with a "==> name <==" banner when
# several files are given.
find "${out}" -type f -name '*.HEAD' -exec tail -n +1 {} + | more
# Sign up for free
# to join this conversation on GitHub.
# Already have an account?
# Sign in to comment