Last active
April 29, 2021 08:31
-
-
Save davidalger/f6d2ffe440adc98ce3520d718c81ac4b to your computer and use it in GitHub Desktop.
Script to crawl the storefront and warm the cache two levels deep on Magento demo Pods, intended to be run via a Kubernetes CronJob spec
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
set -euo pipefail

# Crawl FRONT_URL and every .html link found on it (two levels deep) to
# warm the page cache. Intended to run inside a Pod via a CronJob spec.
# FRONT_URL may be overridden via the environment.
FRONT_URL="${FRONT_URL:-https://app.exampleproject.test/}"

echo "==> [$(date +%H:%M:%S)] waiting on readiness"
ELAPSED_SECONDS=0
while : ; do
  ELAPSED_SECONDS=$((ELAPSED_SECONDS + 2))

  # curl exits non-zero while the app is still starting; without '|| true'
  # pipefail would propagate that into the assignment and 'set -e' would
  # kill the script during the very window this loop is meant to ride out.
  RESPONSE_CODE="$(curl -sI "${FRONT_URL}" 2>/dev/null | head -n1 | awk '{print $2}' || true)"

  # String comparison with a safe default: RESPONSE_CODE is empty until the
  # app answers, and '-eq' on an empty/non-numeric value is an error.
  if [ "${RESPONSE_CODE:-000}" = "200" ] || [ "${ELAPSED_SECONDS}" -gt 1800 ]; then
    break
  fi
  printf "."
  sleep 2
done
echo

# Level 1: collect all .html links on the front page ('\.' so the dot is
# literal; '|| true' in case the page yields no matches).
echo "==> [$(date +%H:%M:%S)] ${FRONT_URL}"
URL_LIST="$(curl -s "${FRONT_URL}" | grep -Eo 'href="[^\"]+"' \
    | grep -Eo '(http|https)://[^#"]+' | grep '\.html' | sort -n | uniq || true)"$'\n'

# Level 2: fetch each level-1 page and append the .html links it contains.
# The for-loop word list is expanded once, so appending inside the loop does
# not extend the iteration — that is what bounds the crawl to two levels.
for url in ${URL_LIST}; do
    echo "==> [$(date +%H:%M:%S)] ${url}"
    URL_LIST="${URL_LIST}$(curl -s "${url}" | grep -Eo 'href="[^\"]+"' \
        | grep -Eo '(http|https)://[^#"]+' | grep '\.html$' || true)"$'\n'
done

# De-duplicate, then request every collected URL to warm the cache.
URL_LIST="$(echo "${URL_LIST}" | sort -n | uniq)"
for url in ${URL_LIST}; do
    echo "==> [$(date +%H:%M:%S)] ${url}"
    curl -s "${url}" >/dev/null || true
done
echo "==> [$(date +%H:%M:%S)] crawl complete"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
set -euo pipefail

# Crawl FRONT_URL two levels deep and additionally fetch every product
# cache image referenced on the crawled pages, warming both the page cache
# and the catalog image resize cache. Intended to run via a CronJob spec.
# FRONT_URL may be overridden via the environment.
FRONT_URL="${FRONT_URL:-https://app.exampleproject.test/}"

echo "==> [$(date +%H:%M:%S)] waiting on readiness"
ELAPSED_SECONDS=0
while : ; do
  ELAPSED_SECONDS=$((ELAPSED_SECONDS + 2))

  # curl exits non-zero while the app is still starting; without '|| true'
  # pipefail would propagate that into the assignment and 'set -e' would
  # kill the script during the very window this loop is meant to ride out.
  RESPONSE_CODE="$(curl -sI "${FRONT_URL}" 2>/dev/null | head -n1 | awk '{print $2}' || true)"

  # String comparison with a safe default: RESPONSE_CODE is empty until the
  # app answers, and '-eq' on an empty/non-numeric value is an error.
  if [ "${RESPONSE_CODE:-000}" = "200" ] || [ "${ELAPSED_SECONDS}" -gt 1800 ]; then
    break
  fi
  printf "."
  sleep 2
done
echo

# Level 1: collect all .html links on the front page.
echo "==> [$(date +%H:%M:%S)] ${FRONT_URL}"
URL_LIST="$(curl -s "${FRONT_URL}" | grep -Eo 'href="[^\"]+"' \
    | grep -Eo '(http|https)://[^#"]+' | grep '\.html' | sort -n | uniq || true)"$'\n'

# Level 2: append links from each level-1 page, keeping only plain pages
# plus '?p=' (pagination) and '?color=' (swatch filter) variants. The loop's
# word list was expanded once, so these appends do not extend the iteration.
for url in ${URL_LIST}; do
    echo "==> [$(date +%H:%M:%S)] ${url}"
    URL_LIST="${URL_LIST}$(curl -s "${url}" | grep -Eo 'href="[^\"]+"' \
        | grep -Eo '(http|https)://[^#"]+' | grep '\.html' \
        | grep -E '(\?p=|\?color=|^[^\?]+$)' || true)"$'\n'
done

# De-duplicate, then warm each page and the product images it references.
URL_LIST="$(echo "${URL_LIST}" | sort -n | uniq)"
for url in ${URL_LIST}; do
    echo "==> [$(date +%H:%M:%S)] ${url}"
    # '|| true': pages with no cache images make grep exit non-zero.
    IMG_LIST="$(curl -s "${url}" | grep -Eo 'src="[^\"]+"' \
        | grep -Eo '(http|https)://[^#"]+' | grep 'catalog/product/cache' || true)"$'\n'
    # Fetch this page's images in parallel; a distinct loop variable keeps
    # the outer page 'url' from being clobbered.
    for img in $(echo "${IMG_LIST}" | sort -n | uniq); do
        echo "==> [$(date +%H:%M:%S)] ${img}"
        curl -s "${img}" >/dev/null || true &
    done
    wait
done
echo "==> [$(date +%H:%M:%S)] crawl complete"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example ad-hoc usage: