Created
June 12, 2020 05:33
-
-
Save KalleVuorjoki/123b264f6ce62f5ecb8907795c240bde to your computer and use it in GitHub Desktop.
Script to crawl Drupal JSON:API endpoint to local filesystem.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
##
# Script to crawl Drupal JSON:API endpoint to local filesystem.
#
# Assumes endpoint to be /jsonapi and uses basic auth.
#
# Requirements:
# ag - https://github.com/ggreer/the_silver_searcher
# jq - https://github.com/stedolan/jq
# curl - used to fetch every JSON document
##
echo "In order to run this script you need to change REPLACE_DOMAIN to your domain"

# Root URL of the JSON:API endpoint; every crawled URL starts with it.
API_ROOT_URL="http://REPLACE_DOMAIN/jsonapi"

# Credentials in the "user:password" form expected by `curl -u`.
BASIC_AUTH="username:password"

# Directory the script was started from; downloads are written to the
# "content" directory relative to it.
OUT_DIR=$(pwd)
#######################################
# Download one JSON document and store it pretty-printed on disk.
# Globals:
#   BASIC_AUTH (read) - "user:password" handed to `curl -u`
#   OUT_DIR (read)    - only used in the progress message
# Arguments:
#   $1 = Absolute url
#   $2 = relative path; the document is written to content$2/index.json
# Outputs:
#   Progress messages on stdout, a failure message on stderr.
# Returns:
#   0 on success, 1 if the directory cannot be created or curl/jq fails.
#######################################
download_json() {
  local url=$1
  local target_dir="content$2"
  local -a stage_status

  echo "Creating directory ${target_dir}"
  # Quoted on purpose: paths taken from URLs may contain "?", "[" or spaces.
  mkdir -p -- "${target_dir}" || return 1

  echo "Downloading ${url} to ${OUT_DIR}/${target_dir}/index.json"
  curl -u "${BASIC_AUTH}" -sS "${url}" | jq . > "${target_dir}/index.json"
  # A pipeline only reports the status of its last stage, so look at both
  # stages to notice a failed transfer as well as unparsable JSON.
  stage_status=("${PIPESTATUS[@]}")
  if (( stage_status[0] != 0 || stage_status[1] != 0 )); then
    echo "Failed to download ${url}" >&2
    return 1
  fi
}
# Get the root index first. Nothing else can be crawled without it, so stop
# right away when it cannot be fetched.
if ! download_json "$API_ROOT_URL" /index; then
  echo "Could not fetch the root index from $API_ROOT_URL" >&2
  exit 1
fi

# Fetch every URL listed under .links[].href of the root document. The part of
# the URL after the API root becomes the directory below content/.
jq -r '.links[].href' content/index/index.json | while IFS= read -r API_PATH; do
  RELATIVE_PATH=${API_PATH#"$API_ROOT_URL"}
  download_json "$API_PATH" "$RELATIVE_PATH"
done
# @todo Loop for paginated content
# Finally fetch all content.
# The search pattern is built from API_ROOT_URL, so the domain is configured
# in one place only. Dots are escaped because ag treats the pattern as a regex.
URL_PATTERN=$(printf '%s' "$API_ROOT_URL" | sed 's/\./\\./g')

# sort -u drops the duplicate links, so each URL is downloaded once. It also
# emits nothing until ag has finished, which keeps ag from scanning files that
# this loop is writing at the same time.
ag --nofilename --no-numbers --ignore-case --only-matching --silent \
  "${URL_PATTERN}[^\"]+" content | LC_ALL=C sort -u | while IFS= read -r VALUE; do
  if [[ -z "$VALUE" ]]; then
    # Nothing to download for an empty line; only report it.
    echo "\$VALUE is empty"
  else
    RELATIVE_PATH=${VALUE#"$API_ROOT_URL"}
    download_json "$VALUE" "$RELATIVE_PATH"
  fi
done
#######################################
# Replace every occurrence of a host name in the files below a directory.
# Arguments:
#   $1 - directory to search
#   $2 - host name to replace; must not contain "/"
#   $3 - replacement text; must not contain "/" or "&"
# Returns:
#   0 on success, 1 when the host is empty or no scratch file can be created.
#######################################
replace_host_in_files() {
  local search_dir=$1
  local old_host=$2
  local new_host=$3
  local old_pattern scratch_file matched_file

  if [[ -z "$old_host" ]]; then
    echo "No host name given, nothing replaced" >&2
    return 1
  fi

  # Escape the dots so sed matches the host literally, not as a regex.
  old_pattern=$(printf '%s' "$old_host" | sed 's/\./\\./g')
  scratch_file=$(mktemp) || return 1

  # NUL-separated names keep paths with spaces or glob characters intact.
  ag -l -0 --literal "$old_host" "$search_dir" |
    while IFS= read -r -d '' matched_file; do
      # Edit through a scratch file: `sed -i` takes different arguments on
      # GNU and BSD sed, so in-place editing is not portable.
      if sed "s/${old_pattern}/${new_host}/g" "$matched_file" > "$scratch_file"; then
        cat "$scratch_file" > "$matched_file"
      fi
    done
  rm -f -- "$scratch_file"
}

# Count regular files only. `ls -R` output also contains directory headers,
# "total" lines and blank separators, which inflated the number.
FILE_COUNT=$(find content -type f | wc -l | tr -d '[:space:]')
echo "Downloaded totally $FILE_COUNT files"

# Point the downloaded documents at the local server. The host is taken from
# API_ROOT_URL (falling back to the placeholder), and only content/ is
# searched so the script itself is never rewritten.
SOURCE_HOST=${API_ROOT_URL:-http://REPLACE_DOMAIN/jsonapi}
SOURCE_HOST=${SOURCE_HOST#*://}
SOURCE_HOST=${SOURCE_HOST%%/*}
replace_host_in_files content "$SOURCE_HOST" 'localhost:3000'
echo "All done"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment