Created
March 27, 2023 18:54
-
-
Save jeremydw/13f9c596c25b1a505414f8c85d8e1abf to your computer and use it in GitHub Desktop.
Download all JSON files from a page and reupload them to GCS with compression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash -e
# Reuploads files to Google Cloud Storage and adds gzip compression and 60
# second cache headers.
#
# Usage:
# ./urls.sh https://sites.research.google/relate/ > urls.txt
# ./reupload.sh urls.txt
# Convert a public storage.googleapis.com HTTPS URL to its gs:// form.
# Arguments: $1 - https://storage.googleapis.com/<bucket>/<object> URL
# Outputs:   the equivalent gs://<bucket>/<object> URL on stdout
function get_gs_url() {
  local public_url="$1"
  local object_path
  # Drop the HTTPS host prefix, then re-prefix with the gs:// scheme.
  object_path=$(sed 's|https://storage.googleapis.com/||' <<<"$public_url")
  printf 'gs://%s\n' "$object_path"
}
# Download one object from GCS, then re-upload it to the same location with
# gzip compression (-Z), a public-read ACL, a 60-second cache header, and a
# JSON content type.
# Globals:   calls get_gs_url
# Arguments: $1 - public https://storage.googleapis.com/... URL
# Outputs:   progress lines on stdout; gsutil's own output/stderr
function download_and_gzip() {
  local url="$1"
  local gs_url temp_filename
  # Hoist the URL conversion: the original recomputed it three times.
  # Declaration is separated from assignment so a failure in the command
  # substitution is not masked by 'local' always succeeding.
  gs_url=$(get_gs_url "$url")
  temp_filename="$(basename "$url").tmp"
  # Download the object to a local temporary file.
  gsutil cp "$gs_url" "$temp_filename"
  echo "Source URL: $url"
  echo "Destination URL: $gs_url"
  # Upload the compressed file back to the same location.
  gsutil -m \
    -h "Cache-Control:public, max-age=60" \
    -h "Content-Type: application/json" \
    cp -a public-read -Z "$temp_filename" "$gs_url"
  # Clean up the temporary file; '--' guards against filenames starting
  # with '-'. NOTE(review): if a gsutil call fails, -e aborts the script
  # and the temp file is left behind — same behavior as the original.
  rm -- "$temp_filename"
}
# Loop through each URL in the input file ($1) and re-upload the matching
# objects with compression. IFS= preserves leading/trailing whitespace and
# '|| [[ -n "$url" ]]' still processes a final line that lacks a trailing
# newline (the original silently dropped it).
while IFS= read -r url || [[ -n "$url" ]]; do
  # Only process URLs that start with https://storage.googleapis.com/
  if [[ "$url" == "https://storage.googleapis.com/"* ]]; then
    download_and_gzip "$url"
  fi
done < "$1"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash -e
# Outputs all resources that end in .json from a given page.
#
# Usage:
# ./urls.sh [url]
#
# Example:
# ./urls.sh https://sites.research.google/relate/
# Fetch the page given as $1 and print each distinct .json resource URL.
response=$(curl -s "$1")
# grep exits non-zero when there are no matches; under the shebang's -e
# flag that would abort the script with a failure status, so '|| true'
# turns "no JSON URLs found" into empty (successful) output.
json_urls=$(grep -oE 'http[^[:space:]]*\.json' <<<"$response" || true)
# 'sort -u' replaces the 'sort | uniq' pipeline; printf is used instead of
# echo for arbitrary data.
printf '%s\n' "$json_urls" | sort -u
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment