#!/usr/bin/env bash
# imgur_backup.sh
# Backup all Imgur URLs found in a given directory using ripgrep + wget. Searches all text files, binary files, PDFs, database dumps, etc. for any imgur URLs and downloads them into a local directory.
#
# Note: make sure you install ripgrep, pv, and wget first (apt/brew), and replace grep with ggrep (brew install grep) on macOS
# Usage:
#
# $ mkdir db_dumps
# $ docker-compose exec postgres env PGPASSWORD=somepassword pg_dump -U someuser somedb > ./db_dumps/db_dump.sql
# $ bash imgur_backup.sh db_dumps
#
#
# [1/2] Finding all the imgur URLs in ./db_dumps and saving to ./imgur
#
# 71.7KiB 0:00:01 [63.1KiB/s] [ <=> ]
#
#  found URLs: 826 ./imgur/urls.txt
#
#
# [2/2] Downloading images that haven't been downloaded yet...
#
# ................................................................................√.............
#
# $ open ./imgur/urls.txt
# $ open ./imgur
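# A minimal sketch of installing the dependencies (package names are assumptions;
# adjust for your platform):
#
#   $ sudo apt install ripgrep pv wget    # Debian/Ubuntu
#   $ brew install ripgrep pv wget grep   # macOS (GNU grep installs as ggrep)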
# Create the target directory for downloaded images
SEARCH_DIR="${1:-$PWD}"
IMGUR_DIR="$PWD/imgur"
mkdir -p "$IMGUR_DIR"
# Create a log file to record the source filename and Imgur URL found within
LOG_FILE="$IMGUR_DIR/urls.txt"
: > "$LOG_FILE"
echo
echo "[1/2] Finding all the imgur URLs in $SEARCH_DIR and saving to $IMGUR_DIR"
echo
# Find all Imgur URLs, including in binary files (-a), matching any 3-4 char extension
rg -a -i -o -P --no-ignore -g "!$IMGUR_DIR" \
    'https?://\S+\.imgur\.com/[a-zA-Z0-9]+\.[a-zA-Z0-9]{3,4}' "$SEARCH_DIR" \
    | pv \
    | perl -ne \
        'while (/^(.*?):(.*?)$/g) { my $filename = $1; my $content = $2; while ($content =~ /(https?:\/\/[a-zA-Z0-9._-]+\.imgur\.com\/[a-zA-Z0-9]+\.[a-zA-Z0-9]{3,4})/g) { print "FILE: $filename IMG: $1\n" } }' \
    | grep -vE '\.(xml|json|html|orig)$' \
    | uniq \
    > "$LOG_FILE"
echo
echo " found URLs: $(wc -l "$LOG_FILE")"
echo
echo
echo "[2/2] Downloading images that haven't been downloaded yet..."
echo
IFS=$'\n'
for line in $(sort "$LOG_FILE" | uniq); do
    # echo "$line"
    source_file=$(echo "$line" | perl -pe 's/^FILE: (.+) IMG: .*$/$1/')
    imgur_url=$(echo "$line" | perl -pe 's/^FILE: .+ IMG: (.+)$/$1/')
    # download mp4 versions of any gifv urls
imgur_url=$(echo "$imgur_url" | perl -pe 's/\.gifv/.mp4/gm') | |
    # Download the image if it doesn't already exist
    target_file="$IMGUR_DIR/$(basename "$imgur_url")"
    if [ ! -f "$target_file" ]; then
        # echo "Downloading $imgur_url to $target_file (found in $source_file)"
        wget -c -q --output-document="$target_file" "$imgur_url" && echo -n "√" || echo -e "\n x $imgur_url (from $source_file)"
    else
        echo -n '.'
    fi
done
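# Note: re-running the script is safe. Files already present in ./imgur are
# skipped (printed as "."), and wget -c resumes partial downloads, so an
# interrupted run can simply be restarted.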