pirate · May 4, 2023 05:26
diff --git a/imgur_backup.sh b/imgur_backup.sh
 #!/usr/bin/env bash
 # imgur_backup.sh
 # Backup all Imgur URLs found in a given directory using ripgrep + wget. Searches all text files, binary files, PDFs, database dumps, etc. for any imgur URLs and downloads them into a local directory.
 #
 # Note: make sure you apt/brew install ripgrep first, and replace grep with ggrep (brew install grep) on macOS
 # Usage:
 #
 #     $ mkdir db_dumps
 #     $ docker-compose exec postgres env PGPASSWORD=somepassword pg_dump -U someuser somedb > ./db_dumps/db_dump.sql
 #     $ bash imgur_backup.sh db_dumps
 #     
 #     
 #     [1/2] Finding all the imgur URLs in ./db_dumps and saving to ./imgur
 #     
 #     71.7KiB 0:00:01 [63.1KiB/s] [        <=>                                                                                                                                                                                                                      ]
 #     
 #         found URLs:      826 ./imgur/urls.txt
 #     
 #     
 #     [2/2] Downloading images that haven't been downloaded yet...
 #     
 #     ...............................................................................................................................................................................................................................................√.............⏎
 #
 #     $ open ./imgur/urls.txt
 #     $ open ./imgur



 # Create the target directory for downloaded images
 SEARCH_DIR="${1:-$PWD}"
 IMGUR_DIR="$PWD/imgur"
 mkdir -p $IMGUR_DIR

 # Create a log file to record the source filename and Imgur URL found within
 LOG_FILE="$IMGUR_DIR/urls.txt"
 echo "" > "$LOG_FILE"

 echo
 echo "[1/2] Finding all the imgur URLs in $SEARCH_DIR and saving to $IMGUR_DIR"
 echo

 # Find all Imgur URLs, including in binary files
 rg -a -i -o -P --no-ignore -g "!$IMGUR_DIR" \
  'https?://\S+\.imgur\.com/[a-zA-Z0-9]+\.[a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9]?' "$SEARCH_DIR" \
  | pv \
  | perl -ne \
    'while (/^(.*?):(.*?)$/g) { my $filename = $1; my $content = $2; while ($content =~ /(https?:\/\/[a-zA-Z0-9._-]+\.imgur\.com\/[a-zA-Z0-9]+\.[a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9]?)/g) { print "FILE: $filename       IMG: $1\n" } }' \
  | grep -v '^FILE: .+       IMG: (.+)\.(xml)|(json)|(html)|(orig)$' \
  | uniq \
  > "$LOG_FILE"

 echo
 echo "    found URLs: $(wc -l "$LOG_FILE")"
 echo

 echo
 echo "[2/2] Downloading images that haven't been downloaded yet..."
 echo


 IFS=$'\n'
 for line in $(sort "$LOG_FILE" | uniq); do
  
  # echo "$line"
  source_file=$(echo "$line" | perl -pe 's/^FILE: (.+)       IMG: .*$/$1/')
  imgur_url=$(echo "$line" | perl -pe 's/^FILE: .+       IMG: (.+)$/$1/')

  # download mp4 versions of any gifv urls
  imgur_url=$(echo "$imgur_url" | perl -pe 's/\.gifv/.mp4/gm')

  # Download the image if it doesn't already exist
  target_file="$IMGUR_DIR/$(basename "$imgur_url")"

  if [ ! -f "$target_file" ]; then
    # echo "Downloading $imgur_url to $target_file   (found in $source_file)"
    wget -c -q --output-document="$target_file" "$imgur_url" && echo -n "√" || echo -e "\n x $imgur_url (from $source_file)"
  else
    echo -n '.'
  fi
 done
	#!/usr/bin/env bash
	# imgur_backup.sh
	# Backup all Imgur URLs found in a given directory using ripgrep + wget. Searches all text files, binary files, PDFs, database dumps, etc. for any imgur URLs and downloads them into a local directory.
	#
	# Note: make sure you apt/brew install ripgrep first, and replace grep with ggrep (brew install grep) on macOS
	# Usage:
	#
	# $ mkdir db_dumps
	# $ docker-compose exec postgres env PGPASSWORD=somepassword pg_dump -U someuser somedb > ./db_dumps/db_dump.sql
	# $ bash imgur_backup.sh db_dumps
	#
	#
	# [1/2] Finding all the imgur URLs in ./db_dumps and saving to ./imgur
	#
	# 71.7KiB 0:00:01 [63.1KiB/s] [ <=> ]
	#
	# found URLs: 826 ./imgur/urls.txt
	#
	#
	# [2/2] Downloading images that haven't been downloaded yet...
	#
	# ...............................................................................................................................................................................................................................................√.............⏎
	#
	# $ open ./imgur/urls.txt
	# $ open ./imgur



	# Create the target directory for downloaded images
	SEARCH_DIR="${1:-$PWD}"
	IMGUR_DIR="$PWD/imgur"
	mkdir -p $IMGUR_DIR

	# Create a log file to record the source filename and Imgur URL found within
	LOG_FILE="$IMGUR_DIR/urls.txt"
	echo "" > "$LOG_FILE"

	echo
	echo "[1/2] Finding all the imgur URLs in $SEARCH_DIR and saving to $IMGUR_DIR"
	echo

	# Find all Imgur URLs, including in binary files
	rg -a -i -o -P --no-ignore -g "!$IMGUR_DIR" \
	'https?://\S+\.imgur\.com/[a-zA-Z0-9]+\.[a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9]?' "$SEARCH_DIR" \
	\| pv \
	\| perl -ne \
	'while (/^(.?):(.?)$/g) { my $filename = $1; my $content = $2; while ($content =~ /(https?:\/\/[a-zA-Z0-9._-]+\.imgur\.com\/[a-zA-Z0-9]+\.[a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9][a-zA-Z0-9]?)/g) { print "FILE: $filename IMG: $1\n" } }' \
	\| grep -v '^FILE: .+ IMG: (.+)\.(xml)\|(json)\|(html)\|(orig)$' \
	\| uniq \
	> "$LOG_FILE"

	echo
	echo " found URLs: $(wc -l "$LOG_FILE")"
	echo

	echo
	echo "[2/2] Downloading images that haven't been downloaded yet..."
	echo


	IFS=$'\n'
	for line in $(sort "$LOG_FILE" \| uniq); do

	# echo "$line"
	source_file=$(echo "$line" \| perl -pe 's/^FILE: (.+) IMG: .*$/$1/')
	imgur_url=$(echo "$line" \| perl -pe 's/^FILE: .+ IMG: (.+)$/$1/')

	# download mp4 versions of any gifv urls
	imgur_url=$(echo "$imgur_url" \| perl -pe 's/\.gifv/.mp4/gm')

	# Download the image if it doesn't already exist
	target_file="$IMGUR_DIR/$(basename "$imgur_url")"

	if [ ! -f "$target_file" ]; then
	# echo "Downloading $imgur_url to $target_file (found in $source_file)"
	wget -c -q --output-document="$target_file" "$imgur_url" && echo -n "√" \|\| echo -e "\n x $imgur_url (from $source_file)"
	else
	echo -n '.'
	fi
	done