Last active
December 19, 2024 01:12
-
-
Save IGLOU-EU/d6b30237010aec797bf4919a7b186d86 to your computer and use it in GitHub Desktop.
✨ A simple script to backup your Midjourney archives by downloading JSON data and images. Easy to set up with clear instructions, it ensures your creative work is safely stored. Contributions and feedback are welcome!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
## LICENCE ## | |
# Copyright (C) 2024 Kara Adrien | |
# This program is free software: you can redistribute it and/or modify | |
# it under the terms of the GNU General Public License as published by | |
# the Free Software Foundation, either version 3 of the License, or | |
# (at your option) any later version. | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU General Public License for more details. | |
# You should have received a copy of the GNU General Public License | |
# along with this program. If not, see <https://www.gnu.org/licenses/>. | |
## DOCUMENTATION ## | |
# This script automates the backup of your Midjourney archives by downloading | |
# JSON data and associated images. | |
# | |
# It should work on macOS and Windows using a Linux-like terminal (e.g. WSL) | |
# but it hasn't been tested on these platforms. | |
# | |
# NOTE: The script can take a significant amount of time to complete, depending on the number of images to download. | |
# | |
# REQUIREMENTS: | |
# - Bash shell | |
# - curl command-line tool | |
# - jq for processing JSON data | |
# - awk for generating random sleep intervals | |
# | |
# SPEC: | |
# - Includes a random sleep interval to avoid being blocked by the server. | |
# - Supports resuming by skipping already downloaded images. | |
# - Handles different job types, including upscaler jobs. | |
# | |
# USAGE INSTRUCTIONS: | |
# 1. Download the script to your local machine. | |
# 2. Set your configuration (Look at the next section) | |
# 3. Set the root directory for backups in DIR_ROOT if needed. | |
# 4. Run the script: ./backup.sh | |
# | |
## SET YOUR CONFIGURATION ##
# Go to https://www.midjourney.com/profile-settings
# and copy the "Midjourney ID" into USER_ID.
USER_ID=''
# To retrieve cookies:
# Note: The process should be similar for Chromium-based browsers.
#
# 1. Log in to Midjourney and navigate to the archives page: https://www.midjourney.com/archive
# 2. Right-click on the page and select "Inspect" to open Developer Tools.
# 3. Go to the "Storage" tab.
# 4. Find and copy the values of "__Host-Midjourney.AuthUserTokenV3_i"
#    and "__Host-Midjourney.AuthUserTokenV3_r".
# 5. Paste these values into the USER_COOKIES_AuthUserTokenV3_i and
#    USER_COOKIES_AuthUserTokenV3_r variables respectively.
USER_COOKIES_AuthUserTokenV3_i=''
USER_COOKIES_AuthUserTokenV3_r=''
# Set the root directory for the backup.
DIR_ROOT="."
# Directory for the downloaded JSON archive pages, under the root.
DIR_JSON="${DIR_ROOT}/prompt"
# Directory for the downloaded images, under the root.
DIR_IMAGES="${DIR_ROOT}/images"
## THIS IS THE SOFTWARE, DON'T TOUCH IT ##
# Abort on the first unhandled command failure.
# NOTE(review): consider 'set -euo pipefail' for stricter failure handling —
# verify the jq pipelines below behave under pipefail before changing.
set -e
# Build the archive-API URL for the configured user.
# $1 - pagination cursor; pass an empty string for the first page.
# Prints the full URL on stdout.
function build_url() {
    local next="$1"
    local suffix=""
    # Only append the cursor parameter when a cursor was supplied.
    [[ -n $next ]] && suffix="&cursor=${next}"
    echo "https://www.midjourney.com/api/pg/thomas-jobs?user_id=${USER_ID}&page_size=1000${suffix}"
}
# Download one page of the user's job archive (JSON) to a local file.
# $1 - output file path
# $2 - archive API URL (from build_url)
# Exits the whole script on failure: without this page the pagination
# cursor cannot be read, so continuing would be pointless.
function dl_dataset() {
    local out="$1"
    local url="$2"

    # --fail makes curl return non-zero on HTTP errors (4xx/5xx); without it
    # an expired-cookie error page would be saved as if it were valid JSON
    # and only blow up later in jq. (dl_image already uses --fail.)
    if ! curl --fail "$url" --compressed \
        -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0' \
        -H 'Accept: */*' \
        -H 'Accept-Language: fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3' \
        -H 'Accept-Encoding: gzip, deflate' \
        -H 'Referer: https://www.midjourney.com/archive' \
        -H 'Content-Type: application/json' \
        -H 'X-CSRF-Protection: 1' \
        -H 'DNT: 1' \
        -H 'Sec-GPC: 1' \
        -H 'Alt-Used: www.midjourney.com' \
        -H 'Connection: keep-alive' \
        -H "Cookie: __Host-Midjourney.AuthUserTokenV3_i=${USER_COOKIES_AuthUserTokenV3_i}; __Host-Midjourney.AuthUserTokenV3_r=${USER_COOKIES_AuthUserTokenV3_r}" \
        -H 'Sec-Fetch-Dest: empty' \
        -H 'Sec-Fetch-Mode: no-cors' \
        -H 'Sec-Fetch-Site: same-origin' \
        -H 'Priority: u=4' \
        -H 'Pragma: no-cache' \
        -H 'Cache-Control: no-cache' \
        -H 'TE: trailers' \
        -o "$out"; then
        echo "❌ Failed to download dataset from $url" >&2
        exit 1
    fi
}
# Fetch a single image, skipping it when already present (resume support).
# $1 - destination file path
# $2 - image URL
# A failed transfer is reported and the partial file removed, but the
# function returns normally so the overall backup keeps going (best-effort).
function dl_image() {
    local dest="$1"
    local src="$2"

    # Resume support: never re-download an existing file.
    if [ -e "$dest" ]; then
        echo "=> Already downloaded $dest"
        return
    fi

    echo "=> Download $src"
    # Random 0.1-3.0 s pause to avoid being rate-limited by the server.
    sleep "$(awk -v min=0.1 -v max=3.0 'BEGIN{srand(); print min+rand()*(max-min)}')"

    local headers=(
        -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0'
        -H 'Referer: https://www.midjourney.com/archive'
        -H 'X-CSRF-Protection: 1'
        -H 'DNT: 1'
        -H 'Sec-GPC: 1'
        -H 'Alt-Used: www.midjourney.com'
        -H 'Connection: keep-alive'
        -H 'Sec-Fetch-Dest: empty'
        -H 'Sec-Fetch-Mode: no-cors'
        -H 'Sec-Fetch-Site: same-origin'
        -H 'Priority: u=4'
        -H 'Pragma: no-cache'
        -H 'Cache-Control: no-cache'
        -H 'TE: trailers'
    )
    if ! curl -s --fail "$src" "${headers[@]}" -o "$dest"; then
        echo "❌ Failed to download image from $src" >&2
        # Drop whatever partial file curl may have left behind.
        rm -f "$dest"
    fi
}
# Walk every job entry in one JSON archive page and download its images.
# $1 - path to a JSON dataset file produced by dl_dataset.
# For each job: optionally fetch the parent grid image, skip "virtual"
# job types, then fetch the job's batch of images in parallel with a
# wait barrier per job.
function dl_images() {
    local file="$1"
    # NB: the while body runs in a pipeline subshell, so nothing assigned
    # inside it survives the loop (nothing is needed afterwards).
    jq -c '.data[]' "$file" | while read -r item; do
        local item_id
        item_id="$(echo "$item" | jq -r '.id')"
        echo "[$item_id]"
        local job_type
        job_type="$(echo "$item" | jq -r '.job_type')"
        local enqueue_time
        enqueue_time="$(echo "$item" | jq -r '.enqueue_time')"
        local unix_time
        # Epoch seconds used to prefix image filenames (chronological sort).
        # NOTE(review): 'date -d' is GNU-specific; macOS/BSD needs 'gdate'
        # or 'date -j -f' despite the header claiming macOS support — confirm.
        unix_time=$(date -d "$enqueue_time" +%s)
        local batch_size
        batch_size="$(echo "$item" | jq -r '.batch_size')"
        local parent_id parent_grid
        parent_id="$(echo "$item" | jq -r '.parent_id')"
        parent_grid="$(echo "$item" | jq -r '.parent_grid')"
        # Jobs derived from a grid (e.g. upscales) reference a parent image;
        # jq prints the literal string "null" for missing fields, hence the
        # explicit "null" checks alongside the emptiness checks.
        if [ -n "$parent_id" ] && [ "$parent_id" != "null" ] && [ -n "$parent_grid" ] && [ "$parent_grid" != "null" ]; then
            image_url="https://cdn.midjourney.com/${parent_id}/0_${parent_grid}.png"
            output_image="${DIR_IMAGES}/parent_${parent_id}_${parent_grid}.png"
            echo "==> [Parent]> "
            dl_image "$output_image" "$image_url"
            echo "<== <[Parent] "
        fi
        # Virtual jobs have no downloadable image of their own.
        case $job_type in
        *_virtual_*)
            echo "=> Skip virtual job $item_id - $job_type"
            continue
            ;;
        esac
        # Fan out one download per image in the batch, then wait for all of
        # them before moving to the next job.
        # NOTE(review): assumes batch_size is a numeric field — if jq yields
        # "null" the arithmetic treats it as 0 and the seq produces nothing.
        for batch in $(seq 0 "$((batch_size - 1))"); do
            image_url="https://cdn.midjourney.com/${item_id}/0_${batch}.png"
            output_image="${DIR_IMAGES}/${unix_time}_${item_id}_${batch}.png"
            dl_image "$output_image" "$image_url" &
        done
        wait
    done
}
# Preflight: verify every external tool this script relies on is installed.
required_commands=("jq" "magick" "curl" "awk")
missing=()
for cmd in "${required_commands[@]}"; do
    command -v "$cmd" &>/dev/null || missing+=("$cmd")
done
if ((${#missing[@]})); then
    echo "❌ The following required commands are not installed:" >&2
    printf ' - %s\n' "${missing[@]}"
    echo "Please install them and try again." >&2
    exit 1
fi

# The user must have filled in the configuration section above.
if [[ -z $USER_ID || -z $USER_COOKIES_AuthUserTokenV3_i || -z $USER_COOKIES_AuthUserTokenV3_r ]]; then
    echo "😱 Hoho, you forgot to configure something important!" >&2
    echo "⚠️ Read the instruction at the top of this script." >&2
    exit 1
fi

# Make sure both output directories exist before downloading anything.
if ! mkdir -p "$DIR_JSON" "$DIR_IMAGES"; then
    echo "❌ Error: Failed to create directories." >&2
    exit 1
fi
# Page through the archive API until the server stops returning a cursor.
# Each page is saved as its own JSON file, then its images are downloaded.
nb=1
cursor="first" # sentinel so the loop body runs at least once
while [ "$cursor" != "" ]; do
    if [[ $cursor == "first" ]]; then
        cursor="" # first request carries no cursor parameter
    fi
    url="$(build_url "$cursor")"
    file="${DIR_JSON}/user_${USER_ID}-ID${nb}.json"
    dl_dataset "$file" "$url"
    # '.cursor // empty' yields "" on the last page, which ends the loop.
    if ! cursor="$(jq -r '.cursor // empty' "$file")"; then
        # Fixed garbled user-facing message ("Outch ... that bad ... latter").
        echo "❌ Error: Ouch, there is a problem with the JSON file, that's bad — try again later." >&2
        break
    fi
    dl_images "$file"
    # Plain assignment instead of ((nb++)): arithmetic commands return a
    # non-zero status when the expression evaluates to 0, which would trip
    # 'set -e' (harmless here since nb starts at 1, but fragile).
    nb=$((nb + 1))
done
# Post-pass: detect and delete images that did not download completely,
# so a re-run of the script fetches them again.
echo "Checking if there is corrupted images"
nb=0
while read -r img; do
    # When find matches nothing, the <<< here-string still supplies one
    # empty line; skip it instead of running magick/rm on "".
    [ -n "$img" ] || continue
    if ! magick "$img" null: 2>/dev/null; then
        echo "=> Removal of a likely corrupted image $(basename "$img")"
        rm "$img"
        # Was ((nb++)): with nb=0 that expression evaluates to 0, the
        # arithmetic command returns status 1, and 'set -e' killed the
        # script on the FIRST corrupted image. Plain assignment is safe.
        nb=$((nb + 1))
    fi
done <<<"$(find "$DIR_IMAGES" -type f)"
cat <<EOF
🎉 You have successfully downloaded $(find "$DIR_IMAGES" -type f | wc -l) images! 📸
🗂️ For a total size of $(du -sh "$DIR_IMAGES" | cut -f1) 💾
💥 And $nb images failed the integrity check 💣
🔄 You can run the script as many times as you want to retry.
EOF
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment