Retrieves all the HTML for all active state archives of the Humanae project (at Tumblr) which the Wayback Machine archived until 2019-07-26, parses the large portrait JPG URLs out of all that, and retrieves all the large JPGs into a folder.
# DESCRIPTION
# Retrieves all the HTML for all active state archives of the Humanae project (at Tumblr) which the Wayback Machine archived until 2019-07-26, parses the large portrait JPG URLs out of all that, and retrieves all the large JPGs into a folder. The result is 3,326 portrait images. At this writing, whether any of those are duplicates has not been determined. The JPG URLs parsed out of the HTML source files include some identical file names at different URLs, and partial analysis suggests those are in fact the same files at different web locations in Tumblr. Also, at this writing all the image URLs are still live, although the links to them at the Tumblr blog are down. Pulling images out of the Wayback Machine, if that ever becomes necessary, might be more difficult.
# DEPENDENCIES
# Ruby, the wayback_machine_downloader gem, and (if you're on Windows) MSYS2. On other platforms, a Bash environment with the GNU/Linux core utilities this script uses. Possibly other things that Ruby may need on platforms other than Windows.
# USAGE
# Install the necessary dependencies and run this script from a Bash/MSYS2 environment.
# To bypass prompts to wipe / recreate target directories, pass any parameter to the script:
# get_Humanae_large_JPGs_from_Wayback_Machine.sh FOO
# To run normally, run without any parameter:
# get_Humanae_large_JPGs_from_Wayback_Machine.sh
# Intermediary HTML is placed in a new ./wayback_machine_html folder. The final JPG collection is placed in a _collected_jpgs folder.
# CODE
if [ -d wayback_machine_html ] && [ ! "$1" ]
then
    read -p "The wayback_machine_html directory already exists. Wipe it and recreate it (Y/N)? " USERINPUT
    if [ "$USERINPUT" == "Y" ] || [ "$USERINPUT" == "y" ]
    then
        rm -rf wayback_machine_html
    fi
fi
# Create the expected subdirectory if it doesn't exist.
if [ ! -d wayback_machine_html ]
then
    mkdir wayback_machine_html
fi
# Retrieve all archives from the start date to the date of the last snapshot before the Tumblr blog's pages were removed:
wayback_machine_downloader humanae.tumblr.com -t 20190726 -f 20120607 -d wayback_machine_html
cd wayback_machine_html
allFileNamesArray=( $(find . -type f -iname "*.*") )
echo "Beginning parsing of files from wayback machine archives for jpg image URLs . ."
# Example commands that work to filter jpg URLs out of a file:
# tr ' ' '\n' < index.html > tmp.txt
# grep -o -h "http[^\"}{]*.jpg" tmp.txt
# Adapting those example commands:
printf "" > ../tmp_url_parsing_25eEK75FJ.txt | |
print "" > ../all_jpgs.txt | |
for fileName in ${allFileNamesArray[@]} | |
do | |
echo parsing file $fileName . . . | |
tr ' ' '\n' < $fileName > ../tmp_url_parsing_25eEK75FJ.txt | |
# this regex gets all jpgs: | |
# grep -o -h "http[^\"}{]*\.jpg" ../tmp_url_parsing_25eEK75FJ.txt >> ../all_jpgs.txt | |
# -- but we only want the large jpgs, which all end with *._1280.jpg; which this gets: | |
grep -o -h "http[^\"}{]*\_1280.jpg" ../tmp_url_parsing_25eEK75FJ.txt >> ../all_large_jpgs.txt | |
done | |
rm ../tmp_url_parsing_25eEK75FJ.txt | |
cd ..
echo "DONE extracting .jpg URLs. They are all in all_large_jpgs.txt. Deduplicating that . . ."
lines=($(<all_large_jpgs.txt))
OIFS="$IFS"
IFS=$'\n'
lines=($(sort <<<"${lines[*]}"))
lines=($(uniq <<<"${lines[*]}"))
IFS="$OIFS"
printf '%s\n' "${lines[@]}" > all_large_jpgs.txt
echo "DONE deduplicating all_large_jpgs.txt."
if [ -d _collected_jpgs ] && [ ! "$1" ]
then
    read -p "The _collected_jpgs directory already exists. Wipe it and recreate it (Y/N)? " USERINPUT
    if [ "$USERINPUT" == "Y" ] || [ "$USERINPUT" == "y" ]
    then
        rm -rf _collected_jpgs
    fi
fi
if [ ! -d _collected_jpgs ]
then
    mkdir _collected_jpgs
fi
echo "Will now retrieve all images from that list, and skip images with duplicate file names . . ." | |
allJPGurls=( $(<all_large_jpgs.txt) ) | |
for jpgURL in ${allJPGurls[@]} | |
do | |
filenameNoPath=${jpgURL##*/} | |
if [ ! -f ./_collected_jpgs/$filenameNoPath ] | |
then | |
echo retrieving $jpgURL . . . | |
wget $jpgURL | |
mv "$filenameNoPath" ./_collected_jpgs/"$possibleDuplicateFileTag""$filenameNoPath" | |
else | |
echo "Will not re-retrieve nor clobber target file ./_collected_jpgs/$filenameNoPath, which already exists. Skip." | |
fi | |
done | |
echo "DONE. Collected jpgs are in the ./_collected_jpgs directory. RESPECT THE COPYRIGHT OWNER and only do things like post shrunk (fair use) copies of them anywhere, or only use the images for analysis etc." |
A note from extracting the background colors of all these images versus many other images of the project found elsewhere on the web: statistically, this Tumblr snapshot has a lot more lighter skin colors, while the images I get from elsewhere have a lot more darker skin colors.
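For anyone repeating that kind of analysis, here is a minimal sketch of how a background color sample might be taken from each collected image with ImageMagick (assumptions: ImageMagick's convert is installed, the images are in the _collected_jpgs folder the script produces, and a pixel near the top-left corner is representative of the uniform background):

for f in _collected_jpgs/*.jpg; do
  # print the file name and the color of the pixel at (10,10), e.g. srgb(231,200,175)
  convert "$f" -format "%f %[pixel:p{10,10}]\n" info:
done > background_colors.txt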
Here's a link to a hosted flat list of the resulting image URLs. It contains many images duplicated about three times at different URLs (trimming the URLs down to bare file names leaves about 3,000 unique file names, and in tests downloading them, the same file name at different URLs does appear to be the same file).
https://earthbound.io/data/dist/Humanae_Tumblr_recovered_jpeg_URLs_to_2019_07_26.txt
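If it ever becomes important to confirm that, here is a minimal sketch of a spot check, assuming the list has been saved locally as all_large_jpgs.txt and using a placeholder file name (SOME_NAME_1280.jpg stands in for any file name that appears at more than one URL):

# fetch every URL in the list whose tail is the same file name and checksum each download;
# identical checksums mean the different URLs serve the same image
grep 'SOME_NAME_1280.jpg' all_large_jpgs.txt | while read -r url; do
  wget -qO- "$url" | md5sum
done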