J0hnL0cke · January 20, 2025 22:32
diff --git a/mhtml-to-pdf.sh b/mhtml-to-pdf.sh
 # MHTML To PDF Fetcher
 # This short bash script opens .mhtml files, finds the original download url, then uses wkhtmltopdf to download a pdf from that site.
 # Inspired by the lack of support for .mhtml in e-readers and the lack of options for converting them to other formats.

 # Limitations:
 # curl cannot use javascript, and thus can't get past anti-scripting protections that use JS redirects
 # Because of how `find` escapes special characters, file names with special characters may turn out weird
 # Sometimes wkhtmltopdf just gets stuck on a page. Press ctrl+C 3 times to break out of the container and skip to the next one

 # Before using:
 # Install docker
 # Output PDFs are named after each page's <title> text, not the .mhtml file path, so no path-stripping number needs to be set
 # Run in a directory where you have run `mkdir ./out` and have mhtml files in current folder or subfolders
 # To customize where the program searches for files, edit the path used by the `find` command
 # The search directory does not need to be mounted into the docker container, since the container only uses URLs


 #######################################
 # Print the original page URL recorded in an .mhtml file.
 # Reads the first "Snapshot-Content-Location: <url>" line and strips the
 # header name plus any trailing carriage return (MHTML is a MIME format, so
 # header lines commonly end in CRLF).
 # Arguments: $1 - path to the .mhtml file
 # Outputs:   the URL on stdout
 # Returns:   0 if a non-empty URL was found, 1 otherwise
 #######################################
 extract_url() {
   local line
   # -a: treat the file as text even if it embeds binary parts
   line=$(grep -a -m 1 -- 'Snapshot-Content-Location: ' "$1") || return 1
   line=${line##*'Snapshot-Content-Location: '}
   line=${line%$'\r'}
   [[ -n "$line" ]] || return 1
   printf '%s\n' "$line"
 }

 #######################################
 # Print the base name (no directory, no extension) to use for the PDF.
 # Uses the text of the first <title> tag, keeping only letters, digits,
 # spaces, '_', '.' and '-'. Slashes and backslashes are dropped as well, so
 # the result can never point into a sub-directory that does not exist.
 # Falls back to the input file's own base name when no usable title exists.
 # Arguments: $1 - path to the .mhtml file
 # Outputs:   the sanitized name on stdout
 #######################################
 pdf_name() {
   local title base
   # -o prints only the matched text; keep the first match if a line has several
   title=$(grep -a -m 1 -o -- '<title>[^<]*' "$1" | head -n 1)
   title=${title#'<title>'}
   # hyphen goes last in the bracket expression so it is taken literally
   title=$(printf '%s\n' "$title" | LC_ALL=C sed 's/[^a-zA-Z0-9 _.-]//g')
   if [[ -z "${title// /}" ]]; then
     base=${1##*/}
     title=${base%.*}
   fi
   printf '%s\n' "$title"
 }

 #######################################
 # Recursively find every .mhtml file under the current directory and render
 # its original URL to ./out/<name>.pdf with wkhtmltopdf running in docker.
 # File names are read NUL-delimited, so spaces, globs and newlines in paths
 # are handled without touching the global IFS.
 # Outputs: one "processing file" line per file on stdout; skip notices on stderr
 #######################################
 main() {
   local filename url newname
   while IFS= read -r -d '' filename; do
     # without a URL there is nothing to fetch; do not start a container
     if ! url=$(extract_url "$filename"); then
       printf 'skipping %s: no Snapshot-Content-Location header found\n' "$filename" >&2
       continue
     fi

     # path as seen from inside the container (current directory is mounted at /data)
     newname="./data/out/$(pdf_name "$filename").pdf"

     printf '%s\n' "processing file ( $newname ) ($filename) ($url)"

     # make sure the host-side output directory exists
     mkdir -p ./out

     # run wkhtmltopdf in an auto-deleting container with the current directory
     # mounted; an absolute source path is used because some Docker versions
     # reject relative bind-mount sources. stdin is detached so the container
     # cannot consume the file list feeding this loop.
     sudo docker run --rm --volume "$PWD:/data" --user "$(id -u):$(id -g)" \
       madnight/docker-alpine-wkhtmltopdf "$url" "$newname" </dev/null
   done < <(find ./ -iname '*.mhtml' -type f -print0)
 }

 main "$@"
	# MHTML To PDF Fetcher
	# This short bash script opens .mhtml files, finds the original download url, then uses wkhtmltopdf to download a pdf from that site.
	# Inspired by the lack of support for .mhtml in e-readers and the lack of options for converting them to other formats.

	# Limitations:
	# curl cannot use javascript, and thus can't get past anti-scripting protections that use JS redirects
	# Because of how `find` escapes special characters, file names with special characters may turn out weird
	# Sometimes wkhtmltopdf just gets stuck on a page. Press ctrl+C 3 times to break out of the container and skip to the next one

	# Before using:
	# Install docker
	# Output PDFs are named after each page's <title> text, not the .mhtml file path, so no path-stripping number needs to be set
	# Run in a directory where you have run `mkdir ./out` and have mhtml files in current folder or subfolders
	# To customize where the program searches for files, edit the path used by the `find` command
	# The search directory does not need to be mounted into the docker container, since the container only uses URLs


	#######################################
	# Print the original page URL recorded in an .mhtml file.
	# Reads the first "Snapshot-Content-Location: <url>" line and strips the
	# header name plus any trailing carriage return (MHTML is a MIME format, so
	# header lines commonly end in CRLF).
	# Arguments: $1 - path to the .mhtml file
	# Outputs:   the URL on stdout
	# Returns:   0 if a non-empty URL was found, 1 otherwise
	#######################################
	extract_url() {
	  local line
	  # -a: treat the file as text even if it embeds binary parts
	  line=$(grep -a -m 1 -- 'Snapshot-Content-Location: ' "$1") || return 1
	  line=${line##*'Snapshot-Content-Location: '}
	  line=${line%$'\r'}
	  [[ -n "$line" ]] || return 1
	  printf '%s\n' "$line"
	}

	#######################################
	# Print the base name (no directory, no extension) to use for the PDF.
	# Uses the text of the first <title> tag, keeping only letters, digits,
	# spaces, '_', '.' and '-'. Slashes and backslashes are dropped as well, so
	# the result can never point into a sub-directory that does not exist.
	# Falls back to the input file's own base name when no usable title exists.
	# Arguments: $1 - path to the .mhtml file
	# Outputs:   the sanitized name on stdout
	#######################################
	pdf_name() {
	  local title base
	  # real pipes here: grep -> head -> (later) sed; -o prints only the match,
	  # and only the first match is kept if a line has several
	  title=$(grep -a -m 1 -o -- '<title>[^<]*' "$1" | head -n 1)
	  title=${title#'<title>'}
	  # hyphen goes last in the bracket expression so it is taken literally
	  title=$(printf '%s\n' "$title" | LC_ALL=C sed 's/[^a-zA-Z0-9 _.-]//g')
	  if [[ -z "${title// /}" ]]; then
	    base=${1##*/}
	    title=${base%.*}
	  fi
	  printf '%s\n' "$title"
	}

	#######################################
	# Recursively find every .mhtml file under the current directory and render
	# its original URL to ./out/<name>.pdf with wkhtmltopdf running in docker.
	# File names are read NUL-delimited, so spaces, globs and newlines in paths
	# are handled without touching the global IFS.
	# Outputs: one "processing file" line per file on stdout; skip notices on stderr
	#######################################
	main() {
	  local filename url newname
	  while IFS= read -r -d '' filename; do
	    # without a URL there is nothing to fetch; do not start a container
	    if ! url=$(extract_url "$filename"); then
	      printf 'skipping %s: no Snapshot-Content-Location header found\n' "$filename" >&2
	      continue
	    fi

	    # path as seen from inside the container (current directory is mounted at /data)
	    newname="./data/out/$(pdf_name "$filename").pdf"

	    printf '%s\n' "processing file ( $newname ) ($filename) ($url)"

	    # make sure the host-side output directory exists
	    mkdir -p ./out

	    # run wkhtmltopdf in an auto-deleting container with the current directory
	    # mounted; an absolute source path is used because some Docker versions
	    # reject relative bind-mount sources. stdin is detached so the container
	    # cannot consume the file list feeding this loop.
	    sudo docker run --rm --volume "$PWD:/data" --user "$(id -u):$(id -g)" \
	      madnight/docker-alpine-wkhtmltopdf "$url" "$newname" </dev/null
	  done < <(find ./ -iname '*.mhtml' -type f -print0)
	}

	main "$@"