Last active
January 20, 2025 22:32
-
-
Save J0hnL0cke/95dbf624465034e399592c5d9690eb11 to your computer and use it in GitHub Desktop.
Grabs URLs from mhtml files, then converts them to pdf using wkhtmltopdf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# MHTML To PDF Fetcher
# This short bash script opens .mhtml files, finds the original download url, then uses wkhtmltopdf to download a pdf from that site.
# Inspired by the lack of support for .mhtml in e-readers and the lack of options for converting them to other formats.
# Limitations:
# curl cannot use javascript, and thus can't get past anti-scripting protections that use JS redirects
# Because of how `find` escapes special characters, file names with special characters may turn out weird
# Sometimes wkhtmltopdf just gets stuck on a page. Press ctrl+C 3 times to break out of the container and skip to the next one
# Before using:
# Install docker
# Output PDFs are named after each page's <title> text, with unusual characters removed; no path-stripping adjustment is needed
# Run in a directory where you have run `mkdir ./out` and have mhtml files in current folder or subfolders
# To customize where the program searches for files, edit the path used by the `find` command
# The search directory does not need to be mounted into the docker container, since the container only uses URLs
# recursively find all mhtml in a directory
# Print the URL from the first "Snapshot-Content-Location: " line of the MHTML
# file $1 (e.g. "Snapshot-Content-Location: https://example.com/x").
# Prints nothing when the file has no such line.
mhtml_source_url() {
  # MIME-style files usually end header lines with CRLF; drop the CR so it
  # does not end up glued to the URL handed to wkhtmltopdf.
  grep -m 1 -- 'Snapshot-Content-Location: ' "$1" \
    | sed -E 's#.*Snapshot-Content-Location: (.*)#\1#' \
    | tr -d '\r'
}

# Print a file-system-safe base name (no directory, no extension) for the PDF
# generated from the MHTML file $1.
# The name is the text of the first <title> tag, reduced to letters, digits,
# spaces, "_", "." and "-". Slashes are removed on purpose so a page title can
# never point outside (or into a missing subdirectory of) the output folder.
# Falls back to the input file's own base name, then to "untitled", so that
# pages without a usable title do not all overwrite the same ".pdf" file.
mhtml_output_name() {
  local file=$1 title base
  # -o can print several matches from the first matching line; keep only one.
  title=$(grep -m 1 -o -- '<title>[^<]*' "$file" | head -n 1)
  title=${title#'<title>'}
  title=$(printf '%s' "$title" | LC_ALL=C tr -cd 'A-Za-z0-9 _.-')
  if [[ -z "$title" ]]; then
    base=${file##*/}
    base=${base%.*}
    title=$(printf '%s' "$base" | LC_ALL=C tr -cd 'A-Za-z0-9 _.-')
  fi
  printf '%s\n' "${title:-untitled}"
}

# Recursively find every .mhtml file under the current directory and render
# the page it was saved from to ./out/<name>.pdf with wkhtmltopdf, run in an
# auto-deleting docker container.
main() {
  local mhtml_file url out_name out_path uid gid
  uid=$(id -u)
  gid=$(id -g)

  # NUL-delimited names survive spaces, newlines and glob characters. The list
  # is read from fd 3 so nothing inside the loop can swallow it via stdin.
  while IFS= read -r -d '' -u 3 mhtml_file; do
    url=$(mhtml_source_url "$mhtml_file")
    if [[ -z "$url" ]]; then
      # Without a URL the output path would be taken as the page to fetch.
      printf 'skipping %s: no Snapshot-Content-Location header found\n' \
        "$mhtml_file" >&2
      continue
    fi

    out_name=$(mhtml_output_name "$mhtml_file")
    # Path as seen from inside the container, where the current directory is
    # mounted at /data. Presumably resolves to /data/out, i.e. ./out on the
    # host -- assumes the image's working directory is / (TODO confirm).
    out_path="./data/out/${out_name}.pdf"
    printf 'processing file ( %s ) (%s) (%s)\n' "$out_path" "$mhtml_file" "$url"

    mkdir -p ./out || return 1
    # "$PWD" instead of "./": older docker CLIs reject relative bind-mount
    # sources. --user keeps the generated PDFs owned by the invoking user.
    sudo docker run --rm --volume "$PWD:/data" --user "${uid}:${gid}" \
      madnight/docker-alpine-wkhtmltopdf "$url" "$out_path" \
      || printf 'conversion failed for %s\n' "$mhtml_file" >&2
  done 3< <(find ./ -iname '*.mhtml' -type f -print0)
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment