Last active
March 16, 2022 21:16
-
-
Save alecjacobson/69402c750787efc56b3a2a4cac590a82 to your computer and use it in GitHub Desktop.
This script attempts to create a copy of the input TeX directory whose total size is below arXiv's 10,000 KB (i.e., 10 MB) submission limit.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Without an input directory there is nothing to do: print the help text
# (followed by a blank line) and stop with a zero exit status.
if [[ -z "$1" ]]; then
  cat <<'EOF'
USAGE:
procrusteanarxiv path/to/input/dir/containing/tex/files/
This script tested with dependencies:
gs Ghostscript version 9.27 (9.21 is buggy)
latexmk version 4.52c
rsync version 2.6.9
imagemagick version 7.0.8-12
This script will attempt to create a copy of the input tex directory whose total
size is less than the 10000 KBs (i.e., 10MBs) limit of ArXiv. It first tries:
1. deleting all unnecessary files.
If this doesn't work, it tries to
2. repack all .pdf's using high-quality jpeg compression without downsampling.
If that is still not enough, it will
3. run a binary search on the downsampling resolution to apply to all .pdf's.

EOF
  exit 0
fi
# gs version 9.21 has a bug that makes downsampled images ugly. Appears to be
# fixed by at least 9.27.
#
# Ghostscript binary used by the gs_* helpers, in priority order:
#   1. $GS from the environment, if set and non-empty;
#   2. the locally built 9.27 binary, if it exists on this machine;
#   3. whatever `gs` resolves to on PATH.
default_gs="/Users/ajx/Downloads/ghostscript-9.27/bin/gs"
if [[ -n "${GS:-}" ]]; then
  MY_GS="${GS}"
elif [[ -x "$default_gs" ]]; then
  MY_GS="$default_gs"
else
  MY_GS="gs"
fi
#######################################
# Print the summed size, in bytes, of every non-directory entry below a
# directory (symlinks count with their own size, they are not followed).
# Arguments: $1 - directory to measure
# Outputs:   a single integer on stdout; 0 when there are no files
#######################################
get_dir_size_in_b() {
  local dir="$1"
  local -a size_cmd
  # GNU stat spells "size in bytes" as -c '%s'; BSD/macOS stat rejects -c and
  # wants -f '%z' instead (on GNU, -f would mean "filesystem status").
  if stat -c '%s' / >/dev/null 2>&1; then
    size_cmd=(stat -c '%s')
  else
    size_cmd=(stat -f '%z')
  fi
  # -exec ... {} + never runs stat with an empty file list, and the explicit
  # "%.0f" keeps awk from printing nothing (no input) or an exponent form
  # (very large sums).
  find "$dir" ! -type d -exec "${size_cmd[@]}" {} + \
    | awk '{ sum += $1 } END { printf "%.0f\n", sum }'
}
#######################################
# Print the size of a directory tree in decimal kilobytes (bytes / 1000,
# rounded down). This is computed from exact byte counts rather than from
# `du -k -d0`.
# Arguments: $1 - directory to measure
# Outputs:   a single integer on stdout
# Returns:   0 on success (also when the result is 0, unlike `expr`, which
#            exits with status 1 whenever it prints 0)
#######################################
get_dir_size_in_kb() {
  local bytes
  bytes=$(get_dir_size_in_b "$1") || return
  # An empty byte count (nothing measured) is treated as 0.
  printf '%s\n' "$(( ${bytes:-0} / 1000 ))"
}
# Print the disk usage of "$1" as reported by `du -k`, i.e. only the leading
# (tab-terminated) size column of each line du emits.
get_file_size_in_kb() {
  du -k "$1" | awk -F'\t' '{ print $1 }'
}
#######################################
# Rewrite a PDF in place using one of Ghostscript's -dPDFSETTINGS presets.
# Globals:   MY_GS (read) - Ghostscript executable
# Arguments: $1 - PDF file to rewrite
#            $2 - preset name without the leading slash (e.g. default)
# Returns:   0 if the file was replaced; non-zero (file left untouched) if
#            Ghostscript failed or produced no output
#######################################
gs_preset() {
  local target="$1"
  local preset="$2"
  local scratch_dir scratch rc=0
  # A private scratch directory avoids clashes between concurrent runs and
  # guarantees a stale output from an earlier run can never be picked up.
  scratch_dir=$(mktemp -d "${TMPDIR:-/var/tmp}/gstmp.XXXXXX") || return 1
  scratch="$scratch_dir/gstmp.pdf"
  if "$MY_GS" -sOutputFile="$scratch" \
      -dQUIET \
      -dNOPAUSE -dBATCH \
      -sDEVICE=pdfwrite \
      -dPDFSETTINGS="/$preset" \
      -f "$target" \
    && [[ -s "$scratch" ]]; then
    # Only replace the original once a non-empty result exists.
    mv -- "$scratch" "$target" || rc=1
  else
    rc=1
  fi
  rm -rf -- "$scratch_dir"
  return "$rc"
}
#######################################
# Re-distill a PDF with Ghostscript's pdfwrite device, JPEG-encoding colour
# images at a given quality and subsampling them to a given resolution.
# Globals:   MY_GS (read) - Ghostscript executable
# Arguments: $1 - input PDF
#            $2 - output PDF
#            $3 - colour image resolution (/ColorImageResolution)
#            $4 - JPEG QFactor used in /ColorACSImageDict
# Returns:   exit status of Ghostscript
#######################################
gs_compress() {
  local res="$3"
  local qfactor="$4"
  # The distiller parameters are passed as two PostScript snippets; each is
  # a single line once the shell has removed the backslash-newlines.
  "$MY_GS" -sOutputFile="$2" \
    -dQUIET \
    -dNOPAUSE -dBATCH \
    -sDEVICE=pdfwrite \
    -c ".setpdfwrite << \
      /AlwaysEmbed [] \
      /AntiAliasColorImages //false \
      /AntiAliasGrayImages //false \
      /AntiAliasMonoImages //false \
      /ASCII85EncodePages //false \
      /AutoFilterColorImages //true \
      /AutoFilterGrayImages //true \
      /AutoPositionEPSFiles //true \
      /Binding /Left \
      /CalCMYKProfile (None) \
      /CalGrayProfile (None) \
      /CalRGBProfile (None) \
      /ColorImageDepth -1 \
      /ColorImageDict .defaultImageDict \
      /ColorImageDownsampleThreshold 1.5 \
      /ColorImageFilter /DCTEncode \
      /CompressPages //true \
      /ConvertImagesToIndexed //true \
      /DefaultRenderingIntent /Default \
      /DetectBlends //true \
      /DownsampleColorImages //true \
      /DownsampleGrayImages //true \
      /DownsampleMonoImages //true \
      /EmitDSCWarnings //false \
      /EncodeColorImages //true \
      /EncodeGrayImages //true \
      /EncodeMonoImages //true \
      /EndPage -1 \
      /GrayImageDepth -1 \
      /GrayImageDict .defaultImageDict \
      /GrayImageDownsampleThreshold 1.5 \
      /GrayImageFilter /DCTEncode \
      /ImageMemory 524288 \
      /LockDistillerParams //false \
      /MaxSubsetPct 100 \
      >> setdistillerparams" \
    -c ".setpdfwrite << \
      /MonoImageDepth -1 \
      /MonoImageDict mark \
      /K -1 \
      .dicttomark readonly \
      /MonoImageDownsampleThreshold 1.5 \
      /MonoImageFilter /CCITTFaxEncode \
      /OffOptimizations 0 \
      /OPM 1 \
      /Optimize //true \
      /ParseDSCComments //true \
      /ParseDSCCommentsForDocInfo //true \
      /PDFXTrimBoxToMediaBoxOffset [0 0 0 0] \
      /PDFXSetBleedBoxToMediaBox //true \
      /PDFXBleedBoxToTrimBoxOffset [0 0 0 0] \
      /PreserveCopyPage //true \
      /PreserveHalftoneInfo //false \
      /sRGBProfile (None) \
      /StartPage 1 \
      /SubsetFonts //true \
      /TransferFunctionInfo /Preserve \
      /UseFlateCompression //true \
      /UsePrologue //false \
      /PassThroughJPEGImages //true \
      /AutoRotatePages /None \
      /CannotEmbedFontPolicy /Error \
      /ColorACSImageDict << /QFactor $qfactor /Blend 1 /ColorTransform 1 /HSamples [2 1 1 2] /VSamples [2 1 1 2] >> \
      /ColorConversionStrategy /LeaveColorUnchanged \
      /ColorImageDownsampleType /Subsample \
      /ColorImageResolution $res \
      /CompatibilityLevel 1.7 \
      /CreateJobTicket //true \
      /DoThumbnails //true \
      /EmbedAllFonts //true \
      /GrayACSImageDict .prepressACSImageDict \
      /GrayImageDownsampleType /Bicubic \
      /GrayImageResolution 300 \
      /MonoImageDownsampleType /Subsample \
      /MonoImageResolution 1200 \
      /NeverEmbed [] \
      /PreserveEPSInfo //true \
      /PreserveOPIComments //true \
      /PreserveOverprintSettings //true \
      /UCRandBGInfo /Preserve \
      >> setdistillerparams" \
    -f "$1"
}
# https://stackoverflow.com/a/10453202/148668
#######################################
# Compress and downsample a PDF in place via gs_compress.
# Usage:     gs_compress_inplace input.pdf res qfactor
# Example:   gs_compress_inplace input.pdf 72 0.01
# Arguments: $1 - PDF file to rewrite
#            $2 - colour image resolution handed to gs_compress
#            $3 - JPEG QFactor handed to gs_compress
# Returns:   0 if the file was replaced; non-zero (file left untouched) if
#            gs_compress failed or produced no output
#######################################
gs_compress_inplace() {
  local target="$1"
  local res="$2"
  local qfactor="$3"
  local scratch_dir scratch rc=0
  # A private scratch directory avoids clashes between concurrent runs and
  # guarantees a stale output from an earlier run can never be moved over
  # the target.
  scratch_dir=$(mktemp -d "${TMPDIR:-/var/tmp}/gstmp.XXXXXX") || return 1
  scratch="$scratch_dir/gstmp.pdf"
  if gs_compress "$target" "$scratch" "$res" "$qfactor" \
    && [[ -s "$scratch" ]]; then
    mv -- "$scratch" "$target" || rc=1
  else
    rc=1
  fi
  rm -rf -- "$scratch_dir"
  return "$rc"
}
# Convert image "$1" into "$2" with ImageMagick: flatten onto a white
# background, drop the alpha channel, shrink to fit 2048x2048 (only if
# larger) and write at quality 90. A debug line naming both files is printed
# first.
png2jpg() {
  printf ' [debug] %s %s\n' "$1" "$2"
  local -a im_args=(
    "$1"
    -background white
    -flatten
    -alpha off
    -resize '2048x2048>'
    -quality 90
    "$2"
  )
  convert "${im_args[@]}"
}
#######################################
# Re-encode an image as JPEG in place, keeping its file name (and therefore
# its extension) unchanged.
# Arguments: $1 - image file to rewrite
# Returns:   0 if the file was replaced; non-zero (file left untouched) if
#            the conversion failed or produced no output
#######################################
png2jpg_inplace() {
  local target="$1"
  local scratch_dir scratch rc=0
  # The scratch file lives in a private directory (no clashes, no stale
  # leftovers) and keeps a .jpg name, since the converter is given nothing
  # but the file name to select the output format.
  scratch_dir=$(mktemp -d "${TMPDIR:-/var/tmp}/png2jpgtmp.XXXXXX") || return 1
  scratch="$scratch_dir/png2jpgtmp.jpg"
  if png2jpg "$target" "$scratch" && [[ -s "$scratch" ]]; then
    mv -- "$scratch" "$target" || rc=1
  else
    rc=1
  fi
  rm -rf -- "$scratch_dir"
  return "$rc"
}
# Re-encode every image below the current directory as JPEG, first all
# *.png files and then all *.jpg files. File names are left untouched
# (latex doesn't seem to care about the extension), so the .tex sources
# never need editing to find the converted files.
png2jpg_all() {
  local pattern
  for pattern in "*.png" "*.jpg"; do
    # NUL-delimited so that any file name survives; the loop body runs in
    # the pipeline's subshell.
    find . -type f -name "$pattern" -print0 \
      | while IFS= read -r -d '' file; do
          png2jpg_inplace "$file"
        done
  done
}
#######################################
# Binary search for the largest integer i in [min, max] for which
# `command j` succeeds (exits with status 0), where j = 10*i. Success is
# assumed to be monotone: if j works, every smaller j works too.
# Arguments: $1 - lower bound for i (returned as-is if nothing succeeds;
#                 never probed itself)
#            $2 - upper bound for i
#            $3 - command to probe; it is invoked with j as its last argument
# Outputs:   the resulting i on stdout. Anything the probed command writes
#            to stdout is redirected to stderr, so stdout carries only i.
#######################################
dichotomic_search_ten() {
  # All state is local so the caller's variables (e.g. min/max) survive.
  local lo=$1
  local hi=$2
  local probe=$3
  local mid
  while (( lo < hi )); do
    # Midpoint rounded up, so the loop always makes progress when lo is kept.
    mid=$(( (lo + hi + 1) / 2 ))
    # shellcheck disable=SC2086 # $probe is split on purpose: it may carry
    # leading arguments of its own.
    if $probe "$(( mid * 10 ))" 1>&2; then
      lo=$mid
    else
      hi=$(( mid - 1 ))
    fi
  done
  echo "$lo"
}
# Directory with the .tex sources, as given on the command line.
input_dir="$1"
input_base="$(basename "$input_dir")"
# Scratch directories, created in the current working directory.
stripped_dir="procrustean-arxiv-stripped"
compress="procrustean-arxiv-compress"
# Final output directory.
under="$input_base-procrustean-arxiv"
##############################################################################
# Create a local copy
##############################################################################
# Force exactly one trailing slash on the source so rsync copies the
# directory's *contents*; without it rsync would nest the directory itself
# inside $stripped_dir and no .tex file would be found below.
if ! rsync -r --exclude=.git "${input_dir%/}/" "$stripped_dir" --delete; then
  echo "could not copy \"$input_dir\"." >&2
  exit 1
fi
size=$(get_dir_size_in_kb "$stripped_dir")
echo "Original size: $size KBs"
# Everything below deletes files relative to ".", so never continue if the
# working copy could not be entered.
cd "$stripped_dir" || exit 1
# determine main tex file: the first top-level .tex file with an uncommented
# \documentclass (grep -l may list several files, keep only the first)
main_tex=$(grep -l -m 1 '^[^\%]*\\documentclass' -- *.tex | head -n 1)
if [ -z "$main_tex" ]; then
  echo "could not find main file."
  exit 1
fi
##############################################################################
# Use latexmk to build the document as a pdf and extract file list
##############################################################################
# determine basename (e.g., of .pdf or .fls)
main_base="${main_tex%.*}"
# Generating a latexmk file
cat > latexmkrc <<EOF
\$pdf_mode = 1;
\$dvi_mode = \$postscript_mode = 0;
\$pdflatex = 'pdflatex --shell-escape -synctex=1 %O %S';
@default_files = ( '$main_tex' );
EOF
if ! latexmk -silent 2>/dev/null; then
  echo "latexmk failed"
  exit 1
fi
# The .fls recorder file lists every file the build touched. Without it the
# loop below would consider every file unused and delete all of them.
if [ ! -f "$main_base.fls" ]; then
  echo "latexmk did not produce $main_base.fls" >&2
  exit 1
fi
# Remove every file that is not mentioned in the .fls
find . -type f -print0 | while IFS= read -r -d '' file; do
  # strip "./" off the front
  file="${file#./}"
  case "$file" in
    "$main_base.fls" | latexmkrc)
      # needed until the build artefacts are cleaned up below
      continue
      ;;
    "$main_base.pdf")
      # just immediately remove the main pdf (it doesn't actually matter)
      rm -- "$file"
      continue
      ;;
  esac
  if ! grep -iqF -- "$file" "$main_base.fls"; then
    rm -- "$file"
  fi
done
# clean up latexmk junk
latexmk -C -silent 2>/dev/null
# pop up
cd .. || exit 1
size=$(get_dir_size_in_kb "$stripped_dir")
echo "Removing unused files: $size KBs"
# Size budget in KB. Sizes are rounded down to whole KBs, so a directory only
# counts as fitting when it is strictly below the limit.
limit_kb=10000

#######################################
# Probe used by the binary search: rebuild $compress from $stripped_dir,
# convert all images to JPEG, compress/downsample every .pdf at the given
# resolution, and publish the result to $under if it fits the budget.
# Usage:     compress_and_measure 600
# Globals:   stripped_dir, compress, under, limit_kb (read)
# Arguments: $1 - resolution passed to gs_compress_inplace
# Returns:   0 if the result is below the limit (and was copied to $under),
#            1 otherwise
#######################################
compress_and_measure() {
  local res="$1"
  local size
  echo " [debug] res=$1"
  rsync -r --exclude=.git "$stripped_dir/" "$compress" --delete
  cd "$compress" || return 1
  png2jpg_all
  # compress and downsample each .pdf file
  find . -type f -name "*.pdf" -print0 | while IFS= read -r -d '' file; do
    echo " [debug] $file ..."
    gs_compress_inplace "$file" "$res" 0.1
  done
  cd .. || return 1
  size=$(get_dir_size_in_kb "$compress")
  echo " [debug] size=$size"
  if [ "$size" -lt "$limit_kb" ]; then
    rsync -r --exclude=.git "$compress/" "$under" --delete
    return 0
  fi
  return 1
}

if [ "$size" -lt "$limit_kb" ]; then
  rsync -r --exclude=.git "$stripped_dir/" "$under" --delete
else
  ############################################################################
  # Try simply converting to jpeg without downsampling
  ############################################################################
  rsync -r --exclude=.git "$stripped_dir/" "$compress" --delete
  cd "$compress" || exit 1
  png2jpg_all
  find . -type f -name "*.pdf" -print0 | while IFS= read -r -d '' file; do
    # compress each .pdf file in place
    echo " [debug] $file ..."
    gs_preset "$file" default
  done
  cd .. || exit 1
  size=$(get_dir_size_in_kb "$compress")
  echo "JPEG compression without downsampling: $size KBs"
  if [ "$size" -lt "$limit_kb" ]; then
    rsync -r --exclude=.git "$compress/" "$under" --delete
  else
    ##########################################################################
    # Binary search over downsampling resolutions
    ##########################################################################
    # The search works on i = 1..100 and probes resolution 10*i; $under
    # always holds the output of the last probe that fit.
    best_index=$(dichotomic_search_ten 0 100 compress_and_measure)
    if [ "${best_index:-0}" -le 0 ]; then
      # Not even the lowest probed resolution (10 dpi) fit, so $under was
      # never written by this run.
      echo "could not get below $limit_kb KBs even at 10 dpi." >&2
      rm -rf -- "${compress:?}" "${stripped_dir:?}"
      exit 1
    fi
    ideal_res=$(( best_index * 10 ))
    size=$(get_dir_size_in_kb "$under")
    echo "JPEG compression with downsampling to $ideal_res dpi: $size KBs"
  fi
fi
size=$(get_dir_size_in_kb "$under")
echo "---------------------------------------------------------------"
echo "Output \"./$under/\": $size KBs"
# clean up temp directories
rm -rf -- "${compress:?}" "${stripped_dir:?}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment