bash pdf-break-down.sh folder/*
Last active
February 14, 2022 18:48
-
-
Save benizar/c150a476b103579b60ce6d19da4d2678 to your computer and use it in GitHub Desktop.
Extract images and text from a list of PDF files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# pdf-break-down.sh - Extract images and text from a list of PDF files.
#
# Usage: bash pdf-break-down.sh folder/*
#
# For every argument ending in .pdf, a working directory named after the
# sanitized file name is created in the current directory. It receives:
#   pages/     one PDF per page (pdftk burst)
#   images/    the images of every page as PNG (pdfimages)
#   <name>.md  a YAML header followed by the raw text of every page
#              (pdftotext), image links and a "References" section
# Any other argument is reported on stderr and skipped.
#
# Requires: pdftk, pdftotext, pdfimages and a sed that supports -i and -E.
#
# TODO: Ensure usability from any path

# Strict mode lives here rather than in the shebang so that it is still in
# effect when the script is started as `bash pdf-break-down.sh ...`.
set -euo pipefail

#######################################
# Print a message on stderr and abort the script.
# Arguments: $* - message
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Abort unless every external tool used for PDF processing is installed.
#######################################
require_tools() {
  local tool
  for tool in pdftk pdftotext pdfimages sed; do
    command -v "$tool" >/dev/null || die "required tool not found: $tool"
  done
}

#######################################
# Turn a file stem into a safe directory name.
# ASCII letters are lower-cased, the listed accented letters are folded to
# the plain letter given in the table below (upper-case accented letters map
# to upper-case plain letters), and every character outside A-Za-z0-9_- is
# dropped.
# Arguments: $1 - name to sanitize
# Outputs:   sanitized name on stdout (no trailing newline)
#######################################
sanitize_name() {
  local name=$1
  local -r accented=(á Á à À ã Ã â Â é É ê Ê í Í ó Ó õ Õ ô Ô ú Ú ñ Ñ ç Ç ª º)
  local -r plain=(a A a A a A a A e E e E i I o O o O o O u U n N c C a o)
  local i

  name=$(printf '%s' "$name" | LC_ALL=C tr '[:upper:]' '[:lower:]')

  # Whole-string replacement instead of `tr`: a byte-oriented tr maps the
  # individual bytes of each UTF-8 letter and corrupts the name.
  for i in "${!accented[@]}"; do
    name=${name//"${accented[i]}"/${plain[i]}}
  done

  printf '%s' "$name" | LC_ALL=C tr -cd 'A-Za-z0-9_-'
}

#######################################
# Start the Markdown file with the default YAML header and first heading.
# An existing file is truncated so a re-run does not duplicate content.
# Arguments: $1 - Markdown file to create
#######################################
write_header() {
  local file=$1
  printf '%s\n' \
    '---' \
    '# Edit if you want to overwrite defaults' \
    'language: catalan' \
    'draft: false' \
    'title: Slides title' \
    'keywords: keyword1, keyword2, keyword3' \
    '#nocite: |' \
    '# @authorYearTopic' \
    '---' \
    '# Introducció' \
    >"$file"
}

#######################################
# Clean the extracted text in place.
# Arguments: $1 - Markdown file to clean
#######################################
clean_text() {
  local file=$1

  # LC_ALL=C makes sed work on bytes, so the multi-byte sequences below are
  # matched literally whatever the caller's locale is. In order:
  #   1. form feeds (page breaks)
  #   2. Private Use Area code points U+E000-U+F8FF in their UTF-8 encoding
  #   3. runs of spaces collapsed to a single space
  #   4. leading and trailing blanks
  #   5. en dash, typographic apostrophe and ellipsis replaced by ASCII
  LC_ALL=C sed -E -i \
    -e $'s/\f//g' \
    -e $'s/\xEE[\x80-\xBF]{2}|\xEF[\x80-\xA3][\x80-\xBF]//g' \
    -e 's/ {2,}/ /g' \
    -e 's/^[[:blank:]]+//' \
    -e 's/[[:blank:]]+$//' \
    -e 's/–/-/g' \
    -e "s/’/'/g" \
    -e 's/…/.../g' \
    -- "$file"
}

#######################################
# Break one PDF down into pages, images and a Markdown text file.
# Arguments: $1 - path of the PDF
#######################################
process_pdf() {
  local file=$1
  local filename stem main_dir page_dir image_dir rawtext_file
  local page page_name image

  require_tools
  printf "Processing '%s'\n" "$file"

  # Sanitize filename
  filename=$(basename -- "$file")
  stem=${filename%.*}
  main_dir=$(sanitize_name "$stem")
  [[ -n "$main_dir" ]] || die "cannot derive a directory name from '$file'"

  page_dir=$main_dir/pages
  image_dir=$main_dir/images
  # The Markdown file keeps the original stem, with '_' turned into '-'.
  rawtext_file=./$main_dir/${stem//_/-}.md

  # Create a working directory (-p: tolerate a previous run)
  echo "Creating a working directory"
  mkdir -p -- "$page_dir" "$image_dir"

  echo "PDF Burst"
  pdftk "$file" burst output "$page_dir/page_%02d.pdf"

  write_header "$rawtext_file"

  for page in "$page_dir"/*.pdf; do
    [[ -e "$page" ]] || continue # glob matched nothing
    page_name=$(basename -- "${page%.*}")

    # Extract text
    echo "Extracting raw text from PDF."
    printf '\n\n## New slide\n\n' >>"$rawtext_file"
    pdftotext -enc UTF-8 -layout "$page" - >>"$rawtext_file"

    # Extract all images in PNG format
    printf "Extracting all images in PNG format to '%s' (Page '%s')\n" \
      "$image_dir" "$page"
    pdfimages -png "$page" "./$image_dir/$page_name"

    # Link every PNG found for this page, relative to the Markdown file.
    for image in "$image_dir/$page_name"-*.png; do
      [[ -e "$image" ]] || continue # page without images
      printf '![](images/%s)\n' "${image##*/}" >>"$rawtext_file"
    done
  done

  clean_text "$rawtext_file"

  # Insert a references section
  printf '\n## References\n\n' >>"$rawtext_file"
}

#######################################
# Process every argument; PDFs are broken down, anything else is skipped.
# Arguments: $@ - candidate files
#######################################
main() {
  local file
  for file in "$@"; do
    case "$file" in
      *.[pP][dD][fF])
        process_pdf "$file"
        ;;
      *)
        printf 'This is not a PDF: %s\n' "$file" >&2
        ;;
    esac
  done
  echo "Mission accomplished ;)"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment