benizar · February 14, 2022 18:48
diff --git a/README.md b/README.md
diff --git a/_pdf-break-down.sh b/_pdf-break-down.sh
 #!/bin/bash -e
 #
 # Break PDF slide decks down into an editable Markdown skeleton.
 #
 # Usage: _pdf-break-down.sh FILE.pdf [FILE.pdf ...]
 #
 # For every PDF argument a working directory named after the sanitized file
 # name is created in the current directory:
 #   <name>/pages/    one single-page PDF per slide (pdftk burst)
 #   <name>/images/   the images of every slide as PNG (pdfimages)
 #   <name>/<file name without extension, "_" replaced by "-">.md
 #                    YAML header, text of each slide, references section
 #
 # Requires pdftk, pdftotext, pdfimages and a sed that supports -r, -i and
 # \xHH escapes (GNU sed).

 #TODO: Ensure usability from any path

 # Options on the shebang line are lost when the script is started as
 # `bash script.sh`, so strict mode is enabled explicitly as well.
 set -euo pipefail

 # Print the directory-safe form of a file name: extension dropped, accented
 # letters folded to ASCII, lower-cased, and every character outside
 # A-Za-z0-9_- removed.
 #   $1 - file name (without directory part)
 sanitize_name() {
   local name=${1%.*}
   local pair from to
   # Fold accents with plain string substitution rather than `tr`: GNU tr
   # works on single bytes, so multi-byte UTF-8 letters would be mangled.
   for pair in á=a Á=a à=a À=a ã=a Ã=a â=a Â=a é=e É=e ê=e Ê=e í=i Í=i \
               ó=o Ó=o õ=o Õ=o ô=o Ô=o ú=u Ú=u ñ=n Ñ=n ç=c Ç=c ª=a º=o; do
     from=${pair%=*}
     to=${pair#*=}
     name=${name//"$from"/$to}
   done
   printf '%s' "$name" \
     | LC_ALL=C tr '[:upper:]' '[:lower:]' \
     | LC_ALL=C tr -cd 'A-Za-z0-9_-'
 }

 # Append the YAML front matter and the first heading to the Markdown file.
 #   $1 - Markdown file
 write_header() {
   # printf instead of a here-document: the output does not depend on how
   # this script itself is indented.
   printf '%s\n' \
     '---' \
     '# Edit if you want to overwrite defaults' \
     'language: catalan' \
     'draft: false' \
     '' \
     'title: Slides title' \
     'keywords: keyword1, keyword2, keyword3' \
     '' \
     '#nocite: |' \
     '#  @authorYearTopic' \
     '---' \
     '' \
     '# Introducció' \
     '' >> "$1"
 }

 # Append one slide (heading, extracted text, image links) to the Markdown
 # file and extract the slide's images as PNG.
 #   $1 - single-page PDF
 #   $2 - image directory
 #   $3 - Markdown file
 append_page() {
   local page=$1 image_dir=$2 markdown=$3
   local page_stem img
   page_stem=${page##*/}
   page_stem=${page_stem%.*}

   echo "Extracting raw text from PDF."
   printf '\n\n## New slide\n\n' >> "$markdown"
   pdftotext -enc UTF-8 -layout "$page" - >> "$markdown"

   echo "Extracting all images in PNG format to '$image_dir' (Page '$page')"
   pdfimages -png "$page" "./$image_dir/$page_stem"

   # Link the files pdfimages actually wrote instead of guessing their
   # names from the line count of `pdfimages -list`.
   # NOTE(review): links are bare file names, as before; the renderer
   # presumably needs the images directory on its resource path - confirm.
   for img in "./$image_dir/$page_stem"-*.png; do
     [[ -e "$img" ]] || continue   # glob did not match: page has no images
     printf '![](%s)\n' "${img##*/}" >> "$markdown"
   done
 }

 # Normalize the extracted text in place. All expressions work line by line
 # and are applied in the order listed.
 #   $1 - Markdown file
 clean_text() {
   # LC_ALL=C makes sed match raw bytes, so the UTF-8 byte patterns below
   # behave the same in every locale.
   #   1. form feeds (page breaks emitted by pdftotext)
   #   2. code points U+E000..U+F8FF (private use area) in UTF-8 encoding
   #      NOTE(review): assumes the whole range may be dropped - confirm.
   #   3. runs of spaces -> one space
   #   4. leading and trailing blanks
   #   5-7. en dash -> "-", typographic apostrophe -> "'", ellipsis -> "..."
   LC_ALL=C sed -r -i \
     -e 's/\x0c//g' \
     -e 's/\xee[\x80-\xbf]{2}|\xef[\x80-\xa3][\x80-\xbf]//g' \
     -e 's/ {2,}/ /g' \
     -e 's/^[ \t]*//;s/[ \t]*$//' \
     -e 's/–/-/g' \
     -e "s/’/'/g" \
     -e 's/…/.../g' \
     "$1"
 }

 # Process a single PDF: create the working directory, burst the pages and
 # build the Markdown file.
 #   $1 - path of the PDF
 # Returns non-zero (aborting the script under `set -e`) when the file is
 # missing, no directory name can be derived or a tool fails.
 process_pdf() {
   local file=$1
   local filename stem main_dir page_dir image_dir markdown page

   echo "Processing '$file'"
   if [[ ! -f "$file" ]]; then
     echo "No such file: $file" >&2
     return 1
   fi

   filename=${file##*/}
   stem=${filename%.*}
   main_dir=$(sanitize_name "$filename")
   # An empty name would turn the directories below into "/pages" etc.
   if [[ -z "$main_dir" ]]; then
     echo "Cannot derive a directory name from '$filename'" >&2
     return 1
   fi
   page_dir=$main_dir/pages
   image_dir=$main_dir/images
   markdown=./$main_dir/${stem//_/-}.md

   # Plain mkdir on purpose: an existing directory aborts the run instead
   # of appending to the results of an earlier one.
   echo "Creating a working directory"
   mkdir "$main_dir"
   mkdir "$page_dir"
   mkdir "$image_dir"

   echo "PDF Burst"
   # NOTE(review): with more than 99 pages the %02d names no longer sort in
   # page order under the glob below.
   pdftk "$file" burst output "$page_dir"/page_%02d.pdf

   write_header "$markdown"

   for page in "$page_dir"/*.pdf; do
     [[ -e "$page" ]] || continue
     append_page "$page" "$image_dir" "$markdown"
   done

   clean_text "$markdown"

   # Insert a references section
   printf '\n## References\n\n' >> "$markdown"
 }

 # Entry point: handle every argument, accepting the .pdf extension in any
 # letter case.
 main() {
   local file
   for file in "$@"; do
     case "$file" in
       *.[pP][dD][fF])
         process_pdf "$file"
         ;;
       *)
         echo "This is not a PDF: $file"
         ;;
     esac
   done

   echo "Mission accomplished ;)"
 }

 main "$@"
	#!/bin/bash -e
	#
	# Break PDF slide decks down into an editable Markdown skeleton.
	#
	# Usage: _pdf-break-down.sh FILE.pdf [FILE.pdf ...]
	#
	# For every PDF argument a working directory named after the sanitized file
	# name is created in the current directory:
	#   <name>/pages/    one single-page PDF per slide (pdftk burst)
	#   <name>/images/   the images of every slide as PNG (pdfimages)
	#   <name>/<file name without extension, "_" replaced by "-">.md
	#                    YAML header, text of each slide, references section
	#
	# Requires pdftk, pdftotext, pdfimages and a sed that supports -r, -i and
	# \xHH escapes (GNU sed).

	#TODO: Ensure usability from any path

	# Options on the shebang line are lost when the script is started as
	# `bash script.sh`, so strict mode is enabled explicitly as well.
	set -euo pipefail

	# Print the directory-safe form of a file name: extension dropped, accented
	# letters folded to ASCII, lower-cased, and every character outside
	# A-Za-z0-9_- removed.
	#   $1 - file name (without directory part)
	sanitize_name() {
	  local name=${1%.*}
	  local pair from to
	  # Fold accents with plain string substitution rather than `tr`: GNU tr
	  # works on single bytes, so multi-byte UTF-8 letters would be mangled.
	  for pair in á=a Á=a à=a À=a ã=a Ã=a â=a Â=a é=e É=e ê=e Ê=e í=i Í=i \
	              ó=o Ó=o õ=o Õ=o ô=o Ô=o ú=u Ú=u ñ=n Ñ=n ç=c Ç=c ª=a º=o; do
	    from=${pair%=*}
	    to=${pair#*=}
	    name=${name//"$from"/$to}
	  done
	  printf '%s' "$name" \
	    | LC_ALL=C tr '[:upper:]' '[:lower:]' \
	    | LC_ALL=C tr -cd 'A-Za-z0-9_-'
	}

	# Append the YAML front matter and the first heading to the Markdown file.
	#   $1 - Markdown file
	write_header() {
	  # printf instead of a here-document: the output does not depend on how
	  # this script itself is indented.
	  printf '%s\n' \
	    '---' \
	    '# Edit if you want to overwrite defaults' \
	    'language: catalan' \
	    'draft: false' \
	    '' \
	    'title: Slides title' \
	    'keywords: keyword1, keyword2, keyword3' \
	    '' \
	    '#nocite: |' \
	    '#  @authorYearTopic' \
	    '---' \
	    '' \
	    '# Introducció' \
	    '' >> "$1"
	}

	# Append one slide (heading, extracted text, image links) to the Markdown
	# file and extract the slide's images as PNG.
	#   $1 - single-page PDF
	#   $2 - image directory
	#   $3 - Markdown file
	append_page() {
	  local page=$1 image_dir=$2 markdown=$3
	  local page_stem img
	  page_stem=${page##*/}
	  page_stem=${page_stem%.*}

	  echo "Extracting raw text from PDF."
	  printf '\n\n## New slide\n\n' >> "$markdown"
	  pdftotext -enc UTF-8 -layout "$page" - >> "$markdown"

	  echo "Extracting all images in PNG format to '$image_dir' (Page '$page')"
	  pdfimages -png "$page" "./$image_dir/$page_stem"

	  # Link the files pdfimages actually wrote instead of guessing their
	  # names from the line count of `pdfimages -list`.
	  # NOTE(review): links are bare file names, as before; the renderer
	  # presumably needs the images directory on its resource path - confirm.
	  for img in "./$image_dir/$page_stem"-*.png; do
	    [[ -e "$img" ]] || continue   # glob did not match: page has no images
	    printf '![](%s)\n' "${img##*/}" >> "$markdown"
	  done
	}

	# Normalize the extracted text in place. All expressions work line by line
	# and are applied in the order listed.
	#   $1 - Markdown file
	clean_text() {
	  # LC_ALL=C makes sed match raw bytes, so the UTF-8 byte patterns below
	  # behave the same in every locale.
	  #   1. form feeds (page breaks emitted by pdftotext)
	  #   2. code points U+E000..U+F8FF (private use area) in UTF-8 encoding
	  #      NOTE(review): assumes the whole range may be dropped - confirm.
	  #   3. runs of spaces -> one space
	  #   4. leading and trailing blanks
	  #   5-7. en dash -> "-", typographic apostrophe -> "'", ellipsis -> "..."
	  LC_ALL=C sed -r -i \
	    -e 's/\x0c//g' \
	    -e 's/\xee[\x80-\xbf]{2}|\xef[\x80-\xa3][\x80-\xbf]//g' \
	    -e 's/ {2,}/ /g' \
	    -e 's/^[ \t]*//;s/[ \t]*$//' \
	    -e 's/–/-/g' \
	    -e "s/’/'/g" \
	    -e 's/…/.../g' \
	    "$1"
	}

	# Process a single PDF: create the working directory, burst the pages and
	# build the Markdown file.
	#   $1 - path of the PDF
	# Returns non-zero (aborting the script under `set -e`) when the file is
	# missing, no directory name can be derived or a tool fails.
	process_pdf() {
	  local file=$1
	  local filename stem main_dir page_dir image_dir markdown page

	  echo "Processing '$file'"
	  if [[ ! -f "$file" ]]; then
	    echo "No such file: $file" >&2
	    return 1
	  fi

	  filename=${file##*/}
	  stem=${filename%.*}
	  main_dir=$(sanitize_name "$filename")
	  # An empty name would turn the directories below into "/pages" etc.
	  if [[ -z "$main_dir" ]]; then
	    echo "Cannot derive a directory name from '$filename'" >&2
	    return 1
	  fi
	  page_dir=$main_dir/pages
	  image_dir=$main_dir/images
	  markdown=./$main_dir/${stem//_/-}.md

	  # Plain mkdir on purpose: an existing directory aborts the run instead
	  # of appending to the results of an earlier one.
	  echo "Creating a working directory"
	  mkdir "$main_dir"
	  mkdir "$page_dir"
	  mkdir "$image_dir"

	  echo "PDF Burst"
	  # NOTE(review): with more than 99 pages the %02d names no longer sort in
	  # page order under the glob below.
	  pdftk "$file" burst output "$page_dir"/page_%02d.pdf

	  write_header "$markdown"

	  for page in "$page_dir"/*.pdf; do
	    [[ -e "$page" ]] || continue
	    append_page "$page" "$image_dir" "$markdown"
	  done

	  clean_text "$markdown"

	  # Insert a references section
	  printf '\n## References\n\n' >> "$markdown"
	}

	# Entry point: handle every argument, accepting the .pdf extension in any
	# letter case.
	main() {
	  local file
	  for file in "$@"; do
	    case "$file" in
	      *.[pP][dD][fF])
	        process_pdf "$file"
	        ;;
	      *)
	        echo "This is not a PDF: $file"
	        ;;
	    esac
	  done

	  echo "Mission accomplished ;)"
	}

	main "$@"