kristianrl · July 10, 2024 09:38
diff --git a/pdf-ocr-to-txt.sh b/pdf-ocr-to-txt.sh
 # Extract OCR contents in PDF-documents as plain text (.txt)
 # Kristian Risager Larsen, 2024-07
 #
 # Setup:
 # You need to install Ghostscript and Tesseract
 # brew install tesseract tesseract-lang ghostscript
 #
 # Notes:
 # The "-l dan" parameter tells Tesseract to expect Danish text

 # Render every page of each PDF in the current directory to a PNG image.
 # Output files are named <pdf-name-without-.pdf>NNN.png, where NNN is the
 # zero-padded page number filled in by Ghostscript (%03d), at 300 dpi.
 render_pdf_pages() {
   local pdf_path
   for pdf_path in *.pdf; do
     # When no PDF exists the glob stays as the literal string "*.pdf";
     # skip it instead of asking Ghostscript to open a non-existent file.
     [ -e "$pdf_path" ] || continue
     gs -dNOPAUSE -sDEVICE=pngalpha -r300 -sOutputFile="${pdf_path%.pdf}%03d.png" "$pdf_path" -c quit
   done
 }
 render_pdf_pages

 # Run Tesseract OCR (Danish language data) on every PNG in the current
 # directory, writing the recognised text next to each image as <name>.txt.
 ocr_png_pages() {
   local png_path
   for png_path in *.png; do
     # When no PNG exists the glob stays as the literal string "*.png";
     # skip it instead of passing a non-existent file to Tesseract.
     [ -e "$png_path" ] || continue
     # Tesseract's second argument is an output *base*: it appends ".txt"
     # itself, so passing "<name>.txt" would produce "<name>.txt.txt".
     tesseract "$png_path" "${png_path%.png}" -l dan
   done
 }
 ocr_png_pages
	# Extract OCR contents in PDF-documents as plain text (.txt)
	# Kristian Risager Larsen, 2024-07
	#
	# Setup:
	# You need to install Ghostscript and Tesseract
	# brew install tesseract tesseract-lang ghostscript
	#
	# Notes:
	# The "-l dan" parameter tells Tesseract to expect Danish text

	# Convert each PDF in the current directory into one 300 dpi PNG per page.
	# Ghostscript replaces %03d in the output name with the page number, so
	# "report.pdf" yields report001.png, report002.png, ...
	render_pdf_pages() {
	  local pdf_path
	  for pdf_path in *.pdf; do
	    # An unmatched glob is left as the literal "*.pdf"; do not hand that
	    # to Ghostscript as if it were a real file.
	    [ -e "$pdf_path" ] || continue
	    gs -dNOPAUSE -sDEVICE=pngalpha -r300 -sOutputFile="${pdf_path%.pdf}%03d.png" "$pdf_path" -c quit
	  done
	}
	render_pdf_pages

	# OCR every PNG in the current directory with Tesseract, using the Danish
	# language data ("-l dan"), and store the text as <image-name>.txt.
	ocr_png_pages() {
	  local png_path
	  for png_path in *.png; do
	    # An unmatched glob is left as the literal "*.png"; skip it.
	    [ -e "$png_path" ] || continue
	    # Pass the output base without an extension: Tesseract adds ".txt"
	    # on its own, so "<name>.txt" here would end up as "<name>.txt.txt".
	    tesseract "$png_path" "${png_path%.png}" -l dan
	  done
	}
	ocr_png_pages