-
-
Save dmgig/cdd89c7ccce6bd3eded4eca5346c58d0 to your computer and use it in GitHub Desktop.
Batch process files with ABBYY FineReader using AppleScript
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Batch driver: run OCR on every file in the input directory and write each
-- result, under the same file name, into the output directory.

-- specify input and output directories (both paths must end with "/",
-- because file names are appended to them by plain concatenation)
set infile_directory to "/Users/doug/Desktop/inputs/"
set outfile_directory to "/Users/doug/Desktop/outputs/"

-- collect the names of the input files; System Events also lists hidden
-- entries such as .DS_Store, so restrict the list to visible files
tell application "System Events"
	set infile_names to name of every file of folder infile_directory whose visible is true
end tell

-- process each input file
-- NOTE(review): the output keeps the input's name and extension even though
-- run_ocr exports HTML -- confirm that this is intended.
repeat with name_ref in infile_names
	-- the loop variable is a reference into the list; take its value
	set infile_name to contents of name_ref
	set infile_path to POSIX file (infile_directory & infile_name)
	set outfile_path to POSIX file (outfile_directory & infile_name)
	run_ocr(infile_path, outfile_path)
end repeat
-- main function: run ocr on an infile and save results to an outfile
--   infile:  file reference of the document to open in FineReader
--   outfile: file reference the HTML export is written to
-- Any error raised while opening or exporting is re-raised to the caller
-- after the "auto read new pages" setting has been put back.
on run_ocr(infile, outfile)
	-- identify path to ABBYY FineReader
	set appFile to POSIX file "/Applications/FineReader.app"

	-- set FineReader parameters; these values are FineReader dictionary
	-- terms, so they are resolved inside a "using terms from" block
	using terms from application "FineReader"
		set langList to {English, Latin}
		set saveType to single file
		set toFile to outfile
		set keepPageNumberHeadersAndFootersBoolean to yes
		set keepLineBreaksAndHyphenationBoolean to yes
		set keepImageBoolean to yes
		set imageOptionsImageQualityEnum to high quality
		set keepTextAndBackgroundColorsBoolean to yes
	end using terms from

	-- close any document that is still open from an earlier run
	WaitWhileBusy()
	tell application "FineReader"
		set hasdoc to has document
		if hasdoc then
			close document
		end if
	end tell

	-- switch automatic reading of new pages off; the command's result is
	-- kept and passed back afterwards (presumably the previous setting --
	-- TODO confirm against the FineReader dictionary)
	WaitWhileBusy()
	tell application "FineReader"
		set auto_read to auto read new pages false
	end tell

	try
		tell application "Finder"
			open infile using appFile
		end tell

		-- give FineReader time to start loading before polling its state
		delay 5
		WaitWhileBusy()

		-- the end of line character below is created by pressing OPTION+ENTER
		tell application "FineReader"
			export to html toFile ¬
				ocr languages enum langList ¬
				saving type saveType ¬
				keep line breaks and hyphenation keepLineBreaksAndHyphenationBoolean ¬
				keep page numbers headers and footers keepPageNumberHeadersAndFootersBoolean ¬
				keep pictures keepImageBoolean ¬
				image quality imageOptionsImageQualityEnum ¬
				keep text and background colors keepTextAndBackgroundColorsBoolean
		end tell
		WaitWhileBusy()
	on error errMsg number errNum
		-- do not leave the application with a changed setting on failure;
		-- the restore itself is best-effort so the original error survives
		try
			tell application "FineReader"
				auto read new pages auto_read
			end tell
		end try
		error errMsg number errNum
	end try

	-- close the current file
	tell application "FineReader"
		auto read new pages auto_read
		close document
	end tell
end run_ocr
-- shut ABBYY FineReader down once the batch above has finished
tell application "FineReader" to quit
-- helpers to wait for thread to open up
-- Block until FineReader reports that it is no longer busy.
on WaitWhileBusy()
	repeat while IsMainApplicationBusy()
		-- pause between polls so the loop does not spin at full speed and
		-- send Apple events to FineReader continuously
		delay 0.5
	end repeat
end WaitWhileBusy
-- Ask FineReader for its "is busy" state and hand the answer back.
on IsMainApplicationBusy()
	tell application "FineReader"
		set busyFlag to is busy
	end tell
	return busyFlag
end IsMainApplicationBusy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# pip install pyobjc | |
# pip install py-applescript | |
import applescript, os, glob, sys | |
# Compiled AppleScript that drives ABBYY FineReader. Python invokes the
# run_ocr handler by name; the top-level "quit" statement belongs to the
# script's run handler and is only executed by scpt.run(), not by scpt.call().
scpt = applescript.AppleScript('''
-- main function: run ocr on an infile and save results to an outfile
--   infile / outfile arrive as POSIX path strings and are converted to
--   file references here
-- Any error raised while opening or exporting is re-raised to the caller
-- after the "auto read new pages" setting has been put back.
on run_ocr(infile, outfile)
    set infile to POSIX file infile
    set outfile to POSIX file outfile

    -- identify path to ABBYY FineReader
    set appFile to POSIX file "/Applications/FineReader.app"

    -- set FineReader parameters; these values are FineReader dictionary
    -- terms, so they are resolved inside a "using terms from" block
    using terms from application "FineReader"
        set langList to {English, Latin}
        set saveType to single file
        set toFile to outfile
        set keepPageNumberHeadersAndFootersBoolean to yes
        set keepLineBreaksAndHyphenationBoolean to yes
        set keepImageBoolean to yes
        set imageOptionsImageQualityEnum to high quality
        set keepTextAndBackgroundColorsBoolean to yes
    end using terms from

    -- close any document that is still open from an earlier run
    WaitWhileBusy()
    tell application "FineReader"
        set hasdoc to has document
        if hasdoc then
            close document
        end if
    end tell

    -- switch automatic reading of new pages off; the command's result is
    -- kept and passed back afterwards (presumably the previous setting --
    -- TODO confirm against the FineReader dictionary)
    WaitWhileBusy()
    tell application "FineReader"
        set auto_read to auto read new pages false
    end tell

    try
        tell application "Finder"
            open infile using appFile
        end tell

        -- give FineReader time to start loading before polling its state
        delay 5
        WaitWhileBusy()

        -- the end of line character below is created by pressing OPTION+ENTER
        tell application "FineReader"
            export to html toFile ¬
                ocr languages enum langList ¬
                saving type saveType ¬
                keep line breaks and hyphenation keepLineBreaksAndHyphenationBoolean ¬
                keep page numbers headers and footers keepPageNumberHeadersAndFootersBoolean ¬
                keep pictures keepImageBoolean ¬
                image quality imageOptionsImageQualityEnum ¬
                keep text and background colors keepTextAndBackgroundColorsBoolean
        end tell
        WaitWhileBusy()
    on error errMsg number errNum
        -- do not leave the application with a changed setting on failure;
        -- the restore itself is best-effort so the original error survives
        try
            tell application "FineReader"
                auto read new pages auto_read
            end tell
        end try
        error errMsg number errNum
    end try

    -- close the current file
    tell application "FineReader"
        auto read new pages auto_read
        close document
    end tell
end run_ocr

-- close Abbyy
tell application "FineReader"
    quit
end tell

-- helpers to wait for thread to open up
on WaitWhileBusy()
    repeat while IsMainApplicationBusy()
        -- pause between polls so the loop does not spin at full speed
        delay 0.5
    end repeat
end WaitWhileBusy

on IsMainApplicationBusy()
    tell application "FineReader"
        set resultBoolean to is busy
    end tell
    return resultBoolean
end IsMainApplicationBusy
''')
# Batch driver: OCR every regular file under ./inputs into ./outputs, keeping
# each file's base name. Both directories are resolved against the current
# working directory.
# Sorted so the processing order is reproducible; sub-directories matched by
# the glob are skipped because only files can be opened for OCR.
infiles = sorted(p for p in glob.glob('inputs/*') if os.path.isfile(p))

# The export needs an existing target directory; create it on first use.
outdir = os.path.abspath('outputs')
if not os.path.isdir(outdir):
    os.makedirs(outdir)

for infile in infiles:
    infile = os.path.abspath(infile)
    outfile = os.path.join(outdir, os.path.basename(infile))
    print(' * processing ' + infile)
    scpt.call('run_ocr', infile, outfile)

# scpt.call() only invokes the named handler, so the script's top-level
# "quit" statement has not run yet. Execute the run handler once at the end
# to close FineReader, but only if this batch actually used it.
if infiles:
    scpt.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment