Skip to content

Instantly share code, notes, and snippets.

@lovasoa
Created May 6, 2017 16:48
Show Gist options
  • Save lovasoa/5d1ec19ffd713f1578a13218f1231200 to your computer and use it in GitHub Desktop.
Save lovasoa/5d1ec19ffd713f1578a13218f1231200 to your computer and use it in GitHub Desktop.
Find optimal parameters for tesseract OCR, given a set of input files and expected output.
#!/usr/bin/env python3
import Levenshtein
import scipy.optimize
import subprocess
import glob
import sys
# Ask tesseract to print its parameters. Each useful output line is expected
# to have three tab-separated columns: name, value, description.
p = subprocess.Popen(["tesseract", "--print-parameters"], stdout=subprocess.PIPE)
args, _ = p.communicate()

# Parallel lists: args_name[i] is a parameter name and args_vec[i] is the
# numeric value tesseract printed for it. args_vec is the optimizer's
# starting point.
args_name = []
args_vec = []
for argline in args.decode().split("\n"):
    try:
        argname, val, desc = argline.split("\t")
        val = float(val)
    except ValueError:
        # Either the line does not have exactly three columns (header or
        # blank line) or the value is not numeric; such parameters cannot be
        # optimized numerically, so skip them. Only ValueError is caught so
        # that unrelated failures (e.g. Ctrl-C) are no longer swallowed.
        continue
    # Parameters with "debug" in their name are excluded from the search.
    if "debug" not in argname:
        args_name.append(argname)
        args_vec.append(val)

# Expected OCR output: every reference text file, joined with newlines.
# The paths are sorted because glob.glob() returns them in arbitrary,
# filesystem-dependent order, and the concatenation order must be stable and
# name-based for the comparison to be meaningful.
_reference_texts = []
for _path in sorted(glob.glob("test_data/*txt")):
    # Context manager so each file handle is closed promptly.
    with open(_path) as _handle:
        _reference_texts.append(_handle.read())
objective_text = "\n".join(_reference_texts)

# Any command-line arguments given to this script are kept verbatim so they
# can be forwarded to tesseract.
additional_args = sys.argv[1:]
def eval(vec):
    """Score a vector of tesseract parameter values against the reference text.

    Runs tesseract on every ``test_data/*png`` image with each value in
    *vec* applied to the parameter of the same index in ``args_name``,
    concatenates the recognized text, prints it, and returns its Levenshtein
    distance to ``objective_text`` (smaller means closer to the reference).

    NOTE: this function shadows the builtin ``eval``. The name is kept
    because the rest of the script refers to it.

    Args:
        vec: sequence of numeric parameter values, aligned with ``args_name``.

    Returns:
        The edit distance between the OCR output and ``objective_text``.
    """
    # The "-c name=value" overrides are identical for every image, so build
    # them once instead of once per image.
    overrides = []
    for nameval in zip(args_name, vec):
        overrides.append("-c")
        overrides.append("%s=%f" % nameval)

    # Start every tesseract process before reading any output so they run in
    # parallel. Images are sorted by path because glob.glob() returns them in
    # arbitrary order, and the concatenated output must follow a stable,
    # name-based order.
    processes = [
        subprocess.Popen(
            ["tesseract", image, "stdout"] + additional_args + overrides,
            stdout=subprocess.PIPE,
        )
        for image in sorted(glob.glob("test_data/*png"))
    ]

    # Collect the outputs in launch order; join once rather than growing a
    # string with repeated +=.
    decoded = "".join(proc.communicate()[0].decode() for proc in processes)
    print(decoded)
    return Levenshtein.distance(decoded, objective_text)
# Minimize the OCR error with Powell's method, starting from the parameter
# values tesseract reported. Both the iteration count and the number of
# objective evaluations are capped at 1e4.
result = scipy.optimize.fmin_powell(
    eval,
    args_vec,
    maxiter=1e4,
    maxfun=1e4,
)
# Print the parameter vector returned by the optimizer.
print(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment