Skip to content

Instantly share code, notes, and snippets.

@jbaiter
Last active June 12, 2017 12:57
Show Gist options
  • Save jbaiter/198be6e2c86c030415fb218e807c37b2 to your computer and use it in GitHub Desktop.
Save jbaiter/198be6e2c86c030415fb218e807c37b2 to your computer and use it in GitHub Desktop.
def align(truth_lines, ocr_lines):
    """Greedily align ground-truth lines with OCR lines, preserving order.

    For every line in ``truth_lines`` the OCR lines that follow the most
    recently matched OCR line are scanned and the one with the lowest
    normalized error, ``levenshtein(truth, ocr) / len(truth)``, is selected.
    The pair is accepted when that error is below 0.5; subsequent truth
    lines are then only compared against OCR lines after the accepted one.

    NOTE(review): ``levenshtein`` and ``logger`` are expected to be defined
    at module level; ``levenshtein`` is presumably an edit-distance function
    returning a non-negative number -- confirm against the import.

    :param truth_lines: iterable of ground-truth text lines
    :param ocr_lines:   sequence of OCR text lines (needs ``len`` and
                        integer indexing)
    :returns:           tuple ``(aligned, nonaligned)`` where ``aligned`` is
                        a list of ``(ocr_index, truth_line)`` pairs and
                        ``nonaligned`` is the list of truth lines for which
                        no acceptable OCR line was found
    """
    aligned = []
    nonaligned = []
    # Index of the first OCR line that is still eligible for matching.
    # Starting at 0 makes the very first OCR line a candidate as well.
    search_start = 0
    for truth_line in truth_lines:
        # Normalize by at least 1 so an empty truth line cannot trigger a
        # division by zero; float() keeps the ratio fractional on Python 2.
        norm = float(max(len(truth_line), 1))
        best_error = 1.0
        # Stays None unless some candidate beats the initial error of 1.0.
        best_align = None
        for idx in range(search_start, len(ocr_lines)):
            error = levenshtein(truth_line, ocr_lines[idx]) / norm
            if error < best_error:
                best_error = error
                best_align = idx
        # best_error can only drop below 0.5 if a candidate was recorded,
        # so best_align is a valid index inside this branch.
        if best_error < 0.5:
            aligned.append((best_align, truth_line))
            search_start = best_align + 1
            logger.debug(
                u"Matched `{}` <-> `{}` ({})".format(
                    truth_line, ocr_lines[best_align], best_error))
        else:
            nonaligned.append(truth_line)
            if best_align is not None:
                logger.debug(
                    u"Could not find match for '{}', closest candidate "
                    u"was '{}' with an error of {}"
                    .format(truth_line, ocr_lines[best_align], best_error))
            else:
                logger.debug(
                    u"Could not find match for '{}'".format(truth_line))
    return aligned, nonaligned
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment