Skip to content

Instantly share code, notes, and snippets.

@Witiko
Created October 16, 2020 21:53
Show Gist options
  • Save Witiko/1f92c84b030f7ed2e5ff2b67a4710409 to your computer and use it in GitHub Desktop.
Save Witiko/1f92c84b030f7ed2e5ff2b67a4710409 to your computer and use it in GitHub Desktop.
Creates a pie chart from a GNU Parallel joblog after running OCR-D
# -*- coding:utf-8 -*-
from itertools import dropwhile
import json
import re
import sys
import matplotlib.pyplot as plt
# Positional command-line arguments: the joblog file to read and the path
# the finished chart is saved to.
JOBLOG_FILENAME = sys.argv[1]
OUTPUT_FILENAME = sys.argv[2]
# Applied to a whole command line. Captures either the token that follows
# " -O " (everything up to the next space or double quote) in group 1, or
# the literal "ocrd-import" in group 2; the captured text is the key that
# is looked up in LABEL_MAP.
COMMAND_REGEX = re.compile(r'.* -O ([^" ]*).*|.* (ocrd-import) .*')
# Translates the name captured by COMMAND_REGEX into the label shown in the
# chart legend. Several keys may share one label; their runtimes then end up
# in the same pie slice.
LABEL_MAP = {
    "ocrd-import": "ocrd-import",
    "OCR-D-BIN": "olena-binarize",
    "OCR-D-CROP": "anybaseocr-crop",
    "OCR-D-BIN2": "olena-binarize",
    "OCR-D-BIN-DENOISE": "cis-ocropy-denoise",
    "OCR-D-BIN-DENOISE-DESKEW": "cis-ocropy-deskew",
    "OCR-D-SEG-REG": "tesserocr-segment-region",
    "OCR-D-SEG-REPAIR": "segment-repair",
    "OCR-D-SEG-REG-DESKEW": "cis-ocropy-deskew",
    "OCR-D-SEG-REG-DESKEW-CLIP": "cis-ocropy-clip",
    "OCR-D-SEG-LINE": "cis-ocropy-segment",
    "OCR-D-SEG-REPAIR-LINE": "segment-repair",
    "OCR-D-SEG-LINE-RESEG-DEWARP": "cis-ocropy-dewarp",
    "OCR-D-OCR": "calamari-recognize",
    "OCR-D-TEXT": "fileformat-transform",
    "OCR-D-HOCR": "fileformat-transform",
}
def read_joblog(filename):
    """Yield ``(command, runtime, exit_code)`` for every job in a joblog.

    The file is read as tab-separated text. The first line is treated as a
    header and skipped. In each remaining line, field 3 is parsed as the
    runtime, field 6 as the exit code, and the last field is taken as the
    command. An empty file yields nothing, and blank lines are skipped.

    Args:
        filename: Path of the joblog file to read.

    Yields:
        Tuples ``(command, runtime, exit_code)`` in file order, where
        ``runtime`` is a float and ``exit_code`` is an int.

    Raises:
        IndexError: If a non-blank line has fewer than seven fields.
        ValueError: If the runtime or exit code field is not numeric.
    """
    with open(filename, 'rt') as f:
        # Skip the header. The default matters: a bare next() on an empty
        # file raises StopIteration, which inside a generator is turned
        # into RuntimeError.
        next(f, None)
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            fields = line.rstrip('\r\n').split('\t')
            runtime = float(fields[3])
            exit_code = int(fields[6])
            command = fields[-1]
            yield (command, runtime, exit_code)
def evaluate():
    """Group the runtimes of successful jobs by chart label.

    Reads the joblog named by ``JOBLOG_FILENAME``, ignores every job whose
    exit code is non-zero, extracts a name from the command with
    ``COMMAND_REGEX`` and translates it to a label through ``LABEL_MAP``.

    Returns:
        A dict mapping each label to the list of runtimes of the jobs that
        were assigned that label, in joblog order.

    Raises:
        ValueError: If the command of a successful job does not match
            ``COMMAND_REGEX``.
        KeyError: If the extracted name has no entry in ``LABEL_MAP``.
    """
    runtimes = {}
    for command, runtime, exit_code in read_joblog(JOBLOG_FILENAME):
        if exit_code != 0:
            # Failed jobs do not contribute to the chart.
            continue
        match = COMMAND_REGEX.fullmatch(command)
        if match is None:
            # Fail with the offending command instead of an opaque
            # AttributeError on None.
            raise ValueError(
                'command does not match COMMAND_REGEX: %r' % (command,)
            )
        # Only one alternative of the pattern takes part in a match, so
        # exactly one group is not None.
        name = next(group for group in match.groups() if group is not None)
        label = LABEL_MAP[name]
        runtimes.setdefault(label, []).append(runtime)
    return runtimes
def pie_chart(runtimes):
    """Save a pie chart of the total runtime per label to OUTPUT_FILENAME.

    Each label becomes one slice whose size is the sum of that label's
    runtimes. Slices are coloured from the 'tab20' colormap and named in a
    legend rather than on the slices themselves.

    Args:
        runtimes: Mapping from label to an iterable of runtimes.

    Raises:
        ValueError: If ``runtimes`` is empty, since there is nothing to plot.
    """
    if not runtimes:
        raise ValueError('cannot draw a pie chart without any runtimes')
    labels = list(runtimes)
    sizes = [sum(values) for values in runtimes.values()]
    colors = plt.get_cmap('tab20').colors
    # Draw on a dedicated figure so repeated calls do not pile slices onto
    # whatever figure happens to be current in pyplot's global state.
    fig, ax = plt.subplots()
    try:
        patches, _texts = ax.pie(sizes, colors=colors)
        ax.legend(patches, labels, loc='best')
        ax.axis('equal')
        fig.tight_layout()
        fig.savefig(OUTPUT_FILENAME)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
def main():
    """Collect the per-label runtimes and render them as a pie chart."""
    pie_chart(evaluate())
# Run the script only when this file is executed directly.
if __name__ == '__main__':
    main()
@b2m
Copy link

b2m commented Oct 19, 2020

COMMAND_REGEX = re.compile(r'.* -O ([^" ]*).*|.* (ocrd-import) .*')
...
match = re.fullmatch(COMMAND_REGEX, command)
command = list(dropwhile(lambda x: x is None, match.groups()))[0]
label = LABEL_MAP[command]

Just wondering why you need the mapping from output group to command in LABEL_MAP?
The command you are searching for is also in the text and retrievable via regex (r'ocrd process "([^" ]+)|(ocrd-import)').

@Witiko
Copy link
Author

Witiko commented Oct 19, 2020

Just wondering why you need the mapping from output group to command in LABEL_MAP?
The command you are searching for is also in the text and retrievable via regex (r'ocrd process "([^" ]+)|(ocrd-import)').

You are right, we could simplify things and get rid of LABEL_MAP. Then again, the solution with COMMAND_REGEX and LABEL_MAP is more general: it will work for many sorts of joblogs besides OCR-D (in my code base, both LABEL_MAP and COMMAND_REGEX are command-line arguments), and it allows you to name the labels independently of the command names. For example, you could merge cis-ocropy-denoise, cis-ocropy-deskew, cis-ocropy-clip, cis-ocropy-segment, and cis-ocropy-dewarp into a single slice named cis-ocropy:

LABEL_MAP = {
  # [...]
  "OCR-D-BIN-DENOISE": "cis-ocropy",
  "OCR-D-BIN-DENOISE-DESKEW": "cis-ocropy",
  "OCR-D-SEG-REG-DESKEW": "cis-ocropy",
  "OCR-D-SEG-REG-DESKEW-CLIP": "cis-ocropy",
  "OCR-D-SEG-LINE": "cis-ocropy",
  "OCR-D-SEG-LINE-RESEG-DEWARP": "cis-ocropy",
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment