-
-
Save Witiko/1f92c84b030f7ed2e5ff2b67a4710409 to your computer and use it in GitHub Desktop.
# -*- coding:utf-8 -*- | |
from itertools import dropwhile | |
import json | |
import re | |
import sys | |
import matplotlib.pyplot as plt | |
# Positional command-line arguments, read once at import time:
#   argv[1] -- path of the joblog to read,
#   argv[2] -- path the rendered chart is saved to.
JOBLOG_FILENAME = sys.argv[1]
OUTPUT_FILENAME = sys.argv[2]
# Extracts the key used to look a job up in LABEL_MAP.  The first alternative
# captures the token following " -O " in the command line; the second captures
# the literal word "ocrd-import" when it is surrounded by spaces.
COMMAND_REGEX = re.compile(r'.* -O ([^" ]*).*|.* (ocrd-import) .*')

# Maps the text captured by COMMAND_REGEX to the label shown in the chart.
# Several keys may share one label; their runtimes then end up in one slice.
LABEL_MAP = {
    "ocrd-import": "ocrd-import",
    "OCR-D-BIN": "olena-binarize",
    "OCR-D-CROP": "anybaseocr-crop",
    "OCR-D-BIN2": "olena-binarize",
    "OCR-D-BIN-DENOISE": "cis-ocropy-denoise",
    "OCR-D-BIN-DENOISE-DESKEW": "cis-ocropy-deskew",
    "OCR-D-SEG-REG": "tesserocr-segment-region",
    "OCR-D-SEG-REPAIR": "segment-repair",
    "OCR-D-SEG-REG-DESKEW": "cis-ocropy-deskew",
    "OCR-D-SEG-REG-DESKEW-CLIP": "cis-ocropy-clip",
    "OCR-D-SEG-LINE": "cis-ocropy-segment",
    "OCR-D-SEG-REPAIR-LINE": "segment-repair",
    "OCR-D-SEG-LINE-RESEG-DEWARP": "cis-ocropy-dewarp",
    "OCR-D-OCR": "calamari-recognize",
    "OCR-D-TEXT": "fileformat-transform",
    "OCR-D-HOCR": "fileformat-transform",
}
def read_joblog(filename):
    """Yield one ``(command, runtime, exit_code)`` tuple per job in a joblog.

    The file is read as tab-separated text whose first line is a header and
    is skipped.  For every following non-empty line, field 3 is parsed as the
    runtime (``float``), field 6 as the exit code (``int``) and the last
    field is returned as the command string.

    Args:
        filename: Path of the joblog file to read.

    Yields:
        Tuples ``(command, runtime, exit_code)`` in file order.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-empty line has fewer than seven fields.
        ValueError: If the runtime or exit-code field is not numeric.
    """
    with open(filename, 'rt') as f:
        lines = iter(f)
        # Skip the header.  The default keeps an empty file from raising
        # StopIteration inside the generator, which Python would otherwise
        # turn into a RuntimeError.
        next(lines, None)
        for line in lines:
            line = line.rstrip('\r\n')
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # Assumes the nine-column GNU Parallel layout with the command
            # last -- TODO confirm for other joblog producers.  Limiting the
            # split to eight tabs keeps commands that contain tabs intact.
            fields = line.split('\t', 8)
            runtime = float(fields[3])
            exit_code = int(fields[6])
            command = fields[-1]
            yield (command, runtime, exit_code)
def evaluate():
    """Group the runtimes of successful jobs by chart label.

    Reads the joblog named by JOBLOG_FILENAME, ignores jobs whose exit code
    is non-zero, extracts the first captured group of COMMAND_REGEX from
    each command and translates it to a label through LABEL_MAP.

    Returns:
        A dict mapping each label to the list of runtimes recorded for it,
        in joblog order.
    """
    grouped = {}
    for cmdline, seconds, status in read_joblog(JOBLOG_FILENAME):
        if status == 0:
            found = re.fullmatch(COMMAND_REGEX, cmdline)
            # Exactly one alternative of the pattern matches, so keep the
            # first group that actually captured something.
            captured = [group for group in found.groups() if group is not None]
            grouped.setdefault(LABEL_MAP[captured[0]], []).append(seconds)
    return grouped
def pie_chart(runtimes):
    """Draw the summed runtime of every label as a pie chart and save it.

    Args:
        runtimes: Dict mapping a label to a list of runtimes; each slice is
            sized by the sum of its list.

    The figure is written to OUTPUT_FILENAME.
    """
    labels, per_label = zip(*runtimes.items())
    totals = [sum(values) for values in per_label]
    wedges, _ = plt.pie(totals, colors=plt.get_cmap('tab20').colors)
    # Labels go into a legend rather than onto the slices themselves.
    plt.legend(wedges, labels, loc='best')
    plt.axis('equal')
    plt.tight_layout()
    plt.savefig(OUTPUT_FILENAME)
def main():
    """Collect the per-label runtimes and render them as a pie chart."""
    pie_chart(evaluate())


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
COMMAND_REGEX = re.compile(r'.* -O ([^" ]*).*|.* (ocrd-import) .*')
...
match = re.fullmatch(COMMAND_REGEX, command)
command = list(dropwhile(lambda x: x is None, match.groups()))[0]
label = LABEL_MAP[command]
Just wondering why you need the mapping from output group to command in LABEL_MAP
?
The command
you are searching for is also in the text and retrievable via regex (r'ocrd process "([^" ]+)|(ocrd-import)'
).
Just wondering why you need the mapping from output group to command in
LABEL_MAP
?
The command
you are searching for is also in the text and retrievable via regex (r'ocrd process "([^" ]+)|(ocrd-import)'
).
You are right, we could simplify things and get rid of LABEL_MAP
. Then again, the solution with COMMAND_REGEX
and LABEL_MAP
is more general: it will work for many sorts of joblogs besides OCR-D (in my code base, both LABEL_MAP
and COMMAND_REGEX
are command-line arguments), and allows you to name the labels independently of the command names. For example, you could merge cis-ocropy-clip
, cis-ocropy-segment
, and cis-ocropy-dewarp
into a single slice named cis-ocropy
:
LABEL_MAP = {
# [...]
"OCR-D-BIN-DENOISE": "cis-ocropy",
"OCR-D-BIN-DENOISE-DESKEW": "cis-ocropy",
"OCR-D-SEG-REG-DESKEW": "cis-ocropy",
"OCR-D-SEG-REG-DESKEW-CLIP": "cis-ocropy",
"OCR-D-SEG-LINE": "cis-ocropy",
"OCR-D-SEG-LINE-RESEG-DEWARP": "cis-ocropy",
}
A GNU Parallel joblog may look like this:
You would run the gist as follows:
After running the gist, you would end up with this beautiful pie chart: