-
-
Save Witiko/1f92c84b030f7ed2e5ff2b67a4710409 to your computer and use it in GitHub Desktop.
| # -*- coding:utf-8 -*- | |
| from itertools import dropwhile | |
| import json | |
| import re | |
| import sys | |
| import matplotlib.pyplot as plt | |
# Command-line arguments: the joblog to read and the image file to write.
# NOTE: these are read at import time, so the module needs both arguments.
JOBLOG_FILENAME = sys.argv[1]
OUTPUT_FILENAME = sys.argv[2]

# Extracts a lookup key from a job's command line: either the token that
# follows " -O " (first group) or the literal "ocrd-import" (second group).
# Exactly one of the two groups is set on a successful match.
COMMAND_REGEX = re.compile(r'.* -O ([^" ]*).*|.* (ocrd-import) .*')

# Maps the key captured by COMMAND_REGEX to the label shown in the pie chart.
# Several keys may share one label; their runtimes then end up in one slice.
LABEL_MAP = {
    "ocrd-import": "ocrd-import",
    "OCR-D-BIN": "olena-binarize",
    "OCR-D-CROP": "anybaseocr-crop",
    "OCR-D-BIN2": "olena-binarize",
    "OCR-D-BIN-DENOISE": "cis-ocropy-denoise",
    "OCR-D-BIN-DENOISE-DESKEW": "cis-ocropy-deskew",
    "OCR-D-SEG-REG": "tesserocr-segment-region",
    "OCR-D-SEG-REPAIR": "segment-repair",
    "OCR-D-SEG-REG-DESKEW": "cis-ocropy-deskew",
    "OCR-D-SEG-REG-DESKEW-CLIP": "cis-ocropy-clip",
    "OCR-D-SEG-LINE": "cis-ocropy-segment",
    "OCR-D-SEG-REPAIR-LINE": "segment-repair",
    "OCR-D-SEG-LINE-RESEG-DEWARP": "cis-ocropy-dewarp",
    "OCR-D-OCR": "calamari-recognize",
    "OCR-D-TEXT": "fileformat-transform",
    "OCR-D-HOCR": "fileformat-transform",
}
def read_joblog(filename):
    """Yield ``(command, runtime, exit_code)`` for every job in a joblog.

    The file is expected to be tab-separated with one header line, which is
    skipped. For each remaining non-blank line, field 3 is parsed as the
    runtime (``float``), field 6 as the exit code (``int``), and the last
    field is taken verbatim as the command.

    An empty file yields nothing, and blank lines are ignored.

    Raises:
        IndexError: if a non-blank line has fewer than seven fields.
        ValueError: if the runtime or exit-code field is not numeric.
    """
    with open(filename, 'rt') as f:
        lines = iter(f)
        # Skip the header. The default keeps an empty file from raising
        # StopIteration inside this generator (a RuntimeError per PEP 479).
        next(lines, None)
        for line in lines:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at EOF.
                continue
            fields = line.rstrip('\r\n').split('\t')
            runtime = float(fields[3])
            exit_code = int(fields[6])
            command = fields[-1]
            yield (command, runtime, exit_code)
def evaluate():
    """Collect the runtimes of successful jobs, grouped by chart label.

    Reads the joblog named by ``JOBLOG_FILENAME``, ignores jobs with a
    non-zero exit code, extracts a key from each command with
    ``COMMAND_REGEX`` and translates it to a label through ``LABEL_MAP``.

    Returns:
        A dict mapping each label to the list of runtimes recorded for it.

    Raises:
        ValueError: if a successful job's command does not match
            ``COMMAND_REGEX``.
        KeyError: if the extracted key is missing from ``LABEL_MAP``.
    """
    runtimes = {}
    for command, runtime, exit_code in read_joblog(JOBLOG_FILENAME):
        if exit_code != 0:
            # Failed jobs do not contribute to the chart.
            continue
        match = COMMAND_REGEX.fullmatch(command)
        if match is None:
            # Fail with the offending command instead of an opaque
            # AttributeError on ``None.groups()``.
            raise ValueError(
                'Command does not match COMMAND_REGEX: {!r}'.format(command))
        # The pattern is an alternation, so take the first group that
        # actually took part in the match.
        key = next(group for group in match.groups() if group is not None)
        runtimes.setdefault(LABEL_MAP[key], []).append(runtime)
    return runtimes
def pie_chart(runtimes):
    """Draw the total runtime per label as a pie chart and save it.

    ``runtimes`` maps each label to a sequence of runtimes. Every slice is
    sized by the sum of its label's runtimes, and the labels are shown in a
    legend. The figure is written to ``OUTPUT_FILENAME``.
    """
    labels, runtimes_per_label = zip(*runtimes.items())
    totals = [sum(values) for values in runtimes_per_label]
    palette = plt.get_cmap('tab20').colors
    wedges, _ = plt.pie(totals, colors=palette)
    plt.legend(wedges, labels, loc='best')
    # Equal aspect ratio keeps the pie circular.
    plt.axis('equal')
    plt.tight_layout()
    plt.savefig(OUTPUT_FILENAME)
def main():
    """Aggregate the joblog runtimes and render them as a pie chart."""
    pie_chart(evaluate())


if __name__ == '__main__':
    main()
COMMAND_REGEX = re.compile(r'.* -O ([^" ]*).*|.* (ocrd-import) .*')
...
match = re.fullmatch(COMMAND_REGEX, command)
command = list(dropwhile(lambda x: x is None, match.groups()))[0]
label = LABEL_MAP[command]

Just wondering why you need the mapping from output group to command in LABEL_MAP?
The command you are searching for is also in the text and retrievable via regex (r'ocrd process "([^" ]+)|(ocrd-import)').
Just wondering why you need the mapping from output group to command in
LABEL_MAP?
The command you are searching for is also in the text and retrievable via regex (r'ocrd process "([^" ]+)|(ocrd-import)').
You are right, we could simplify things and get rid of LABEL_MAP. Then again, the solution with COMMAND_REGEX and LABEL_MAP is more general: it will work for many sorts of joblogs besides OCR-D (in my code base, both LABEL_MAP and COMMAND_REGEX are command-line arguments), and it allows you to name the labels independently of the command names. For example, you could merge cis-ocropy-clip, cis-ocropy-segment, and cis-ocropy-dewarp into a single slice named cis-ocropy:
LABEL_MAP = {
# [...]
"OCR-D-BIN-DENOISE": "cis-ocropy",
"OCR-D-BIN-DENOISE-DESKEW": "cis-ocropy",
"OCR-D-SEG-REG-DESKEW": "cis-ocropy",
"OCR-D-SEG-REG-DESKEW-CLIP": "cis-ocropy",
"OCR-D-SEG-LINE": "cis-ocropy",
"OCR-D-SEG-LINE-RESEG-DEWARP": "cis-ocropy",
}
A GNU Parallel joblog may look like this:
You would run the gist as follows:
After running the gist, you would end up with this beautiful pie chart: