Summarize log files from EDGI Wayback imports
from datetime import timedelta
from pathlib import Path
import re

import dateutil.parser

# Patterns for the pieces of a log file we care about; the log format is
# documented in summarize_log_file() below.
START_LINE = re.compile(r'^\[([^\]]+)\] Starting Internet Archive Import')
END_LINE = re.compile(r'^\s*Internet Archive import completed at (.+)')
SUMMARY_START = re.compile(r'^\s*Loaded (\d+) CDX records:')
SUMMARY_ITEM = re.compile(r'^\s*(\d+)\s([\s\w\-]+)\s\(')
IMPORT_ERRORS = re.compile(r'^\s*Total:\s*(\d+)\serrors')
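# For example, given the sample log line quoted in the comments below,
# START_LINE captures the timestamp in group 1:
#
#   match = START_LINE.match('[Sat Nov 14 03:53:31 UTC 2020] Starting Internet Archive Import')
#   match.group(1)  # -> 'Sat Nov 14 03:53:31 UTC 2020'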
def cleanline(line):
    """
    Strip the trailing newline and the stray commas/periods (see the format
    notes below) from a summary line. Leading whitespace is left alone.
    """
    return line.strip('\n,.')
def summarize_log_file(file):
    start_time = None
    end_time = None
    summary_lines = []
    import_errors = 0
    summary_lines_by_type = {'total': '', 'unknown errors': '', 'successes': ''}
    summary = {'total': 0, 'unknown errors': 0, 'successes': 0}
    ##
    # Logfiles start with a time:
    #   [Sat Nov 14 03:53:31 UTC 2020] Starting Internet Archive Import
    # Then have lots of logs, then a summary like:
    #   Loaded 75244 CDX records:
    #     28491 successes (37.86%),
    #     45655 could not be played back (60.68%),
    #     309 had no actual memento (0.41%),
    #     146 unknown errors (0.19%).
    # (Why did I put those commas and periods in there???)
    # Optionally followed by a list of import errors:
    #   Import job errors:
    #     77314: 3 errors ["Row 616: ..."]
    #     77315: 1 errors ["Row 409: ..."]
    #     77319: 2 errors ["Row 720: ..."]
    #   Total: 6 errors
    # And ending with a time:
    #   Internet Archive import completed at Sat Nov 14 10:31:06 UTC 2020
    mode = 'start'
    for line in file:
        if not start_time:
            start_match = START_LINE.match(line)
            if start_match:
                start_time = dateutil.parser.parse(start_match.group(1))
                mode = 'summary_search'
        elif mode == 'summary_search':
            start_match = SUMMARY_START.match(line)
            if start_match:
                summary_lines.append(cleanline(line))
                summary_lines_by_type['total'] = cleanline(line)
                summary['total'] = int(start_match.group(1))
                mode = 'summary'
        elif mode == 'summary':
            is_summary_line = SUMMARY_ITEM.match(line)
            if is_summary_line:
                summary_lines.append(cleanline(line))
                summary_type = is_summary_line.group(2)
                summary_lines_by_type[summary_type] = cleanline(line)
                summary[summary_type] = int(is_summary_line.group(1))
            else:
                mode = 'end'
        elif mode == 'end':
            total_match = IMPORT_ERRORS.match(line)
            if total_match:
                import_errors = int(total_match.group(1))
                continue
            ##
            end_match = END_LINE.match(line)
            if end_match:
                end_time = dateutil.parser.parse(end_match.group(1).strip())
                break
    ##
    if end_time and start_time:
        total_time = end_time - start_time
    else:
        total_time = timedelta(0)
    ##
    summary['time'] = total_time
    summary['lines'] = summary_lines
    summary['lines_by_type'] = summary_lines_by_type
    summary['import_errors'] = import_errors
    return summary
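# A sketch of the dict summarize_log_file() returns, using the counts from
# the example log quoted in the comments above ('lines' and 'lines_by_type'
# hold the cleaned-up summary text itself):
#
#   {
#       'total': 75244,
#       'successes': 28491,
#       'could not be played back': 45655,
#       'had no actual memento': 309,
#       'unknown errors': 146,
#       'import_errors': 6,
#       'time': timedelta(seconds=23855),  # 6:37:35
#       'lines': ['Loaded 75244 CDX records:', ...],
#       'lines_by_type': {'total': 'Loaded 75244 CDX records:', ...},
#   }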
def summary_block(summary):
    """Format a complete summary as a text block."""
    output_lines = [
        f'Time: {summary["time"]}',
        *summary["lines"],
        f'Import errors: {summary["import_errors"]}'
    ]
    return "\n".join(output_lines)
def summarize_dir(logdir):
    """Get summaries for each file in the directory."""
    files = [logfile for logfile in logdir.iterdir() if logfile.is_file()]
    files.sort()
    summaries = []
    for logfile in files:
        with logfile.open() as file:
            summaries.append((logfile, summarize_log_file(file)))
    return summaries
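# Each entry pairs the log's Path with its summary dict, sorted by filename,
# e.g. [(Path('.../import-2020-11-14.log'), {'total': 75244, ...}), ...].
# (That filename is hypothetical; the table code below assumes names whose
# stem starts with a 7-character prefix followed by the date.)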
def table(headers, rows, delimiter=' ', markdown=False):
    """
    Print a nice table of data. If ``markdown`` is true, print it as a
    Markdown table instead, with every column after the first right-aligned.
    """
    all_rows = [headers, *rows]
    sizes = [max(len(str(row[index])) for row in all_rows)
             for index in range(len(headers))]
    if markdown:
        delimiter = ' | '
        # Markdown header separator row: `---` left-aligns the first column,
        # `--:` right-aligns the rest.
        row = [((sizes[index] - 1) * '-') + ('-' if index == 0 else ':')
               for index, _ in enumerate(headers)]
        all_rows.insert(1, row)
    for row in all_rows:
        text = ''
        for index, value in enumerate(row):
            if index == 0:
                text += str(value).ljust(sizes[index])
            else:
                text += f'{delimiter}{str(value).rjust(sizes[index])}'
        if markdown:
            text = f'| {text} |'
        print(text)
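# A quick illustration with made-up data:
#
#   table(['File', 'Total'], [('a.log', 10), ('b.log', 5)], markdown=True)
#
# prints:
#
#   | File  | Total |
#   | ----- | ----: |
#   | a.log |    10 |
#   | b.log |     5 |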
def normalized_bars(values, size=10):
    """Create a horizontal bar chart as a list of strings."""
    # `or 1` guards against dividing by zero when every value is 0.
    most = max(values) or 1
    return [(round(size * x / most) * '█').ljust(size)
            for x in values]
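# For example:
#   normalized_bars([1, 2, 4], size=4)  # -> ['█   ', '██  ', '████']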
def summary_field(summary, key):
    """
    Get a nice string for one of the summary fields, e.g. '536 (3.21%)'.
    """
    total = summary['total']
    value = summary.get(key, 0)
    if total:
        ratio = 100 * value / total
        return f'{value} ({ratio:.2f}%)'
    else:
        return '-'
def percentage(summary, key):
    """Get a summary field as a percentage string, e.g. '3.21%'."""
    total = summary['total']
    value = summary.get(key, 0)
    if total:
        ratio = 100 * value / total
        return f'{ratio:.2f}%'
    else:
        return '-'
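# For example:
#   percentage({'total': 200, 'successes': 87}, 'successes')  # -> '43.50%'
# and '-' when the total is zero.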
summaries = summarize_dir(Path('/var/log/cron-ia-import/'))

# Print summary blocks
for logfile, summary in summaries:
    print(logfile)
    print(summary_block(summary))
    print('')

# Print simple time summary
for logfile, summary in summaries:
    print(f'{logfile}: {summary["time"]}')
# Fancy table with bars and so on
total_bars = normalized_bars([s['total'] for f, s in summaries], 15)
time_bars = normalized_bars([s['time'].total_seconds() for f, s in summaries], 15)
table(['File', 'Time (bar)', 'Time', 'Total (bar)', 'Total', 'Skipped',
       'Errors', 'No Playback', 'No Memento'],
      [(
          logfile.stem[7:],  # Slice the date out of the filename
          time_bars[index],
          summary['time'],
          total_bars[index],
          summary["total"],
          percentage(summary, 'skipped - already in DB'),
          percentage(summary, 'unknown errors'),
          percentage(summary, 'could not be played back'),
          percentage(summary, 'had no actual memento'),
      )
       for index, (logfile, summary) in enumerate(summaries)],
      markdown=True)
# Bar charts for everything!!!
# A) This is not really that helpful
# B) The way I made the percentage bars is too clever and hard to follow,
#    would not do again.
total_bars = normalized_bars([s['total'] for f, s in summaries], 15)
time_bars = normalized_bars([s['time'].total_seconds() for f, s in summaries], 15)
unknown_error_bars = [" ".join(item) for item in zip(
    [percentage(s, 'unknown errors') for f, s in summaries],
    normalized_bars([s.get('unknown errors', 0) for f, s in summaries])
)]
no_playback_bars = [" ".join(item) for item in zip(
    [percentage(s, 'could not be played back') for f, s in summaries],
    normalized_bars([s.get('could not be played back', 0) for f, s in summaries])
)]
no_memento_bars = [" ".join(item) for item in zip(
    [percentage(s, 'had no actual memento') for f, s in summaries],
    normalized_bars([s.get('had no actual memento', 0) for f, s in summaries])
)]
table(['File', 'Time (bar)', 'Time', 'Total (bar)', 'Total', 'Errors',
       'No Playback', 'No Memento'],
      [(
          logfile.stem[7:],  # Slice the date out of the filename
          time_bars[index],
          summary['time'],
          total_bars[index],
          summary["total"],
          unknown_error_bars[index],   # percentage(summary, 'unknown errors'),
          no_playback_bars[index],     # percentage(summary, 'could not be played back'),
          no_memento_bars[index],      # percentage(summary, 'had no actual memento'),
      )
       for index, (logfile, summary) in enumerate(summaries)])