"""
Example usage:
    journalctl --unit=NetworkManager --lines=500 | cut --bytes=72- | python log_markov_chain.py

The `cut` step strips the timestamp from each line, so the logs actually
contain verbatim repetitions of some lines [A, B, C, A, ...].
"""
import collections
import subprocess
import sys

import graphviz
def find_possible_predecessors(lines):
    """Map every line to the set of distinct lines seen directly before it.

    If the result maps ``a`` to ``{b0, b1}``, then ``a`` is known to follow
    ``b0`` or ``b1`` somewhere in *lines*.  A set (not a list) is used so that
    a line which follows the same predecessor several times still counts as
    having exactly one predecessor.  A line that only occurs at the very
    start of *lines* has no entry at all.
    """
    possible_predecessors = {}
    for first_line, second_line in zip(lines, lines[1:]):
        possible_predecessors.setdefault(second_line, set()).add(first_line)
    return possible_predecessors


def combine_lines(lines, possible_predecessors):
    """Glue every line with a single distinct predecessor onto the previous node.

    Walks *lines* in order and returns the list of graph nodes.  A line whose
    entry in *possible_predecessors* holds exactly one element always follows
    the same line, so it is appended (separated by a newline) to the node
    built so far.  Any other line starts a new node.

    Example: the log ``a b c d c`` becomes ``["a\\nb", "c\\nd", "c"]``.
    """
    new_lines = []
    for line in lines:
        # .get(): the first log line may never occur as a successor, in which
        # case it has no entry; it must simply open the first node.
        predecessors = possible_predecessors.get(line, ())
        if new_lines and len(predecessors) == 1:
            # this line always follows the previous one
            # so we can combine the nodes
            new_lines[-1] += "\n" + line
        else:
            # this line has no or multiple predecessors (or is the first one)
            # so we make it a new node
            new_lines.append(line)
    return new_lines


def count_edges(new_lines):
    """Count consecutive node pairs.

    If the result maps ``(a, b)`` to ``n``, then node ``b`` directly follows
    node ``a`` exactly ``n`` times in *new_lines*.
    """
    return collections.Counter(zip(new_lines, new_lines[1:]))


def sanitize(stri):
    """Return *stri* without surrounding whitespace and without colons."""
    # Graphviz doesn't like colons!!
    return stri.strip().replace(":", "")


def main():
    """Read log lines from stdin and render their transition graph as SVG.

    Each edge is drawn with a pen width equal to the number of times that
    transition occurs.  On success the path returned by ``render`` is
    printed; if Graphviz fails, the first entries of the generated DOT body
    are printed instead to help locate the offending text.
    """
    raw_output = sys.stdin.read()
    lines = raw_output.strip().split("\n")

    possible_predecessors = find_possible_predecessors(lines)
    new_lines = combine_lines(lines, possible_predecessors)
    edges = count_edges(new_lines)

    dot = graphviz.Digraph()
    for (msg0, msg1), count in edges.most_common():
        dot.edge(sanitize(msg0), sanitize(msg1), penwidth=str(count))

    try:
        print("Open", dot.render(engine="dot", format="svg"))
    except graphviz.backend.execute.CalledProcessError:
        print("Some graphviz illegal character. Check this output:")
        print("".join(dot.body[:10]))


if __name__ == "__main__":
    main()