Created
May 1, 2018 14:55
-
-
Save lawrencejones/93fd99d2ec4f2ab52d9d55d0323d74fd to your computer and use it in GitHub Desktop.
Use the Jaccard index to identify log lines that are similar to one another
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require "json" | |
=begin | |
[2018-04-30T00:00:57,316][WARN ][logstash.outputs.elasticsearch] Failed action. {:status=>400, :action=>["index", {:_id=>nil, :_index=>"logstash-2018.04.30", :_type=>"logs", :_routing=>nil}, 2018-04-30T00:00:57.000Z app01 usage_tracker], :response=>{"index"=>{"_index"=>"logstash-2018.04.30", "_type"=>"logs", "_id"=>"AWMT2jPQZUVqEe-MfAiF", "status"=>400, "error"=>{"type"=>"mapper_parsing_exception", "reason"=>"failed to parse [id]", "caused_by"=>{"type"=>"number_format_exception", "reason"=>"For input string: \"PM00096PW1GVDT\""}}}}}) | |
=end | |
# Similarity cut-off: the first command-line argument when one is given,
# otherwise "0.7". String#to_f is lenient, so non-numeric text becomes 0.0
# rather than raising.
cutoff_arg = ARGV.fetch(0, "0.7")
threshold = cutoff_arg.to_f
puts "Clustering with >#{threshold} Jaccard index...\n"
# Finds the first element of +entries+ that is similar enough to +entry+.
#
# @param threshold [Float] score a candidate must strictly exceed
# @param entry [String] the string being classified
# @param entries [Enumerable<String>] candidates, checked in iteration order
# @return [String, nil] the first candidate whose jaccard_index against
#   +entry+ is greater than +threshold+, or nil when none qualifies
def find_similar(threshold, entry, entries)
  entries.each do |candidate|
    return candidate if jaccard_index(entry, candidate) > threshold
  end
  nil
end
# Computes the Jaccard index of two strings: the number of distinct word
# tokens they share divided by the number of distinct word tokens that
# appear in either.
#
# Tokens are maximal runs of word characters (/\w+/). Scanning for words,
# instead of splitting on every non-word character, avoids producing
# empty-string tokens between adjacent punctuation; such tokens would count
# as a shared element and inflate the score of otherwise unrelated strings.
#
# @param a [String]
# @param b [String]
# @return [Float] a value in 0.0..1.0; 0.0 when neither string contains any
#   word token (instead of the NaN that 0 / 0.0 would produce)
def jaccard_index(a, b)
  as = a.scan(/\w+/)
  bs = b.scan(/\w+/)

  # Array#| and Array#& already de-duplicate, so these are set sizes.
  union_size = (as | bs).size
  return 0.0 if union_size.zero?

  (as & bs).size / union_size.to_f
end
# Cluster the lines read from STDIN. Only the part of each line starting at
# ":response=>" is considered; lines without it are skipped. Each extracted
# fragment is counted under the first existing cluster key that find_similar
# accepts, or becomes a new cluster key itself.
line_counts = Hash.new(0)
STDIN.each_line do |raw_line|
  response = raw_line[/:response\=>.+/]
  next if response.nil?

  representative = find_similar(threshold, response, line_counts.keys) || response
  line_counts[representative] += 1
end
# Summarise the clusters by the first bracketed, whitespace-free token found
# in each cluster's sample line; samples without such a token are left out.
problematic_keys = line_counts.keys.map { |sample| sample[/\[\S+\]/] }.compact.sort
puts("The problematic keys are: \n", problematic_keys, "\n")
# Print every cluster, largest count first: its size, the first bracketed
# token of its sample (blank when there is none), and the sample itself.
line_counts.sort_by { |_sample, occurrences| occurrences }.reverse_each do |sample, occurrences|
  key = sample[/\[\S+\]/]
  puts(<<~MSG)
    count=#{occurrences}
    key=#{key}
    sample=#{sample}
  MSG
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment