Created
March 28, 2014 14:58
-
-
Save correl/9834783 to your computer and use it in GitHub Desktop.
Aggregate similar log file entries matching a search pattern
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import sys

try:
    from itertools import ifilter  # Python 2
except ImportError:  # Python 3: the builtin filter is already lazy
    ifilter = filter

from fuzzywuzzy import fuzz
# Minimum fuzzy-match score a line must reach against an existing group key
# to be filed under that group; the -t/--threshold CLI option overrides it.
MATCH_THRESHOLD = 90
def parse_file(filename, predicate=None):
    """Group the similar lines of the file at *filename*.

    The file is opened for reading and its lines (line endings included) are
    handed to ``aggregate_errors`` together with *predicate*.

    Args:
        filename: Path of the file to read.
        predicate: Optional callable taking a line; when given, only lines
            for which it returns a true value are aggregated.

    Returns:
        Whatever ``aggregate_errors`` returns for the file's lines.
    """
    # Aggregate while the file is still open: the lines are consumed lazily.
    with open(filename) as f:
        return aggregate_errors(f, predicate)
def aggregate_errors(lines, predicate=None):
    """Fold *lines* into a dict of groups of similar strings.

    Args:
        lines: Iterable of strings to aggregate.
        predicate: Optional callable taking a line; when given (and truthy),
            lines for which it returns a false value are skipped.

    Returns:
        The accumulator built by passing every selected line, in order,
        through ``store_similar``, starting from an empty dict.
    """
    if predicate:
        selected = (line for line in lines if predicate(line))
    else:
        selected = lines

    groups = {}
    for line in selected:
        # Keep using the returned accumulator, as a fold would.
        groups = store_similar(groups, line)
    return groups
def store_similar(acc, string):
    """File *string* under the most similar existing group, or start a new one.

    Args:
        acc: Dict mapping a group's key (the first string stored in that
            group) to a list of ``(ratio, string)`` tuples. Mutated in place.
        string: The string to store.

    Returns:
        The same *acc* object, so the function can be used as a fold step.
    """
    match = find_similar(acc, string)
    if match:
        acc[match["key"]].append((match["ratio"], string))
    else:
        # No group is close enough: the string becomes the key of a new
        # group and is recorded as a 100% match with itself.
        acc[string] = [(100, string)]
    return acc
def find_similar(acc, string):
    """Find the key in *acc* that best matches *string*.

    Each key is scored with ``fuzz.token_set_ratio``; keys scoring below
    ``MATCH_THRESHOLD`` are discarded.

    Args:
        acc: Dict (or other iterable) of candidate key strings.
        string: The string to compare against every key.

    Returns:
        ``{"ratio": score, "key": key}`` for the highest-scoring key, or
        ``None`` when no key reaches the threshold. Equal scores are resolved
        in favour of the greater key, i.e. by comparing ``(score, key)``
        tuples.
    """
    best = None
    for key in acc:
        candidate = (fuzz.token_set_ratio(key, string), key)
        if candidate[0] < MATCH_THRESHOLD:
            continue
        # Single pass instead of sorting: only the maximum is needed.
        if best is None or candidate > best:
            best = candidate

    if best is None:
        return None
    return {"ratio": best[0], "key": best[1]}
# Command-line entry point: aggregate the lines of FILENAME that contain
# SEARCHSTRING and print one "<count> <group key>" row per group.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("searchstring")
    parser.add_argument("filename")
    parser.add_argument("-t", "--threshold",
                        type=int,
                        help="Fuzzy match percentage threshold")
    args = parser.parse_args()
    # Compare against None rather than truthiness so that an explicit
    # "-t 0" is honoured instead of being mistaken for "not given".
    if args.threshold is not None:
        MATCH_THRESHOLD = args.threshold
    aggregated = parse_file(args.filename,
                            lambda line: args.searchstring in line)
    for entry, matches in aggregated.items():
        # Group keys are raw file lines; drop the line ending so print()
        # does not emit a blank line after every row.
        print("{:<10} {}".format(len(matches), entry.rstrip("\r\n")))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment