Created
June 29, 2015 12:00
-
-
Save benbramley/18f941592e25e0816aad to your computer and use it in GitHub Desktop.
Fuzzy lookup in Splunk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv,sys | |
from fuzzywuzzy import fuzz | |
# File name of the lookup table; main() opens it under '../lookups/'.
LOOKUPCSV = 'somelookup.csv'
# Similarity cut-off: a lookup row is accepted only when fuzz.ratio()
# returns a score strictly greater than this value.
RATIOTHRESHOLD = 80
def output_results(results, mvdelim='\n', output=sys.stdout):
    """Write a list of result dictionaries as CSV for the Splunk pipeline.

    Every key that appears in any result becomes a column.  Columns are
    written in sorted order, preceded by a header row; a row that lacks a
    column gets an empty cell for it.

    List values are treated as multivalue fields: the list is replaced by
    its items joined with ``mvdelim`` and a companion ``'__mv_' + key``
    entry holding the ``encode_mv`` form is added.  Both changes are made
    in place on the dictionaries in ``results``.

    Args:
        results: list of dicts, one per output row.
        mvdelim: separator used to flatten a list value into one string.
        output: writable file-like object that receives the CSV.
    """
    fields = set()
    for result in results:
        # Walk a snapshot of the items: inserting the '__mv_' entries while
        # iterating the live dict raises RuntimeError on Python 3.
        for key, value in list(result.items()):
            if isinstance(value, list):
                result['__mv_' + key] = encode_mv(value)
                result[key] = mvdelim.join(value)
        fields.update(result)

    # A stable, sorted column order keeps the output deterministic.
    fields = sorted(fields)
    writer = csv.DictWriter(output, fields)
    writer.writeheader()
    writer.writerows(results)
def encode_mv(vals):
    """Encode a sequence of strings in the multivalue format.

    Each value is wrapped in '$' and the wrapped values are separated
    by ';'.  A literal '$' inside a value is written as '$$'.  An empty
    sequence produces an empty string.
    """
    wrapped = ['$' + item.replace('$', '$$') + '$' for item in vals]
    return ';'.join(wrapped)
def fuzzylookup(key,value,csvfile,threshold):
    """Return the first lookup row whose ``key`` column fuzzily matches ``value``.

    ``csvfile`` is rewound and read as CSV with a header row.  A row
    matches when ``fuzz.ratio(value, row[key])`` is strictly greater than
    ``threshold``.  The matching row is returned as a dict; None is
    returned when no row matches.
    """
    # Rewind so a handle that has already been read can be scanned again.
    csvfile.seek(0)
    rows = csv.DictReader(csvfile)
    matches = (row for row in rows if fuzz.ratio(value, row[key]) > threshold)
    return next(matches, None)
def main(input, output, argv):
    """Fuzzy-match each incoming row against the lookup CSV and emit the hits.

    Args:
        input: file-like object holding the CSV request (header row
            followed by one row per value to look up).
        output: file-like object the CSV results are written to.
        argv: command-line arguments; not used.
    """
    csv_in = csv.DictReader(input)
    threshold = RATIOTHRESHOLD
    matched = []
    # The context manager closes the lookup file even if a lookup raises.
    with open('../lookups/' + LOOKUPCSV) as csvfile:
        for row in csv_in:
            # Splunk sends all fields but only the one being looked up has
            # a value; if several are populated, the last one is used.
            populated = [k for k, v in row.items() if v]
            if not populated:
                # Nothing to look up.  Without this guard the first such
                # row raises NameError and later ones reuse a stale field.
                continue
            key = populated[-1]
            value = row[key]

            # Fuzzy lookup
            found = fuzzylookup(key, value, csvfile, threshold)
            if found:
                # Report the value as it was sent, not as stored in the lookup.
                found[key] = value
                matched.append(found)
    # Write to the stream the caller supplied rather than always sys.stdout.
    output_results(matched, output=output)
# When executed as a script, hand the process's standard streams and
# command-line arguments to main().
if __name__ == '__main__':
    main(input=sys.stdin, output=sys.stdout, argv=sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment