Last active
November 14, 2017 22:05
-
-
Save pmeyerson/fefea9c608bf2de37661e7a0a49f0ebc to your computer and use it in GitHub Desktop.
parse exchange 2010 smtp receive connector logs into csv format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
"""Parse Exchange 2010 SMTP receive connector logs into CSV files.

Every data line of an input log is expected to look like (on one line):

    date-time,connector-id,session-id,sequence-number,local-endpoint,
    remote-endpoint,event,data,context

The data/context fields may themselves contain commas.  The event field
drives the session tracking:

    "+"         session start
    "-"         session end
    ">" / "<"   echo of sent / received traffic
    "*"         explanatory log output (?) -- not recorded by this script

For each log file found under ``logdir`` one ``<name>.csv`` is written to
``outdir`` with one row per session that *completed* in that file.  Sessions
still open at the end of a file are carried over to the next file.
"""
import csv
import os
import sys

logdir = "exhub_logs/"
outdir = "output/"

input_headers = ["date-time", "connector-id", "session-id", "sequence-number",
                 "local-endpoint", "remote-endpoint", "event", "data"]
output_headers = ["date-time", "connector-id", "session-id",
                  "local-endpoint", "remote-endpoint", "data"]

# Lines containing any of these markers are log-file metadata, not events.
_METADATA_MARKERS = ("#Software:", "#Version:", "#Log-type:", "#Date:",
                     "#Fields:")

# Index of the event field; a line needs more than this many fields to be
# usable.
_EVENT_INDEX = 6


def find_log_files(directory):
    """Return the paths of all files below *directory*.

    Files are sorted by name within each directory so that sessions which
    span consecutive log files are seen in order.  Full paths are returned
    so that files found in sub-directories can actually be opened.
    """
    paths = []
    for root, _dirs, files in os.walk(directory):
        for filename in sorted(files):
            paths.append(os.path.join(root, filename))
    return paths


def split_fields(ln):
    """Split one raw log line into its comma separated fields.

    The line terminator (``\\n`` or ``\\r\\n``) and every double quote are
    removed first.  Commas inside the data/context fields therefore produce
    extra trailing fields, which callers keep together as a slice.
    """
    return ln.rstrip("\r\n").replace('"', "").split(",")


def record_event(fields, partialevents, eventlist):
    """Fold one parsed line (more than six fields) into the session dicts.

    partialevents -- sessions that have started but not yet ended
    eventlist     -- sessions completed while parsing the current file

    A session is stored as the first seven fields of its "+" line followed
    by one ``[event, data, context...]`` list per later line.
    """
    session_id = fields[2]
    event = fields[_EVENT_INDEX]
    if "+" in event:
        # First sequence of a new session.
        partialevents[session_id] = fields[0:7]
    elif "-" in event:
        if session_id in partialevents:
            # Closing sequence: the session is complete, move it over.
            session = partialevents.pop(session_id)
            session.append(fields[_EVENT_INDEX:])
            eventlist[session_id] = session
        else:
            # End of a session whose start was never seen; keep the line so
            # it is reported with the incomplete sessions.
            partialevents[session_id] = fields
    elif ">" in event or "<" in event:
        # Traffic echo: only recorded when the session start was seen.
        if session_id in partialevents:
            partialevents[session_id].append(fields[_EVENT_INDEX:])


def parse_lines(lines, partialevents, eventlist, throwout, source=""):
    """Parse an iterable of raw log lines, updating the given dicts in place.

    lines         -- iterable of strings (e.g. an open log file)
    partialevents -- open sessions, shared across files
    eventlist     -- receives the sessions completed by these lines
    throwout      -- receives unusable lines under the key "no_session_id"
    source        -- name of the file being parsed, used in diagnostics
    """
    for raw in lines:
        if any(marker in raw for marker in _METADATA_MARKERS):
            continue  # skip the metadata header lines
        fields = split_fields(raw)
        if fields == [""]:
            continue  # blank line
        ln = ",".join(fields)  # cleaned line, for diagnostics
        if len(fields) > _EVENT_INDEX:
            record_event(fields, partialevents, eventlist)
        elif len(fields) < 3:
            # Too short to even carry a session id.
            print("no session id? + " + str(source) + ln)
            throwout.setdefault("no_session_id", []).append(ln)
        else:
            # Has a session id but stops before the event field.
            print("malformed line (no event field)")
            print(ln + "\n")


def session_to_row(session):
    """Flatten one stored session into a row matching ``output_headers``.

    The data column is every non-empty piece of every recorded event
    (event marker, data, context), stripped and joined by single spaces.
    """
    pieces = []
    for item in session[7:]:
        # Later events are lists; a session rebuilt from an orphaned end
        # line may also carry plain string fields here.
        parts = item if isinstance(item, list) else [item]
        for part in parts:
            part = part.strip()
            if part:
                pieces.append(part)
    return [session[0], session[1], session[2], session[4], session[5],
            " ".join(pieces)]


def write_sessions(path, eventlist):
    """Write the header row and one row per completed session to *path*."""
    with open(path, "w") as csvfile:
        writer = csv.writer(csvfile, lineterminator="\n")
        writer.writerow(output_headers)
        for session in eventlist.values():
            writer.writerow(session_to_row(session))


def main():
    """Convert every log under ``logdir``; return the process exit status."""
    filelist = find_log_files(logdir)
    print("found " + str(len(filelist)) + " log files to parse\n")

    partialevents = {}  # session data spanning two input files
    throwout = {}       # lines that could not be attributed to a session
    session_count = 0

    if filelist and not os.path.isdir(outdir):
        os.makedirs(outdir)

    for path in filelist:
        name = os.path.basename(path)
        print("opening file " + name)
        eventlist = {}  # sessions completed within this file only
        with open(path) as openfile:
            parse_lines(openfile, partialevents, eventlist, throwout,
                        source=name)
        print("finished parsing")

        print("\ngetting ready to write file\n")
        write_sessions(os.path.join(outdir, name + ".csv"), eventlist)

        session_count += len(eventlist)
        print("will start next file with " + str(len(partialevents))
              + " incomplete sessions")

    print("\nfinished wrote " + str(session_count)
          + " smtp sessions across all output files\n")
    print("found " + str(len(partialevents)) + " incomplete sessions")
    print(partialevents)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment