Last active
December 20, 2015 15:59
-
-
Save mbrownnycnyc/6158144 to your computer and use it in GitHub Desktop.
python script that parses the output of the argus-client radump()'s dns printer and inserts interesting things into a DB. See DB definition at https://gist.github.com/mbrownnycnyc/6083357
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
'''
Parse the output of the argus-client radump()'s dns printer on stdin and
insert interesting things into a MySQL DB.
See the DB definition at https://gist.github.com/mbrownnycnyc/6083357

You must create a config file with:
grep -v ^# /root/.rarc | grep -v ^$ > ~/for_dnsdb.rarc && if grep ^RA_TIME_FORMAT ~/for_dnsdb.rarc > /dev/null ; then sed s/^RA_TIME_FORMAT/#RA_TIME_FORMAT/g -i ~/for_dnsdb.rarc && echo -e "RA_TIME_FORMAT=\"%Y-%m-%d %T.%f\"\nRA_PRINT_LABELS=-1\nRA_FIELD_DELIMITER='^'" >> ~/for_dnsdb.rarc ; fi

expecting the input to be from:
radump -F ~/for_dnsdb.rarc -r argus_10\:00\:00.gz -s seq ltime saddr daddr suser:1024 duser:1024 - port domain
or
radump -F ~/for_dnsdb.rarc -S 127.0.0.1:561 -s seq ltime saddr daddr suser:1024 duser:1024 - port domain

as stdin like...
radump -F ~/for_dnsdb.rarc -S 127.0.0.1:561 -s seq ltime saddr daddr suser:1024 duser:1024 - port domain | radump_to_dns_db.py
or to drop to the background without any output:
nohup radump -F ~/for_dnsdb.rarc -S 127.0.0.1:561 -s seq ltime saddr daddr suser:1024 duser:1024 - port domain | python -u radump_to_dns_db.py &> /dev/null &
or to drop the background and save all output to a log file (watch out for reportrecordcount boolean):
nohup radump -F ~/for_dnsdb.rarc -S 127.0.0.1:561 -s seq ltime saddr daddr suser:1024 duser:1024 - port domain | python -u radump_to_dns_db.py &> /var/log/radump2dnsdb.log &
or to drop to the background saving only stderr to a log file:
nohup radump -F ~/for_dnsdb.rarc -S 127.0.0.1:561 -s seq ltime saddr daddr suser:1024 duser:1024 - port domain | python -u radump_to_dns_db.py 1> /dev/null 2> /var/log/radump2dnsdb.log &

considers:
- no labels (aka `-L -1` or `RA_PRINT_LABELS=-1`)
- field separator is '^' (`-c ^` or `RA_FIELD_DELIMITER='^'`)
- time format is %Y-%m-%d %T.%f (as RA_TIME_FORMAT="%Y-%m-%d %T.%f")

with support for -vv will come support for additional field:
nsnames [this will operate on N index]: varchar(256)... will be a comma-separated list

To handle the case when the DNS question and the DNS answers are split between flushed userdata, the seq is inserted so that the user can reference this to locate the question and answer pair.
- If you see 0 as the value for qtype_id or qname_host_id, then the question was missing from the `suser` (truly, there were 0 bytes of suser data).
- If you see 0 as the value for atype_id or aname_host_id, then the answer was missing from the `duser` (truly, there were 0 bytes of duser data).
'''
# set booleans to either 'True' or 'False' (explicit case, without quotes)
reportrecordcount = True  # whether the running count of processed records is printed to stdout
recordcount = 0  # module-level counter of how many records have been processed
debug = False  # whether verbose diagnostic info is printed throughout processing
onerecord = False  # stop after processing a single record (useful for debugging)

import sys, traceback, types
import string, re
import time
from datetime import datetime
def SQLQuery(query, fetchtype): | |
import MySQLdb | |
#http://www.tutorialspoint.com/python/python_database_access.htm | |
#http://www.packtpub.com/article/exception-handling-mysql-python | |
sqlserver = "localhost" | |
if debug: print 'Connecting to %s' %sqlserver | |
db = MySQLdb.connect(sqlserver,"root","passwordhere","argus_dnsdb" ) | |
if debug: print 'Connected... creating db cursor' | |
cursor = db.cursor() | |
try: | |
if debug: print 'Executing query: ' + query | |
cursor.execute(query) | |
if debug: print 'Execution successful' | |
if fetchtype == "insert": | |
db.commit() | |
results = 0 | |
if fetchtype == "fetchall": | |
results = cursor.fetchall() #fetchall returns a tuple of tuples of rows of results | |
if fetchtype == "fetchone": | |
results = cursor.fetchone() #fetchone returns a tuple of the next row of results (this is probably not the nicest way to do this, but it should work, as our SELECTs rely on there being only one result. | |
except MySQLdb.Error, e: | |
print 'Executing query: ' + query | |
print "Error: %s" %e | |
db.rollback() | |
sys.exit() | |
return 0 | |
db.close() | |
return results | |
def FlowRecordProcessor(flow_seq, query_time, nsclient, nsserver, qtype, qname, atypes, anames):
    '''Resolve lookup-table ids for one parsed flow record and insert one row
    into `main` per answer record.

    flow_seq   -- argus flow sequence number (string)
    query_time -- flow ltime; either a "%Y-%m-%d %H:%M:%S.%f" string (converted
                  to an epoch float below) or an already-converted float
    nsclient   -- source (DNS client) IPv4 address string
    nsserver   -- destination (DNS server) IPv4 address string
    qtype      -- DNS question type string, or the int 0 when suser was empty
    qname      -- DNS question name string, or the int 0 when suser was empty
    atypes     -- list of answer type strings, or [0] when duser was empty
    anames     -- parallel list of answer values, or [0] when duser was empty

    Each lookup table (nsclients, nsservers, hosts) is handled insert-if-missing:
    SELECT first, INSERT then re-SELECT when absent.  Increments the module-level
    recordcount and rewrites the progress line when reportrecordcount is set.
    '''
    #here we might want to multithread using http://docs.python.org/2/library/multiprocessing.html?highlight=multiprocessing#multiprocessing
    # or we might want to multithread calling FlowRecordProcessor() calls
    #first we'll check if nsclient is contained in nsclients, and if so, get the id
    tresult = SQLQuery('select id, ipv4addr from nsclients where ipv4addr=INET_ATON(\'' + nsclient + '\');', 'fetchone')
    if type(tresult) is not types.NoneType:
        #then the nsclient is already in the DB
        nsclient_id = tresult[0]
    else:
        #then the nsclient isn't in the DB; insert it and fetch the new id
        SQLQuery('insert into nsclients (ipv4addr) values (INET_ATON(\'' + nsclient + '\'));', 'insert')
        nsclient_id = SQLQuery('select id, ipv4addr from nsclients where ipv4addr=INET_ATON(\'' + nsclient + '\');', 'fetchone')
        nsclient_id = nsclient_id[0]
    #next we'll check if nsserver is contained in nsservers, and if so, get the id
    tresult = SQLQuery('select id, ipv4addr from nsservers where ipv4addr=INET_ATON(\'' + nsserver + '\');', 'fetchone')
    if type(tresult) is not types.NoneType:
        #then the nsserver is already in the DB
        nsserver_id = tresult[0]
    else:
        #then the nsserver isn't in the DB; insert it and fetch the new id
        SQLQuery('insert into nsservers (ipv4addr) values (INET_ATON(\'' + nsserver + '\'));', 'insert')
        nsserver_id = SQLQuery('select id, ipv4addr from nsservers where ipv4addr=INET_ATON(\'' + nsserver + '\');', 'fetchone')
        nsserver_id = nsserver_id[0]
    #next we'll get the id for the qtype and qname:
    if (qtype == 0) and (qname == 0):
        #then there was no suser data provided; 0 is the sentinel id in `main`
        qname_host_id = 0
        qtype_nstype_id = 0
    else:
        #next we'll get the id for the qtype:
        tresults = SQLQuery('select id, nstype from nstypes;', 'fetchall')
        #thanks: http://stackoverflow.com/a/2917399/843000
        #NOTE(review): raises IndexError if the qtype string isn't present in nstypes
        qtype_nstype_id = [x[0] for x in tresults if x[1] == qtype][0]
        #next we'll get the id for the host of the server that was inquired about
        tresults = SQLQuery('select id from hosts where host=\'' + qname + '\';', 'fetchone')
        if (type(tresults) is not types.NoneType) and (len(tresults) > 0):
            #then the host is already in the DB
            qname_host_id = tresults[0]
        else:
            #then the host isn't in the DB; insert it and fetch the new id
            SQLQuery('insert into hosts (host) values (\'' + qname + '\');', 'insert')
            qname_host_id = SQLQuery('select id from hosts where host=\'' + qname + '\';', 'fetchone')
            qname_host_id = qname_host_id[0]
    #next we'll deal with resolving the atypes and anames to their corresponding ids from nstypes and hosts respectively
    atypes_nstype_ids = []
    anames_ids = []
    if (0 in atypes) and (0 in anames):
        #no duser data: single sentinel answer row
        atypes_nstype_ids.append(0)
        anames_ids.append(0)
    else:
        if len(atypes) == len(anames):
            #this should always be the case (the parser appends to both in lockstep)
            for i in range(0,len(atypes)):
                #NOTE(review): the nstypes table is re-fetched on every iteration;
                #it is invariant and could be fetched once before the loop
                tresults = SQLQuery('select id, nstype from nstypes;', 'fetchall')
                atypes_nstype_ids.append([x[0] for x in tresults if x[1] == atypes[i]][0])
            for i in range(0,len(anames)):
                #some times the anames[i] will be an ip address.
                #some times the anames[i] will be a name
                ## depending on the atype[i] as described in RFCs
                tresults = SQLQuery('select id, host from hosts where host=\'' + anames[i] + '\';', 'fetchall')
                if (len(tresults) > 0):
                    #then the host is already in the DB
                    anames_ids.append([x[0] for x in tresults if x[1] == anames[i]][0])
                else:
                    #then the host isn't in the DB; insert it and fetch the new id
                    SQLQuery('insert into hosts (host) values (\'' + anames[i] + '\');', 'insert')
                    anames_ids.append(SQLQuery('select id, host from hosts where host=\'' + anames[i] + '\';', 'fetchone')[0])
    # here we will INSERT the record into the main DB
    ## each flow_seq has the possibility to have many atypes and anames associated, so we'll be inserting on an atypes basis
    ## we consider flow_seq+query_time to be unique universally... http://thread.gmane.org/gmane.network.argus/9738/focus=9750
    if len(atypes) == len(anames_ids):
        for i in range(0,len(atypes_nstype_ids)):
            #convert query_time from the RA_TIME_FORMAT string to an epoch float once;
            #after the first iteration it is already a float and the else branch runs
            if (type(query_time) is types.StringType):
                if debug: print "query_time is of type " + str(type(query_time)) + " and will be converted"
                query_time = time.mktime(datetime.strptime(query_time,'%Y-%m-%d %H:%M:%S.%f').timetuple())
            else:
                if debug: print "query_time is of another type " + str(type(query_time)) + " and will not be converted."
            if debug:
                print "type(flow_seq): " + str(type(flow_seq)) + ": " + flow_seq
                print "type(query_time): " + str(type(query_time)) + ": " + str(query_time)
                print "type(nsclient_id): " + str(type(nsclient_id)) + ": " + str(nsclient_id)
                print "type(nsserver_id): " + str(type(nsserver_id)) + ": " + str(nsserver_id)
                print "type(qtype_nstype_id): " + str(type(qtype_nstype_id)) + ": " + str(qtype_nstype_id)
                print "type(qname_host_id): " + str(type(qname_host_id)) + ": " + str(qname_host_id)
                print "type(atypes_nstype_ids[" + str(i) + "]): " + str(type(atypes_nstype_ids[i])) + ": " + str(atypes_nstype_ids[i])
                print "type(anames_ids[" + str(i) + "]): " + str(type(anames_ids[i])) + ": " + str(anames_ids[i])
            #should I check to make sure the flow isn't already in the main table? Right now I don't.
            #flow_uid is flow_seq + query_time added as floats (see gmane thread above)
            SQLQuery("insert into main (flow_uid,flow_seq,qtime,nsclient_id,nsserver_id,qtype_id,qname_host_id,atype_id,aname_host_id) " + \
                "values ('" + str(float(flow_seq)+query_time) + "','" + str(float(flow_seq)) + "','" + str(query_time) + "','" + str(nsclient_id) + "','" + str(nsserver_id) + "','" + \
                str(qtype_nstype_id) + "','" + str(qname_host_id) + "','" + str(atypes_nstype_ids[i]) + "','" + str(anames_ids[i]) + "');", 'insert')
    global reportrecordcount
    if reportrecordcount:
        global recordcount
        recordcount = recordcount + 1
        #\r rewrites the progress line in place rather than scrolling
        sys.stdout.write("\r"+"DNS questions and answers stored: " + str(recordcount))
        sys.stdout.flush()
if __name__ == "__main__": | |
if reportrecordcount: | |
sys.stdout.write("\r"+"DNS questions and answers stored: " + str(recordcount)) | |
sys.stdout.flush() | |
#f = open('C:\\Documents and Settings\\mbrown\\Desktop\\radump_dns_to_sql\\radump.out', 'r') | |
f = sys.stdin | |
#there is another method of reading data from stdin: http://stackoverflow.com/a/7056462/843000 | |
#I considered building an object to pass to a function, but that just seems latent. | |
for line in f: | |
try: | |
if debug: | |
print "---begin record---" | |
print line | |
#check if the line contains s[0] or d[0] | |
#that way we can handle missing suser or duser data by providing the fields 0 for ID in main and bypassing the SQL lookups | |
if re.search(r's\[0\]', line) == None: | |
if debug: print "suser contains more than 0 data" | |
suserzero = False | |
else: | |
#then we have no suser data | |
suserzero = True | |
if debug: print "suser contains 0 data" | |
if re.search(r'd\[0\]', line) == None: | |
if debug: print "duser contains more than 0 data" | |
duserzero = False | |
else: | |
#then we have no duserdata | |
duserzero = True | |
if debug: print "duser contains 0 data" | |
#Each field is split by '^' | |
fields = line.split('^') | |
#the first field is the sequence number for the flow. This is a unique ID across the time `argus` is running. | |
flow_seq = fields[0].strip() | |
#The second field is the last flow time, so this will be the time we'd like to insert into the DB. It will be formatted to be acceptable by strftime(). | |
query_time = fields[1].strip() | |
#the third field is the source address (the dns client) | |
nsclient = fields[2].strip() | |
#the fourth field is the destination address (the dns server) | |
nsserver = fields[3].strip() | |
#the fifth field is source user data string: | |
''' | |
- we can take the data that's only in between the double quotes | |
- we can then parse this as follows: | |
- third space separated string is a TYPE followed by a '?'. This will be the qtype | |
- fourth space separate string is the qname | |
''' | |
if suserzero: | |
qtype = 0 | |
qname = 0 | |
else: | |
suserdata = re.search(r'".*"',fields[4].strip()) | |
subsetsuserdata = suserdata.group(0).split() | |
#declaring a variable is just a pointer | |
qtypetemp = subsetsuserdata[2] | |
if (qtypetemp[len(qtypetemp)-1] == "?"): | |
#then we have a valid question | |
qtype = qtypetemp[:(len(qtypetemp)-1)] | |
qnametemp = subsetsuserdata[3] | |
if (qnametemp[len(qnametemp)-1] == "."): | |
#then we (still) have a valid question | |
qname = qnametemp[:(len(qnametemp)-1)] | |
#the sixth field is the destination user data string: | |
''' | |
- we can take the data that's only in between the double quotes | |
- we can then parse this as follows: | |
- the third space separated string is the beginning of a command separated list that ends with ' (' | |
''' | |
atypes = [] | |
anames = [] | |
if duserzero: | |
atypes.append(0) | |
anames.append(0) | |
else: | |
#duserdata = re.search(r'".*"',fields[4].strip()) | |
#taken from printer-domain.c | |
subsetduserdata = re.findall(r'[A|NS|MD|MF|CNAME|SOA|MB|MG|MR|NULL|WKS|PTR|HINFO|MINFO|MX|TXT|RP|AFSDB|X25|ISDN|RT|NSAP|NSAP_PTR|SIG|KEY|PX|GPOS|AAAA|LOC|NXT|EID|NIMLOC|SRV|ATMA|NAPTR|A6|DNAME|OPT|UINFO|UID|GID|UNSPEC|UNSPECA|TKEY|TSIG|IXFR|AXFR|MAILB|MAILA|ANY]\s\d{,3}\.\d{,3}\.\d{,3}\.\d{,3}',fields[5].strip()) | |
if len(subsetduserdata)>0: | |
for recordset in subsetduserdata: | |
rssplit = recordset.split() | |
atypes.append(rssplit[0]) #this is the answer qtype | |
anames.append(rssplit[1]) #this is the answer qtype parameter | |
#this would block so we may want to consider a multiprocessor here... or within FlowRecordProcessor() | |
FlowRecordProcessor(flow_seq,query_time, nsclient, nsserver, qtype, qname, atypes, anames) | |
except: | |
exc_type, exc_value, exc_traceback = sys.exc_info() | |
print "*** print_tb:" | |
traceback.print_tb(exc_traceback, limit=1, file=sys.stdout) | |
print "*** print_exception:" | |
traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stdout) | |
print "*** print_exc:" | |
traceback.print_exc() | |
print "*** format_exc, first and last line:" | |
formatted_lines = traceback.format_exc().splitlines() | |
print formatted_lines[0] | |
print formatted_lines[-1] | |
print "*** format_exception:" | |
print repr(traceback.format_exception(exc_type, exc_value, exc_traceback)) | |
print "*** extract_tb:" | |
print repr(traceback.extract_tb(exc_traceback)) | |
print "*** format_tb:" | |
print repr(traceback.format_tb(exc_traceback)) | |
print "*** tb_lineno:", exc_traceback.tb_lineno | |
print 'exiting' | |
sys.exit() | |
if onerecord: sys.exit() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment