No promises...
#!/usr/bin/python
#Parse Nginx logs and insert results into either MySQL or SQLite
#Then run various reports on the data
#Kyle Brandt 2010
import re, optparse
import cProfile
from datetime import datetime
from time import time
from sqlalchemy import create_engine
from sqlalchemy import schema, types
import locale
line_regex = re.compile(
r'''
(?P<lb> \S* )
\s
(?P<ip> (\d{1,3}\.){3}\d{1,3} )
\s-\s-\s\[
(?P<datetime> \S* )
\s
(?P<timezone> .?\d{4} )
\]
\s
"
(?P<method> \S* )
\s
(?P<uri> \S* )
\s
(?P<version> HTTP/[0-9\.]* )
"
\s
(?P<status_code> \d{3})
\s
(?P<bytes_read> \d+ )
\s
"
(?P<unknown_1> [^"]* )
"
\s
"
(?P<unknown_2> [^"]* )
"
\s
(?P<unknown_3> \d* )
\s
(?P<unknown_4> \d* )
\s
"
(?P<front_end> [^"]* )
"
\s
"
(?P<back_end> [^"]* )
"
\s
"
(?P<server> [^"]* )
"
\s
(?P<tq> -?\d* )
\s
(?P<tw> -?\d* )
\s
(?P<tc> -?\d* )
\s
(?P<tr> -?\d* )
\s
(?P<tt> -?\d* )
\s
"
(?P<term_state> [^"]* )
"
\s
(?P<unknown_10> \d* )
\s
(?P<unknown_11> \d* )
\s
(?P<unknown_12> \d* )
\s
(?P<unknown_13> \d* )
\s
(?P<unknown_14> \d* )
\s
(?P<unknown_15> \d* )
\s
(?P<unknown_16> \d* )
\s
"
(?P<unknown_17> [^"]* )
"
\s
"
(?P<unknown_18> [^"]* )
"
\s
"
(?P<referer> [^"]* )
"
\s
"
(?P<user_agent> [^"]* )
"
\s
"
(?P<site> [^"]* )
"
\s
"
(?P<forward_for> [^"]* )
"
''', re.VERBOSE)
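#A note on the format: judging by the fuller regex in logFilter.py below,
#unknown_10 through unknown_16 are HAProxy's actconn, feconn, beconn,
#srv_conn, retries, srv_queue and backend_queue counters. A synthetic,
#illustrative line this regex should match (real lines depend on the
#haproxy log configuration):
#ny-lb01 10.0.0.1 - - [21/May/2010:13:45:30 +0000] "GET /questions HTTP/1.1" 200 1234 "-" "-" 5 3 "http-in" "web" "ny-web01" 0 0 1 8 9 "----" 100 200 10 5 0 0 0 "-" "-" "http://example.com/" "Mozilla/5.0" "stackoverflow.com" "10.0.0.2"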
#Parse Command Line Options
usage='''usage: %prog [options]... [logfile(s)]...
Example: python nginxLogReport.py --use-mysql --db-name nginx_log --db-user nginx \\
--db-pass superSecret sample.log > test.html'''
cmd_parser = optparse.OptionParser(
description="Command line utility that parses Nginx logs and generates some reports",
prog="nginxLogReport", usage=usage)
format_group = optparse.OptionGroup(cmd_parser, "Formatting Related Options")
db_group = optparse.OptionGroup(cmd_parser, "Database Related Options")
format_group.add_option('--no-html', action='store_false', dest='html_report',
default=True, help='Do not output report in HTML. '
"Might want to pipe to column -t -s $'\\t'")
format_group.add_option('--no-units', action='store_false', dest='human',
default=True, help='Do not output numbers with commas and byte units (e.g. 4M, 3.2K)')
format_group.add_option('--items-per-report', '-i', action='store', nargs=1, dest='nitems',
default=20, type='int', help='Max Number of lines For each report, default is 20')
format_group.add_option('--errors', '-e', action='store_true', dest='errors',
default=False, help='Print the first 10 lines that failed to parse.')
format_group.add_option('--ips-only', action='store_true', dest='ip_only',
default=False,
help='When using text-only output, only print IP address for reports that print IP addresses.'
' Only works with reports ip-agent and ip-agent-hourly.')
cmd_parser.add_option_group(format_group)
cmd_parser.add_option('--site', action='store', dest='site', default='all',
help='The site to report on. Default is "all", '
"but it can also be any one site, for example "
'"serverfault.com"')
cmd_parser.add_option('--site-contains', action='store', dest='sitecontains', default='disabled',
help='Substring of site(s) to report on. Default is disabled. Do not use with the --site option.')
db_group.add_option('--sqlite', action='store', dest='sqlite_file',default='',
help="Use sqlite. Currently no working standard deviation function. "
"Filename is the argument, don't use any of the --db options with this. "
"Argument may also be 'memory', which means don't make a db file "
"but rather do it all in memory.")
db_group.add_option('--use-mysql', action='store_true', dest='use_mysql',
default=False, help='Use MySQL')
db_group.add_option('--db-user', '-u', action='store', dest='dbuser',
default='', help='User of the MySQL Database')
db_group.add_option('--db-pass', '-p', action='store', dest='dbpass',
default='', help='Password for the specified user of the MySQL Database')
db_group.add_option('--db-host', action='store', dest='dbhost',
default='localhost', help='Host of the MySQL Database. Default is localhost')
db_group.add_option('--db-name', action='store', dest='dbname',
default='', help='Name of the MySQL Database.')
db_group.add_option('--no-drop', action='store_false', dest='drop',
default=True, help='Do not drop the table when done')
db_group.add_option('--drop', action='store_true', dest='drop_table',
default=False, help='Only Drop the table and exit')
db_group.add_option('--db-table', action='store', dest='log_table', default='log_table',
help='Name of table for this parse')
format_group.add_option('--no-reverse', action='store_false', dest='get_ptr',
default=True, help="Do not show reverse lookups of IPs in reports. "
"Lookups are only done on report results, not on every IP.")
cmd_parser.add_option('--reports', action='store', dest='reports',
default='url,ip-agent,ip-agent-hour,404,response,ip,ip-rate-limit',
help="A comma separated (no spaces) list of reports to run. "
"The default is is too run all reports: "
'"url,ip-agent,ip-agent-hour,404,response,ip,ip-rate-limit')
cmd_parser.add_option('--no-parse', action='store_false', dest='parse',
default=True, help="Skip parsing and only run reports. "
"Intended use is that this was run previously with --no-drop. "
"Date range and site (unless you specify the same again) "
"will not be accurate. Also, you probably want to include --no-drop again.")
cmd_parser.add_option_group(db_group)
options, arguments = cmd_parser.parse_args()
#Some Sanity Checks on the Command Line Arguments
if options.use_mysql and options.sqlite_file:
print "Please chose either sqlite or mysql, not both."
exit(1)
if not (options.use_mysql or options.sqlite_file):
print "Please chose or sqlite or mysql"
exit(1)
#Conditional Includes
if options.html_report:
import HTML
if options.get_ptr:
import socket
#Parse and Insert Functions
def parse_log(filename, log_array, error_array, use_sql, dates):
'''Phase 1 of parsing: uses the groups from the regular expression
to create an associative array of groupname:string. Each line is
then passed to pythonify_log. Also filters based on site and tracks
the min and max date.'''
global parsed
file_to_parse = open(filename, 'r')
inserts = []
#Building a group of inserts and batch-executing them is quite a bit
#faster, see: http://stackoverflow.com/questions/2881890/
insert_every = 1000
for line in file_to_parse:
m = line_regex.match(line)
if m:
fields = m.groupdict()
#Select any one or all sites
if options.sitecontains != 'disabled':
if fields['site'].rfind(options.sitecontains) == -1:
continue
elif options.site != 'all':
if options.site != fields['site']:
continue
if fields['user_agent'].rfind('Pingdom.com_bot_version') != -1:
continue
#Set Min/Max Dates
pythonified = pythonify_log(fields)
if pythonified['time'] < dates[0]:
dates[0] = pythonified['time']
if pythonified['time'] > dates[1]:
dates[1] = pythonified['time']
if use_sql: #This uses Globals, Ick :-/
inserts.append(pythonified)
if len(inserts) >= insert_every:
connection.execute(log_table.insert(), inserts)
inserts = []
parsed += 1
else:
log_array.append(pythonified)
parsed += 1
else:
error_array.append(line)
if use_sql:
if len(inserts) > 0:
connection.execute(log_table.insert(), inserts)
month_abbreviations = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
def custom_strptime(line):
year = int(line[7:11])
month = month_abbreviations[line[3:6]]
day = int(line[0:2])
hour = int(line[12:14])
minute = int(line[15:17])
second = int(line[18:20])
return datetime(year, month, day, hour, minute, second)
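#The datetime field is fixed-width "DD/Mon/YYYY:HH:MM:SS", e.g.:
#custom_strptime('21/May/2010:13:45:30') -> datetime(2010, 5, 21, 13, 45, 30)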
def pythonify_log(parsed_line):
'''Phase 2 of parsing: turn strings into proper objects and exclude
data that is not needed (it can always be added later). A couple of
fields are also truncated.'''
field_len_limit = 100
new_entry = {}
#custom_strptime below is much faster than the equivalent
#datetime.strptime(parsed_line['datetime'], "%d/%b/%Y:%H:%M:%S")
new_entry['time'] = custom_strptime(parsed_line['datetime'])
if not parsed_line['uri']:
parsed_line['uri'] = '-'
new_entry['ip'] = parsed_line['ip']
new_entry['site'] = parsed_line['site']
new_entry['status_code'] = parsed_line['status_code']
new_entry['back_end'] = parsed_line['back_end']
new_entry['bytes_read'] = int(parsed_line['bytes_read'])
new_entry['user_agent'] = parsed_line['user_agent'][:field_len_limit]
new_entry['uri'] = parsed_line['uri'][:field_len_limit]
#tr is HAProxy's server response time in milliseconds
new_entry['response_time'] = float(parsed_line['tr'])
return new_entry
#Misc
def std_dev(l, avg):
temp = []
for item in l:
temp.append((item-avg) ** 2)
stddev = (sum(temp)/(len(temp)-1)) ** .5
return stddev
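#Sample standard deviation (n-1 denominator), so l needs at least 2 items.
#e.g. std_dev([1, 2, 3, 4], 2.5) -> ~1.29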
def lookup_ptr(ip):
try:
ptr = socket.gethostbyaddr(ip)[0]
except socket.error:
ptr = 'No Ptr Record'
return ptr
def convert_bytes(num_bytes):
'''Stolen from http://www.5dollarwhitebox.org/drupal/node/84
(parameter renamed so it does not shadow the bytes builtin)'''
num_bytes = float(num_bytes)
if num_bytes >= 1099511627776:
terabytes = num_bytes / 1099511627776
size = '%.2fT' % terabytes
elif num_bytes >= 1073741824:
gigabytes = num_bytes / 1073741824
size = '%.2fG' % gigabytes
elif num_bytes >= 1048576:
megabytes = num_bytes / 1048576
size = '%iM' % megabytes
elif num_bytes >= 1024:
kilobytes = num_bytes / 1024
size = '%iK' % kilobytes
else:
size = '%ib' % num_bytes
return size
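#e.g. convert_bytes(800) -> '800b'; convert_bytes(1500) -> '1K';
#convert_bytes(5242880) -> '5M'; convert_bytes(2147483648) -> '2.00G'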
def print_report(title, header_list, processed_data, output_html, column_only):
'''Generic function for printing each report,
can be either text or html'''
if output_html:
print "<h2>%s</h2>" % title
print HTML.table(processed_data, header_row=header_list, style='border: 0px; border-collapse: collapse;', border='0')
else:
if not column_only:
print title
print '\t'.join(header_list)
for line in processed_data:
print '\t'.join([str(x) for x in line])
else:
for line in processed_data:
print str(line[column_only[0]])
#SQL Reports
def sql_top_url(table, nitems, output_html, sort_by, human):
if sort_by not in ('bandwidth', 'hits'):
print "Invalid Sort By"
return
output_table = []
r = connection.execute('''
SELECT uri, COUNT(uri) AS hits, AVG(bytes_read) AS average,
SUM(bytes_read) AS bandwidth
FROM %s
GROUP BY uri
ORDER BY %s DESC
LIMIT %s''' % (table, sort_by, nitems))
for row in r.fetchall():
if human:
hits = locale.format("%d", int(row[1]), grouping=True)
avgband = convert_bytes(int(row[2]))
totalband = convert_bytes(int(row[3]))
output_table.append([row[0], hits, avgband, totalband])
else:
output_table.append([row[0], row[1], row[2], row[3]])
print_report("Top URL by %s" % sort_by.capitalize(),
["Url", "Hits", "Average Bytes", "Total Bytes Served"],
output_table, output_html, False)
def sql_top_agent_ip(table, nitems, output_html, sort_by, human, rev_look, ip_only):
if sort_by not in ('bandwidth', 'hits'):
print "Invalid Sort By"
return
if ip_only:
ip_only = [0]
output_table = []
r = connection.execute('''
SELECT ip, user_agent, COUNT(*) AS hits,
SUM(bytes_read) AS bandwidth
FROM %s
GROUP BY ip, user_agent
ORDER BY %s DESC
LIMIT %s''' % (table, sort_by, nitems))
for row in r.fetchall():
if rev_look:
ptr = lookup_ptr(row[0])
if human:
hits = locale.format("%d", int(row[2]), grouping=True)
totalband = convert_bytes(int(row[3]))
if rev_look:
output_table.append([row[0], ptr, row[1], hits, totalband])
else:
output_table.append([row[0], row[1], hits, totalband])
else:
if rev_look:
output_table.append([row[0], ptr, row[1], row[2], row[3]])
else:
output_table.append([row[0], row[1], row[2], row[3]])
if rev_look:
print_report("Top User Agent IP Pair by %s" % sort_by.capitalize(),
["IP", "PTR Record", "User-Agent", "Hits", "Total Bytes Served"],
output_table, output_html, ip_only)
else:
print_report("Top User Agent IP Pair by %s" % sort_by.capitalize(),
["IP", "User-Agent", "Hits", "Total Bytes Served"],
output_table, output_html, ip_only)
def sql_top_ip(table, nitems, output_html, sort_by, human, rev_look, ip_only):
if sort_by not in ('bandwidth', 'hits'):
print "Invalid Sort By"
return
if ip_only:
ip_only = [0]
output_table = []
r = connection.execute('''
SELECT ip, COUNT(*) AS hits,
SUM(bytes_read) AS bandwidth
FROM %s
GROUP BY ip
ORDER BY %s DESC
LIMIT %s''' % (table, sort_by, nitems))
for row in r.fetchall():
if rev_look:
ptr = lookup_ptr(row[0])
if human:
hits = locale.format("%d", int(row[1]), grouping=True)
totalband = convert_bytes(int(row[2]))
if rev_look:
output_table.append([row[0], ptr, hits, totalband])
else:
output_table.append([row[0], hits, totalband])
else:
if rev_look:
output_table.append([row[0], ptr, row[1], row[2]])
else:
output_table.append([row[0], row[1], row[2]])
if rev_look:
print_report("Top IP by %s" % sort_by.capitalize(),
["IP", "PTR Record", "Hits", "Total Bytes Served"],
output_table, output_html, ip_only)
else:
print_report("Top IP by %s" % sort_by.capitalize(),
["IP", "Hits", "Total Bytes Served"],
output_table, output_html, ip_only)
def sql_top_rate_limit(table, nitems, output_html, sort_by, human, rev_look, ip_only):
if ip_only:
ip_only = [0]
output_table = []
r = connection.execute('''
SELECT ip, COUNT(*) AS hits
FROM %s
WHERE back_end = 'go-away'
GROUP BY ip
ORDER BY %s DESC
LIMIT %s''' % (table, sort_by, nitems))
for row in r.fetchall():
if rev_look:
ptr = lookup_ptr(row[0])
if human:
hits = locale.format("%d", int(row[1]), grouping=True)
if rev_look:
output_table.append([row[0], ptr, hits])
else:
output_table.append([row[0], hits])
else:
if rev_look:
output_table.append([row[0], ptr, row[1]])
else:
output_table.append([row[0], row[1]])
if rev_look:
print_report("Top IP by %s on Rate Limiter" % sort_by.capitalize(),
["IP", "PTR Record", "Hits"],
output_table, output_html, ip_only)
else:
print_report("Top IP by %s on Rate Limiter" % sort_by.capitalize(),
["IP", "Hits"],
output_table, output_html, ip_only)
def sql_top_agent_ip_hour(table, nitems, output_html, sort_by, mysql, human,
rev_look, ip_only):
if sort_by not in ('bandwidth', 'hits'):
print "Invalid Sort By"
return
if ip_only:
ip_only = [1]
output_table = []
if mysql:
r = connection.execute('''
SELECT HOUR(time) as hour, ip, user_agent,
COUNT(*) AS hits, SUM(bytes_read) AS bandwidth
FROM %s
GROUP BY ip, user_agent, hour
ORDER BY %s DESC
LIMIT %s''' % (table, sort_by, nitems))
else:
r = connection.execute('''
SELECT strftime("%%H", time) as hour, ip, user_agent,
COUNT(*) AS hits, SUM(bytes_read) AS bandwidth
FROM %s
GROUP BY ip, user_agent, hour
ORDER BY %s DESC
LIMIT %s''' % (table, sort_by, nitems))
for row in r.fetchall():
if rev_look:
ptr = lookup_ptr(row[1])
if human:
hits = locale.format("%d", int(row[3]), grouping=True)
totalband = convert_bytes(int(row[4]))
if rev_look:
output_table.append([row[0], row[1], ptr, row[2], hits, totalband])
else:
output_table.append([row[0], row[1], row[2], hits, totalband])
else:
if rev_look:
output_table.append([row[0], row[1], ptr, row[2], row[3], row[4]])
else:
output_table.append([row[0], row[1], row[2], row[3], row[4]])
if rev_look:
print_report("Top User Agent IP Pair per Hour by %s" % sort_by.capitalize(),
["Hour", "IP", "PTR", "User-Agent", "Hits", "Total Bytes Served"],
output_table, output_html, ip_only)
else:
print_report("Top User Agent IP Pair per Hour by %s" % sort_by.capitalize(),
["Hour", "IP", "User-Agent", "Hits", "Total Bytes Served"],
output_table, output_html, ip_only)
def sql_url_response(table, nitems, min_hits, output_html, mysql, human):
sort_by = 'avg_response'
output_table = []
if mysql:
r = connection.execute('''
SELECT uri, COUNT(*) AS hits,
AVG(response_time) AS avg_response,
STDDEV(response_time) AS stddev_response
FROM %s
GROUP BY uri
HAVING hits > %s
ORDER BY %s DESC
LIMIT %s'''% (table, min_hits, sort_by, nitems))
else:
#SQLite has no built-in STDDEV function
r = connection.execute('''
SELECT uri, COUNT(*) AS hits,
AVG(response_time) AS avg_response
FROM %s
GROUP BY uri
HAVING hits > %s
ORDER BY %s DESC
LIMIT %s'''% (table, min_hits, sort_by, nitems))
for row in r.fetchall():
if human:
hits = locale.format("%d", int(row[1]), grouping=True)
average = locale.format("%d", int(row[2]), grouping=True)
if mysql:
if human:
stddev = locale.format("%d", int(row[3]), grouping=True)
output_table.append([row[0], hits, average, stddev])
else:
output_table.append([row[0], row[1], int(row[2]),
int(row[3])])
else:
if human:
output_table.append([row[0], hits, average])
else:
output_table.append([row[0], row[1], int(row[2])])
if mysql:
print_report("Response time of Urls with More than %s Hits" % min_hits,
["URL", "Hits", "Avg Response MS", "StdDev Response MS"],
output_table, output_html, False)
else:
print_report("Response time of Urls with More than %s Hits" % min_hits,
["URL", "Hits", "Avg Response MS (No StdDev with sqlite)"],
output_table, output_html, False)
def sql_url_by_error(table, nitems, output_html, error_code):
output_table = []
sort_by = 'hits'
r = connection.execute('''
SELECT uri, COUNT(*) AS hits FROM %s
WHERE status_code = "%s"
GROUP BY uri
ORDER BY %s DESC
LIMIT %s''' % (table, error_code, sort_by, nitems))
for row in r.fetchall():
output_table.append([row[0], row[1]])
print_report("Top URLs by Hits for HTTP status code %s" % error_code,
["URL", "Hits"],
output_table, output_html, False)
#Data Structures
if options.use_mysql or options.sqlite_file:
use_sql = True
else:
use_sql = False
parsed = 0
log_lines = []
error_lines = []
dates = [datetime.today().replace(year=2050),
datetime.today().replace(year=1970)]
#SQL Structures
metadata = schema.MetaData()
log_table = schema.Table(options.log_table, metadata,
schema.Column('id', types.Integer, primary_key=True),
schema.Column('time', types.DateTime),
schema.Column('ip', types.String(length=15)),
schema.Column('site', types.Text),
schema.Column('back_end', types.Text),
schema.Column('status_code', types.String(length=3)),
schema.Column('bytes_read', types.Integer),
schema.Column('user_agent', types.Text),
schema.Column('uri', types.Text),
schema.Column('response_time', types.Float),
)
#Connect to Database
if options.use_mysql:
engine = create_engine('mysql://%s:%s@%s/%s' %
(options.dbuser, options.dbpass, options.dbhost,
options.dbname))
if options.sqlite_file:
if options.sqlite_file == 'memory':
engine = create_engine('sqlite:///:memory:')
else:
engine = create_engine('sqlite:///%s' % options.sqlite_file)
metadata.bind = engine
connection = engine.connect()
metadata.create_all(checkfirst=True)
if options.drop_table:
log_table.drop(checkfirst=True)
connection.close()
exit()
#Parse the Log and insert lines into DB
parse_time_start=time()
if options.parse:
for logfile in arguments:
parse_log(logfile, log_lines, error_lines, use_sql, dates)
parse_time_end=time()
#Init dates for date range
start = dates[0].strftime("%B %d %Y %H:%M")
end = dates[1].strftime("%B %d %Y %H:%M")
#Init Locale for commas
locale.setlocale(locale.LC_ALL, "")
#HTML Structure Start
if options.html_report:
if options.sitecontains != 'disabled':
site_title = options.sitecontains
else:
site_title = options.site
header_title = '''
<HTML>
<HEAD>
<TITLE>Reports for %s</TITLE>
</HEAD>
<BODY>
''' % (site_title)
print header_title
print "<h1>Reports for Site(s): %s</h1>" % site_title
print "<h2>Date and Time Range: %s - %s</h2>" % (start, end)
elif not options.ip_only:
print "Reports for Site(s): %s", options.site
print "Date and Time Range: %s - %s" % (start, end)
#Sort and Print Reports
whichreports = options.reports.split(',')
sort_time_start=time()
if 'url' in whichreports:
sql_top_url(options.log_table, options.nitems, options.html_report, 'bandwidth', options.human)
sql_top_url(options.log_table, options.nitems, options.html_report, 'hits', options.human)
if 'ip-agent' in whichreports:
sql_top_agent_ip(options.log_table, options.nitems, options.html_report, 'bandwidth',
options.human, options.get_ptr, options.ip_only)
sql_top_agent_ip(options.log_table, options.nitems, options.html_report, 'hits',
options.human, options.get_ptr, options.ip_only)
if 'ip' in whichreports:
sql_top_ip(options.log_table, options.nitems, options.html_report, 'bandwidth',
options.human, options.get_ptr, options.ip_only)
sql_top_ip(options.log_table, options.nitems, options.html_report, 'hits',
options.human, options.get_ptr, options.ip_only)
if 'ip-rate-limit' in whichreports:
sql_top_rate_limit(options.log_table, options.nitems, options.html_report, 'hits',
options.human, options.get_ptr, options.ip_only)
if 'ip-agent-hour' in whichreports:
sql_top_agent_ip_hour(options.log_table, options.nitems, options.html_report, 'bandwidth',
options.use_mysql, options.human,options.get_ptr,
options.ip_only)
sql_top_agent_ip_hour(options.log_table, options.nitems, options.html_report, 'hits',
options.use_mysql, options.human, options.get_ptr,
options.ip_only)
if '404' in whichreports:
sql_url_by_error(options.log_table, options.nitems, options.html_report, '404')
if 'response' in whichreports:
sql_url_response(options.log_table, options.nitems, 400, options.html_report,
options.use_mysql, options.human)
sort_time_end=time()
#Clean up DB, drop the table and close the connection
if options.drop:
log_table.drop()
connection.close()
#Print Some Stats
stats = [ 'Lines Parsed: %s' % parsed,
'Parse Errors: %s' % len(error_lines),
'Parse and Insert Time: %.2f seconds' % (parse_time_end - parse_time_start),
'Report Time (Includes Reverse DNS lookups): %.2f seconds' % (sort_time_end - sort_time_start)
]
if options.html_report:
print "<hr>"
print "<h2>Statistics</h2>"
print '<br>'.join(stats)
elif not options.ip_only:
print "Statistics"
for line in stats:
print line
if (len(error_lines) > 0) and options.errors:
print "Errors"
for line in error_lines[:10]:
print line
if options.html_report: print "</BODY></HTML>"
#!/bin/bash
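#Nightly wrapper: run haproxyLogReport.py against yesterday's logs for each
#site and mail each HTML report (PASSWORD is a placeholder).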
reportScript="/root/sysadmin/ny-back01/scripts/logreport/haproxyLogReport.py"
universalOptions="-i 30 --use-mysql --db-name log_reports --db-user log_reports -p PASSWORD --no-drop"
date=$(date "+%Y_%m_%d" --date yesterday)
logDir="/var/log/web"
soLog='stackoverflow.log.1'
restLog='catchall.log.1'
emailHeader='/root/sysadmin/ny-back01/scripts/logreport/email_header'
otherSites="serverfault.com superuser.com"
#test='test3_'
test=''
emails='[email protected], [email protected]'
#emails='[email protected]'
(
echo "To: [email protected]
Cc: $emails
Subject: Stackoverflow.com Log Report for $date
Content-Type: text/html; charset='us-ascii'"
#cat $emailHeader
python $reportScript $universalOptions --db-table "${test}stackoverflow_${date}" ${logDir}/$soLog --site stackoverflow.com
) | sendmail -t
for site in $otherSites; do
(
echo "To: [email protected]
Cc: $emails
Subject: $site Log Report for $date
Content-Type: text/html; charset='us-ascii'"
python $reportScript $universalOptions --db-table "${test}${site%%.com}_${date}" ${logDir}/$restLog --site $site
) | sendmail -t
done
(
echo "To: [email protected]
Cc: $emails
Subject: API Log Report for $date
Content-Type: text/html; charset='us-ascii'"
python $reportScript $universalOptions --db-table "${test}api_${date}" ${logDir}/$restLog --site-contains api
) | sendmail -t
To: [email protected]
Cc: [email protected]
Subject: LogParser Report
Content-Type: text/html; charset="us-ascii"
#!/usr/bin/python
#KMB March 11, 2011.
#Python script to be used as a filter to get various stats from SO's haproxy logs. For example:
#tail -f /var/log/web/stackoverflow.log | python logFilter.py -f HTTP_HAPROXY_TR -t stats
import re, sys, signal, optparse
from operator import itemgetter
from numpy import mean, amin, amax, std
from time import time
cmd_parser = optparse.OptionParser()
cmd_parser.add_option('--field', '-f', help='Log Field to Operate on. Default is HTTP_HAPROXY_TR. Can also be a comma sep list of fields when using the top type', dest='field', default='HTTP_HAPROXY_TR')
cmd_parser.add_option('--print-fields', help='Print out the various fields that the regex parses', dest='print_fields', default=False, action='store_true')
cmd_parser.add_option('--limit-host', '-l', help='Only operate on values for the specified host. Default is no limiting', dest='limit_host', default=False)
cmd_parser.add_option('--type', '-t', help='Type of information to show. Can be one of the following: top,stats. Default is top', dest='info_type', default='top')
#cmd_parser.add_option('--n-lines', '-n', help='Print out results for every n lines, default is 1000', dest='n_lines', default='1000')
cmd_parser.add_option('--top-x', '-x', help='When running the "top" type, show the top X results each time. Default: 10', dest='top_x', default='10')
cmd_parser.add_option('--seconds', '-s', help='Print output every X seconds; each report covers the last X seconds of data. Default is 10', dest='x_seconds', default=10)
options, arguments = cmd_parser.parse_args()
line_regex=re.compile('ny-lb[0-9]{2} (?P<HTTP_CLIENT_IP>[^ ]+) - - \[[^\]]*\] "(?P<HTTP_METHOD>[^ ]+) (?P<HTTP_URI>[^ ]+) (?P<HTTP_VERSION>\w+/\d+\.\d+)" (?P<HTTP_RESPONSE_CODE>\d{3}) (?P<HTTP_HAPROXY_BYTES_READ>\d+) "[^"]*" "[^"]*" [^ ]+ [^ ]+ "(?P<HTTP_HAPROXY_FRONTEND>[^"]+)" "(?P<HTTP_HAPROXY_BACKEND>[^"]+)" "(?P<HTTP_SERVER>[^"]+)" (?P<HTTP_HAPROXY_TQ>-?\d*) (?P<HTTP_HAPROXY_TW>-?\d*) (?P<HTTP_HAPROXY_TC>-?\d*) (?P<HTTP_HAPROXY_TR>-?\d*) (?P<HTTP_HAPROXY_TT>-?\d*) "(?P<HTTP_HAPROXY_TERM_STATE>[^"]*)" (?P<HTTP_HAPROXY_ACTCONN>-?\d*) (?P<HTTP_HAPROXY_FECONN>-?\d*) (?P<HTTP_HAPROXY_BECONN>-?\d*) (?P<HTTP_HAPROXY_SRV_CONN>-?\d*) (?P<HTTP_HAPROXY_RETRIES>-?\d*) (?P<HTTP_HAPROXY_SRV_QUEUE>-?\d*) (?P<HTTP_HAPROXY_BACKEND_QUEUE>-?\d*) "[^"]*" "[^"]*" "(?P<HTTP_REFERER>[^"]*)" "(?P<HTTP_UA>[^"]*)" "(?P<HTTP_HOST>[^"]*)" "(?P<HTTP_X_FORWARD_FOR>[^"]*)"')
#Functions
def top_value(l, value, count):
'''top [count] of occurrences of [value] in list [l]'''
value_count = {}
for line in l:
v = line[value]
if v in value_count:
value_count[v] += 1
else:
value_count[v] = 1
return sorted(value_count.iteritems(), key=itemgetter(1), reverse=True)[:count]
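#e.g. top_value([{'ip': 'a'}, {'ip': 'b'}, {'ip': 'a'}], 'ip', 1) -> [('a', 2)]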
def stats(l, value):
'''stats of an integer field'''
m = []
for line in l:
if line[value].isdigit():
m.append(int(line[value]))
if not m: #Avoid numpy warnings/nan on an empty sample
return "No numeric samples in this interval"
return "Mean: %s Min: %s Max: %s StdDev: %s" % (mean(m), amin(m), amax(m), std(m))
def top_value_multiple(l, value_l, count):
'''Top [count] of occurrences of list of values in list [l]'''
value_count = {}
for line in l:
#Build list into tuple since tuple can become a dict key
temp_l = []
for value in value_l:
temp_l.append(line[value])
tuple_key = tuple(temp_l)
if tuple_key in value_count:
value_count[tuple_key] += 1
else:
value_count[tuple_key] = 1
return sorted(value_count.iteritems(), key=itemgetter(1), reverse=True)[:count]
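#e.g. top_value_multiple([{'ip': 'a', 'ua': 'x'}, {'ip': 'a', 'ua': 'x'}],
#     ['ip', 'ua'], 1) -> [(('a', 'x'), 2)]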
if options.print_fields:
print '''
HTTP_CLIENT_IP
HTTP_METHOD
HTTP_URI
HTTP_VERSION
HTTP_RESPONSE_CODE
HTTP_HAPROXY_BYTES_READ
HTTP_HAPROXY_FRONTEND
HTTP_HAPROXY_BACKEND
HTTP_SERVER
HTTP_HAPROXY_TQ
HTTP_HAPROXY_TW
HTTP_HAPROXY_TC
HTTP_HAPROXY_TR
HTTP_HAPROXY_TT
HTTP_HAPROXY_TERM_STATE
HTTP_HAPROXY_ACTCONN
HTTP_HAPROXY_FECONN
HTTP_HAPROXY_BECONN
HTTP_HAPROXY_SRV_CONN
HTTP_HAPROXY_RETRIES
HTTP_HAPROXY_SRV_QUEUE
HTTP_HAPROXY_BACKEND_QUEUE
HTTP_REFERER
HTTP_UA
HTTP_HOST
HTTP_X_FORWARD_FOR'''
exit()
#Core Loop
#chunk_size=int(options.n_lines)
if options.info_type == 'top':
print "count %s" % ' '.join(options.field.split(','))
while True:
line_c = 0
chunk_array = []
starttime = time()
while True:
line = sys.stdin.readline()
if not line: #EOF on stdin; exit instead of spinning
sys.exit(0)
line_c += 1
m = line_regex.match(line)
if m:
dict_fields = m.groupdict()
#Don't 'continue' here or the flush check below would be skipped for filtered lines
if (not options.limit_host) or (dict_fields['HTTP_HOST'] == options.limit_host):
chunk_array.append(dict_fields)
#if line_c >= chunk_size:
if time() - starttime > int(options.x_seconds):
if options.info_type == 'top':
print '-' * 80
#print top_value(chunk_array, options.field, 10)
for item in top_value_multiple(chunk_array, options.field.split(','), int(options.top_x)):
#top_value_multiple returns (fields_tuple, count) pairs
print item[1], ' '.join(item[0])
if options.info_type == 'stats':
print stats(chunk_array, options.field)
break
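#!/usr/bin/python
#Read HAProxy log lines from a named pipe and bulk-insert them into a SQL
#Server 'Log' table via SQLAlchemy and pyodbc.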
import re, sys, signal, optparse, os, tempfile
from operator import itemgetter
from datetime import datetime
from sqlalchemy import MetaData, Table, create_engine
import pyodbc
import atexit
cmd_parser = optparse.OptionParser()
cmd_parser.add_option('--field', '-f', help='Log Field to Operate on. Default is HTTP_HAPROXY_TR. Can also be a comma sep list of fields when using the top type', dest='field', default='HTTP_HAPROXY_TR')
options, arguments = cmd_parser.parse_args()
line_regex=re.compile('ny-lb[0-9]{2} (?P<ClientIp>[^ ]+) - - \[(?P<Time>[^\]]*)\] "(?P<Method>[^ ]+) (?P<Uri>[^ ]+) (?P<HttpVersion>\w+/\d+\.\d+)" (?P<ResponseCode>\d{3}) (?P<Bytes>\d+) "[^"]*" "[^"]*" [^ ]+ [^ ]+ "(?P<FrontEnd>[^"]+)" "(?P<BackEnd>[^"]+)" "(?P<Server>[^"]+)" (?P<Tq>-?\d*) (?P<Tw>-?\d*) (?P<Tc>-?\d*) (?P<Tr>-?\d*) (?P<Tt>-?\d*) "(?P<TermState>[^"]*)" (?P<ActConn>-?\d*) (?P<FeConn>-?\d*) (?P<BeConn>-?\d*) (?P<SrvConn>-?\d*) (?P<Retries>-?\d*) (?P<SrvQueue>-?\d*) (?P<BackEndQueue>-?\d*) "[^"]*" "[^"]*" "(?P<Referer>[^"]*)" "(?P<UserAgent>[^"]*)" "(?P<Host>[^"]*)" "(?P<ForwardFor>[^"]*)"')
#Settings
fifo_filename = '/tmp/logToSql.fifo'
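#Feed the pipe from tail or syslog, e.g. (illustrative):
#  tail -F /var/log/web/stackoverflow.log > /tmp/logToSql.fifo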
#Custom strptime is faster
month_abbreviations = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
def custom_strptime(line):
year = int(line[7:11])
month = month_abbreviations[line[3:6]]
day = int(line[0:2])
hour = int(line[12:14])
minute = int(line[15:17])
second = int(line[18:20])
return datetime(year, month, day, hour, minute, second)
#Set up SQL Connection
def connect():
conn_string = 'DRIVER={FreeTDS};Server=ny-db02;Database=HaproxyLog;UID=Me;PWD=APassword'
return pyodbc.connect(conn_string)
metadata = MetaData()
e = create_engine('mssql://', creator=connect)
c = e.connect()
metadata.bind = c
log_table = Table('Log', metadata, autoload=True)
#Setup Fifo
try:
os.mkfifo(fifo_filename)
except OSError, err: #don't shadow the engine variable e
print "Failed to create FIFO: %s" % err
exit(1)
reader = open(fifo_filename, 'r')
#Exit Handling
def cleanup():
reader.close()
os.remove(fifo_filename)
c.close()
atexit.register(cleanup)
#Core Loop
line_c = 0
inserts = []
insert_size = 1000
while True:
line = reader.readline()
line_c += 1
m = line_regex.match(line)
if m:
fields = m.groupdict()
fields['Time'] = custom_strptime(fields['Time'])
fields['TruncTime'] = datetime(fields['Time'].year, fields['Time'].month, fields['Time'].day)
inserts.append(fields)
if inserts and line_c >= insert_size:
c.execute(log_table.insert(), inserts)
line_c = 0
inserts = []
Comment from @kylebrandt (author): @SamSaffron: Adding wrappers too.