No promises...
#!/usr/bin/python
#Parse Nginx logs and insert the results into either MySQL or SQLite,
#then run various reports on the data.
#Kyle Brandt 2010
import re, optparse
import cProfile
from datetime import datetime
from time import time
from sqlalchemy import create_engine
from sqlalchemy import schema, types
import locale
line_regex = re.compile(
    r'''
    (?P<lb> \S* )
    \s
    (?P<ip> (\d{1,3}\.){3}\d{1,3} )
    \s-\s-\s\[
    (?P<datetime> \S* )
    \s
    (?P<timezone> .?\d{4} )
    \]
    \s
    "
    (?P<method> \S* )
    \s
    (?P<uri> \S* )
    \s
    (?P<version> HTTP/[0-9\.]* )
    "
    \s
    (?P<status_code> \d{3})
    \s
    (?P<bytes_read> \d+ )
    \s
    "
    (?P<unknown_1> [^"]* )
    "
    \s
    "
    (?P<unknown_2> [^"]* )
    "
    \s
    (?P<unknown_3> \d* )
    \s
    (?P<unknown_4> \d* )
    \s
    "
    (?P<front_end> [^"]* )
    "
    \s
    "
    (?P<back_end> [^"]* )
    "
    \s
    "
    (?P<server> [^"]* )
    "
    \s
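    # haproxy timing fields in ms: Tq=request, Tw=queue wait, Tc=connect, Tr=server response, Tt=total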
    (?P<tq> -?\d* )
    \s
    (?P<tw> -?\d* )
    \s
    (?P<tc> -?\d* )
    \s
    (?P<tr> -?\d* )
    \s
    (?P<tt> -?\d* )
    \s
    "
    (?P<term_state> [^"]* )
    "
    \s
    (?P<unknown_10> \d* )
    \s
    (?P<unknown_11> \d* )
    \s
    (?P<unknown_12> \d* )
    \s
    (?P<unknown_13> \d* )
    \s
    (?P<unknown_14> \d* )
    \s
    (?P<unknown_15> \d* )
    \s
    (?P<unknown_16> \d* )
    \s
    "
    (?P<unknown_17> [^"]* )
    "
    \s
    "
    (?P<unknown_18> [^"]* )
    "
    \s
    "
    (?P<referer> [^"]* )
    "
    \s
    "
    (?P<user_agent> [^"]* )
    "
    \s
    "
    (?P<site> [^"]* )
    "
    \s
    "
    (?P<forward_for> [^"]* )
    "
    ''', re.VERBOSE)
#Parse Command Line Options
usage = '''usage: %prog [options]... [logfile(s)]...
Example: python nginxLogReport.py --use-mysql --db-name nginx_log --db-user nginx \\
--db-pass superSecret sample.log > test.html'''
cmd_parser = optparse.OptionParser(
    description="Command line utility that parses Nginx logs and generates some reports",
    prog="nginxLogReport", usage=usage)
format_group = optparse.OptionGroup(cmd_parser, "Formatting Related Options")
db_group = optparse.OptionGroup(cmd_parser, "Database Related Options")
format_group.add_option('--no-html', action='store_false', dest='html_report',
    default=True, help='Do not output the report in HTML. '
    "You might want to pipe the output to column -t -s $'\\t'")
format_group.add_option('--no-units', action='store_false', dest='human',
    default=True, help='Do not output numbers with commas and byte units (i.e. 4M, 3.2K)')
format_group.add_option('--items-per-report', '-i', action='store', nargs=1, dest='nitems',
    default=20, type='int', help='Max number of lines for each report, default is 20')
format_group.add_option('--errors', '-e', action='store_true', dest='errors',
    default=False, help='Print any lines that failed to parse.')
format_group.add_option('--ips-only', action='store_true', dest='ip_only',
    default=False,
    help='When using text-only output, only print the IP address for reports that print IP addresses.'
    ' Only works with the ip-agent and ip-agent-hour reports.')
cmd_parser.add_option_group(format_group)
cmd_parser.add_option('--site', action='store', dest='site', default='all',
    help='The site to report on. Default is "all", '
    "but it can also be any one site, for example "
    '"serverfault.com"')
cmd_parser.add_option('--site-contains', action='store', dest='sitecontains', default='disabled',
    help='Substring of site(s) to report on. Default is disabled. Do not use with the --site option')
db_group.add_option('--sqlite', action='store', dest='sqlite_file', default='',
    help="Use sqlite. Currently there is no working standard deviation function. "
    "The filename is the argument; don't use any of the --db options with this. "
    "The argument may also be 'memory', which means don't make a db file "
    "but rather do it all in memory.")
db_group.add_option('--use-mysql', action='store_true', dest='use_mysql',
    default=False, help='Use MySQL')
db_group.add_option('--db-user', '-u', action='store', dest='dbuser',
    default='', help='User of the MySQL database')
db_group.add_option('--db-pass', '-p', action='store', dest='dbpass',
    default='', help='Password for the specified user of the MySQL database')
db_group.add_option('--db-host', action='store', dest='dbhost',
    default='localhost', help='Host of the MySQL database. Default is localhost')
db_group.add_option('--db-name', action='store', dest='dbname',
    default='', help='Name of the MySQL database.')
db_group.add_option('--no-drop', action='store_false', dest='drop',
    default=True, help='Do not drop the table when done')
db_group.add_option('--drop', action='store_true', dest='drop_table',
    default=False, help='Only drop the table and exit')
db_group.add_option('--db-table', action='store', dest='log_table', default='log_table',
    help='Name of the table for this parse')
format_group.add_option('--no-reverse', action='store_false', dest='get_ptr',
    default=True, help="Do not show reverse lookups of IPs in reports. "
    "Lookups are only done on report results, not on every IP.")
cmd_parser.add_option('--reports', action='store', dest='reports',
    default='url,ip-agent,ip-agent-hour,404,response,ip,ip-rate-limit',
    help="A comma separated (no spaces) list of reports to run. "
    "The default is to run all reports: "
    '"url,ip-agent,ip-agent-hour,404,response,ip,ip-rate-limit"')
cmd_parser.add_option('--no-parse', action='store_false', dest='parse',
    default=True, help="Skip parsing and only run reports. "
    "The intended use is that this was run previously with --no-drop. "
    "The date range and site (unless you specify the same again) "
    "will not be accurate. You probably also want to include --no-drop again.")
cmd_parser.add_option_group(db_group)
options, arguments = cmd_parser.parse_args()
#Some Sanity Checks on the Command Line Arguments
if options.use_mysql and options.sqlite_file:
    print "Please choose either sqlite or mysql, not both."
    exit(1)
if not (options.use_mysql or options.sqlite_file):
    print "Please choose either sqlite or mysql."
    exit(1)
#Conditional Includes
if options.html_report:
    import HTML
if options.get_ptr:
    import socket
#Parse and Insert Functions
def parse_log(filename, log_array, error_array, use_sql, dates):
    '''Phase 1 of parsing: use the groups from the regular expression
    to create an associative array of groupname:string. Each line is
    then passed to pythoninfy_log. Also filters based on site and
    tracks the min and max date.'''
    global parsed
    file_to_parse = open(filename, 'r')
    inserts = []
    '''Building a group of inserts and batch executing them is quite a bit
    faster, see: http://stackoverflow.com/questions/2881890/'''
    insert_every = 1000
    for line in file_to_parse:
        m = line_regex.match(line)
        if m:
            fields = m.groupdict()
            #Select any one site or all sites
            if options.sitecontains != 'disabled':
                if fields['site'].rfind(options.sitecontains) == -1:
                    continue
            elif options.site != 'all':
                if options.site != fields['site']:
                    continue
            if fields['user_agent'].rfind('Pingdom.com_bot_version') != -1:
                continue
            #Set Min/Max Dates
            pythonified = pythoninfy_log(fields)
            if pythonified['time'] < dates[0]:
                dates[0] = pythonified['time']
            if pythonified['time'] > dates[1]:
                dates[1] = pythonified['time']
            if use_sql: #This uses globals, ick :-/
                inserts.append(pythonified)
                if (parsed % insert_every) == 0:
                    connection.execute(log_table.insert(), inserts)
                    inserts = []
                parsed += 1
            else:
                log_array.append(pythonified)
                parsed += 1
        else:
            error_array.append(line)
    if use_sql:
        if len(inserts) > 0:
            connection.execute(log_table.insert(), inserts)
month_abbreviations = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
                       'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
                       'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
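#Custom strptime: hand-parsing the fixed haproxy timestamp layout is faster than datetime.strptime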
def custom_strptime(line):
    year = int(line[7:11])
    month = month_abbreviations[line[3:6]]
    day = int(line[0:2])
    hour = int(line[12:14])
    minute = int(line[15:17])
    second = int(line[18:20])
    return datetime(year, month, day, hour, minute, second)
def pythoninfy_log(parsed_line):
    '''Phase 2 of the parse: turn strings into proper objects and exclude data
    that is not needed (it can always be added later). A couple of fields are
    also truncated.'''
    field_len_limit = 100
    new_entry = {}
    '''
    new_entry['time'] = datetime.strptime(
        parsed_line['day'] +
        parsed_line['month'] +
        parsed_line['year'] +
        parsed_line['hour'] +
        parsed_line['minute'] +
        parsed_line['second']# +
        #parsed_line['timezone']
        , "%d%b%Y%H%M%S"
        )
    '''
    #new_entry['time'] = datetime.strptime(parsed_line['datetime'], "%d/%b/%Y:%H:%M:%S")
    new_entry['time'] = custom_strptime(parsed_line['datetime'])
    if not parsed_line['uri']:
        parsed_line['uri'] = '-'
    new_entry['ip'] = parsed_line['ip']
    new_entry['site'] = parsed_line['site']
    new_entry['status_code'] = parsed_line['status_code']
    new_entry['back_end'] = parsed_line['back_end']
    new_entry['bytes_read'] = int(parsed_line['bytes_read'])
    new_entry['user_agent'] = parsed_line['user_agent'][:field_len_limit]
    new_entry['uri'] = parsed_line['uri'][:field_len_limit]
    #new_entry['response_time'] = float(parsed_line['response_time'])
    new_entry['response_time'] = float(parsed_line['tr'])
    return new_entry
#Misc
def std_dev(l, avg):
    temp = []
    for item in l:
        temp.append((item - avg) ** 2)
    stddev = (sum(temp) / (len(temp) - 1)) ** .5
    return stddev
def lookup_ptr(ip):
    try:
        ptr = socket.gethostbyaddr(ip)[0]
    except socket.error:
        ptr = 'No PTR Record'
    return ptr
def convert_bytes(bytes):
    '''Stolen from http://www.5dollarwhitebox.org/drupal/node/84'''
    bytes = float(bytes)
    if bytes >= 1099511627776:
        terabytes = bytes / 1099511627776
        size = '%.2fT' % terabytes
    elif bytes >= 1073741824:
        gigabytes = bytes / 1073741824
        size = '%.2fG' % gigabytes
    elif bytes >= 1048576:
        megabytes = bytes / 1048576
        size = '%iM' % megabytes
    elif bytes >= 1024:
        kilobytes = bytes / 1024
        size = '%iK' % kilobytes
    else:
        size = '%ib' % bytes
    return size
def print_report(title, header_list, processed_data, output_html, column_only):
    '''Generic function for printing each report,
    can be either text or html'''
    if output_html:
        print "<h2>%s</h2>" % title
        print HTML.table(processed_data, header_row=header_list, style='border: 0px; border-collapse: collapse;', border='0')
    else:
        if not column_only:
            print title
            print '\t'.join(header_list)
            for line in processed_data:
                print '\t'.join([str(x) for x in line])
        else:
            for line in processed_data:
                print str(line[column_only[0]])
#SQL Reports
def sql_top_url(table, nitems, output_html, sort_by, human):
    if not (sort_by == 'bandwidth' or sort_by == 'hits'):
        print "Invalid Sort By"
        return
    output_table = []
    r = connection.execute('''
        SELECT uri, COUNT(uri) AS hits, AVG(bytes_read) AS average,
            SUM(bytes_read) AS bandwidth
        FROM %s
        GROUP BY uri
        ORDER BY %s DESC
        LIMIT %s''' % (table, sort_by, nitems))
    for row in r.fetchall():
        if human:
            hits = locale.format("%d", int(row[1]), grouping=True)
            avgband = convert_bytes(int(row[2]))
            totalband = convert_bytes(int(row[3]))
            output_table.append([row[0], hits, avgband, totalband])
        else:
            output_table.append([row[0], row[1], row[2], row[3]])
    print_report("Top URL by %s" % sort_by.capitalize(),
                 ["Url", "Hits", "Average Bytes", "Total Bytes Served"],
                 output_table, output_html, False)
def sql_top_agent_ip(table, nitems, output_html, sort_by, human, rev_look, ip_only):
    if not (sort_by == 'bandwidth' or sort_by == 'hits'):
        print "Invalid Sort By"
        return
    if ip_only:
        ip_only = [0]
    output_table = []
    r = connection.execute('''
        SELECT ip, user_agent, COUNT(*) AS hits,
            SUM(bytes_read) AS bandwidth
        FROM %s
        GROUP BY ip, user_agent
        ORDER BY %s DESC
        LIMIT %s''' % (table, sort_by, nitems))
    for row in r.fetchall():
        if rev_look:
            ptr = lookup_ptr(row[0])
        if human:
            hits = locale.format("%d", int(row[2]), grouping=True)
            totalband = convert_bytes(int(row[3]))
            if rev_look:
                output_table.append([row[0], ptr, row[1], hits, totalband])
            else:
                output_table.append([row[0], row[1], hits, totalband])
        else:
            if rev_look:
                output_table.append([row[0], ptr, row[1], row[2], row[3]])
            else:
                output_table.append([row[0], row[1], row[2], row[3]])
    if rev_look:
        print_report("Top User Agent IP Pair by %s" % sort_by.capitalize(),
                     ["IP", "PTR Record", "User-Agent", "Hits", "Total Bytes Served"],
                     output_table, output_html, ip_only)
    else:
        print_report("Top User Agent IP Pair by %s" % sort_by.capitalize(),
                     ["IP", "User-Agent", "Hits", "Total Bytes Served"],
                     output_table, output_html, ip_only)
def sql_top_ip(table, nitems, output_html, sort_by, human, rev_look, ip_only):
    if not (sort_by == 'bandwidth' or sort_by == 'hits'):
        print "Invalid Sort By"
        return
    if ip_only:
        ip_only = [0]
    output_table = []
    r = connection.execute('''
        SELECT ip, COUNT(*) AS hits,
            SUM(bytes_read) AS bandwidth
        FROM %s
        GROUP BY ip
        ORDER BY %s DESC
        LIMIT %s''' % (table, sort_by, nitems))
    for row in r.fetchall():
        if rev_look:
            ptr = lookup_ptr(row[0])
        if human:
            hits = locale.format("%d", int(row[1]), grouping=True)
            totalband = convert_bytes(int(row[2]))
            if rev_look:
                output_table.append([row[0], ptr, hits, totalband])
            else:
                output_table.append([row[0], hits, totalband])
        else:
            if rev_look:
                output_table.append([row[0], ptr, row[1], row[2]])
            else:
                output_table.append([row[0], row[1], row[2]])
    if rev_look:
        print_report("Top IP by %s" % sort_by.capitalize(),
                     ["IP", "PTR Record", "Hits", "Total Bytes Served"],
                     output_table, output_html, ip_only)
    else:
        print_report("Top IP by %s" % sort_by.capitalize(),
                     ["IP", "Hits", "Total Bytes Served"],
                     output_table, output_html, ip_only)
def sql_top_rate_limit(table, nitems, output_html, sort_by, human, rev_look, ip_only):
    if ip_only:
        ip_only = [0]
    output_table = []
    r = connection.execute('''
        SELECT ip, COUNT(*) AS hits
        FROM %s
        WHERE back_end = 'go-away'
        GROUP BY ip
        ORDER BY %s DESC
        LIMIT %s''' % (table, sort_by, nitems))
    for row in r.fetchall():
        if rev_look:
            ptr = lookup_ptr(row[0])
        if human:
            hits = locale.format("%d", int(row[1]), grouping=True)
            if rev_look:
                output_table.append([row[0], ptr, hits])
            else:
                output_table.append([row[0], hits])
        else:
            if rev_look:
                output_table.append([row[0], ptr, row[1]])
            else:
                output_table.append([row[0], row[1]])
    if rev_look:
        print_report("Top IP by %s on Rate Limiter" % sort_by.capitalize(),
                     ["IP", "PTR Record", "Hits"],
                     output_table, output_html, ip_only)
    else:
        print_report("Top IP by %s on Rate Limiter" % sort_by.capitalize(),
                     ["IP", "Hits"],
                     output_table, output_html, ip_only)
def sql_top_agent_ip_hour(table, nitems, output_html, sort_by, mysql, human,
                          rev_look, ip_only):
    if not (sort_by == 'bandwidth' or sort_by == 'hits'):
        print "Invalid Sort By"
        return
    if ip_only:
        ip_only = [1]
    output_table = []
    if mysql:
        r = connection.execute('''
            SELECT HOUR(time) as hour, ip, user_agent,
                COUNT(*) AS hits, SUM(bytes_read) AS bandwidth
            FROM %s
            GROUP BY ip, user_agent, hour
            ORDER BY %s DESC
            LIMIT %s''' % (table, sort_by, nitems))
    else:
        r = connection.execute('''
            SELECT strftime("%%H", time) as hour, ip, user_agent,
                COUNT(*) AS hits, SUM(bytes_read) AS bandwidth
            FROM %s
            GROUP BY ip, user_agent, hour
            ORDER BY %s DESC
            LIMIT %s''' % (table, sort_by, nitems))
    for row in r.fetchall():
        if rev_look:
            ptr = lookup_ptr(row[1])
        if human:
            hits = locale.format("%d", int(row[3]), grouping=True)
            totalband = convert_bytes(int(row[4]))
            if rev_look:
                output_table.append([row[0], row[1], ptr, row[2], hits, totalband])
            else:
                output_table.append([row[0], row[1], row[2], hits, totalband])
        else:
            if rev_look:
                output_table.append([row[0], row[1], ptr, row[2], row[3], row[4]])
            else:
                output_table.append([row[0], row[1], row[2], row[3], row[4]])
    if rev_look:
        print_report("Top User Agent IP Pair per Hour by %s" % sort_by.capitalize(),
                     ["Hour", "IP", "PTR", "User-Agent", "Hits", "Total Bytes Served"],
                     output_table, output_html, ip_only)
    else:
        print_report("Top User Agent IP Pair per Hour by %s" % sort_by.capitalize(),
                     ["Hour", "IP", "User-Agent", "Hits", "Total Bytes Served"],
                     output_table, output_html, ip_only)
def sql_url_response(table, nitems, min_hits, output_html, mysql, human):
    sort_by = 'avg_response'
    output_table = []
    if mysql:
        r = connection.execute('''
            SELECT uri, COUNT(*) AS hits,
                AVG(response_time) AS avg_response,
                STDDEV(response_time) AS stddev_response
            FROM %s
            GROUP BY uri
            HAVING hits > %s
            ORDER BY %s DESC
            LIMIT %s''' % (table, min_hits, sort_by, nitems))
    else:
        #sqlite has no STDDEV
        r = connection.execute('''
            SELECT uri, COUNT(*) AS hits,
                AVG(response_time) AS avg_response
            FROM %s
            GROUP BY uri
            HAVING hits > %s
            ORDER BY %s DESC
            LIMIT %s''' % (table, min_hits, sort_by, nitems))
    for row in r.fetchall():
        if human:
            hits = locale.format("%d", int(row[1]), grouping=True)
            average = locale.format("%d", int(row[2]), grouping=True)
        if mysql:
            if human:
                stddev = locale.format("%d", int(row[3]), grouping=True)
                output_table.append([row[0], hits, average, stddev])
            else:
                output_table.append([row[0], row[1], int(row[2]),
                                     int(row[3])])
        else:
            if human:
                output_table.append([row[0], hits, average])
            else:
                output_table.append([row[0], row[1], int(row[2])])
    if mysql:
        print_report("Response Time of URLs with More than %s Hits" % min_hits,
                     ["URL", "Hits", "Avg Response MS", "StdDev Response MS"],
                     output_table, output_html, False)
    else:
        print_report("Response Time of URLs with More than %s Hits" % min_hits,
                     ["URL", "Hits", "Avg Response MS (No StdDev with sqlite)"],
                     output_table, output_html, False)
def sql_url_by_error(table, nitems, output_html, error_code):
    output_table = []
    sort_by = 'hits'
    r = connection.execute('''
        SELECT uri, COUNT(*) AS hits FROM %s
        WHERE status_code = "%s"
        GROUP BY uri
        ORDER BY %s DESC
        LIMIT %s''' % (table, error_code, sort_by, nitems))
    for row in r.fetchall():
        output_table.append([row[0], row[1]])
    print_report("Top URLs by Hits for HTTP Status Code %s" % error_code,
                 ["URL", "Hits"],
                 output_table, output_html, False)
#Data Structures
if options.use_mysql or options.sqlite_file:
    use_sql = True
else:
    use_sql = False
parsed = 0
log_lines = []
error_lines = []
dates = [datetime.today().replace(year=2050),
         datetime.today().replace(year=1970)]
#SQL Structures
metadata = schema.MetaData()
log_table = schema.Table(options.log_table, metadata,
    schema.Column('id', types.Integer, primary_key=True),
    schema.Column('time', types.DateTime),
    schema.Column('ip', types.String(length=15)),
    schema.Column('site', types.Text),
    schema.Column('back_end', types.Text),
    schema.Column('status_code', types.String(length=3)),
    schema.Column('bytes_read', types.Integer),
    schema.Column('user_agent', types.Text),
    schema.Column('uri', types.Text),
    schema.Column('response_time', types.Float),
)
#Connect to Database
if options.use_mysql:
    engine = create_engine('mysql://%s:%s@%s/%s' %
        (options.dbuser, options.dbpass, options.dbhost,
         options.dbname))
if options.sqlite_file:
    if options.sqlite_file == 'memory':
        engine = create_engine('sqlite:///:memory:')
    else:
        engine = create_engine('sqlite:///%s' % options.sqlite_file)
metadata.bind = engine
connection = engine.connect()
metadata.create_all(checkfirst=True)
if options.drop_table:
    log_table.drop()
    connection.close()
    exit()
#Parse the Log and insert lines into DB
parse_time_start = time()
if options.parse:
    for logfile in arguments:
        parse_log(logfile, log_lines, error_lines, use_sql, dates)
parse_time_end = time()
#Init dates for date range
start = dates[0].strftime("%B %d %Y %H:%M")
end = dates[1].strftime("%B %d %Y %H:%M")
#Init Locale for commas
locale.setlocale(locale.LC_ALL, "")
#HTML Structure Start
if options.html_report:
    if options.sitecontains != 'disabled':
        site_title = options.sitecontains
    else:
        site_title = options.site
    header_title = '''
    <HTML>
    <HEAD>
    <TITLE>Reports for %s</TITLE>
    </HEAD>
    <BODY>
    ''' % (site_title)
    print header_title
    print "<h1>Reports for Site(s): %s</h1>" % site_title
    print "<h2>Date and Time Range: %s - %s</h2>" % (start, end)
elif not options.ip_only:
    print "Reports for Site(s): %s" % options.site
    print "Date and Time Range: %s - %s" % (start, end)
#Sort and Print Reports
whichreports = options.reports.split(',')
sort_time_start = time()
if 'url' in whichreports:
    sql_top_url(options.log_table, options.nitems, options.html_report, 'bandwidth', options.human)
    sql_top_url(options.log_table, options.nitems, options.html_report, 'hits', options.human)
if 'ip-agent' in whichreports:
    sql_top_agent_ip(options.log_table, options.nitems, options.html_report, 'bandwidth',
                     options.human, options.get_ptr, options.ip_only)
    sql_top_agent_ip(options.log_table, options.nitems, options.html_report, 'hits',
                     options.human, options.get_ptr, options.ip_only)
if 'ip' in whichreports:
    sql_top_ip(options.log_table, options.nitems, options.html_report, 'bandwidth',
               options.human, options.get_ptr, options.ip_only)
    sql_top_ip(options.log_table, options.nitems, options.html_report, 'hits',
               options.human, options.get_ptr, options.ip_only)
if 'ip-rate-limit' in whichreports:
    sql_top_rate_limit(options.log_table, options.nitems, options.html_report, 'hits',
                       options.human, options.get_ptr, options.ip_only)
if 'ip-agent-hour' in whichreports:
    sql_top_agent_ip_hour(options.log_table, options.nitems, options.html_report, 'bandwidth',
                          options.use_mysql, options.human, options.get_ptr,
                          options.ip_only)
    sql_top_agent_ip_hour(options.log_table, options.nitems, options.html_report, 'hits',
                          options.use_mysql, options.human, options.get_ptr,
                          options.ip_only)
if '404' in whichreports:
    sql_url_by_error(options.log_table, options.nitems, options.html_report, '404')
if 'response' in whichreports:
    sql_url_response(options.log_table, options.nitems, 400, options.html_report,
                     options.use_mysql, options.human)
sort_time_end = time()
#Clean up DB, drop the table and close the connection
if options.drop:
    log_table.drop()
connection.close()
#Print Some Stats
stats = ['Lines Parsed: %s' % parsed,
         'Parse Errors: %s' % len(error_lines),
         'Parse and Insert Time: %.2f' % (parse_time_end - parse_time_start),
         'Report Time (Includes Reverse DNS lookups): %.2f' % (sort_time_end - sort_time_start)
         ]
if options.html_report:
    print "<hr>"
    print "<h2>Statistics</h2>"
    print '<br>'.join(stats)
elif not options.ip_only:
    print "Statistics"
    for line in stats:
        print line
if (len(error_lines) > 0) and options.errors:
    print "Errors"
    for line in error_lines[:10]:
        print line
if options.html_report: print "</BODY></HTML>"
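For a quick test run without a MySQL instance, the sqlite path described in the options above can keep everything in memory. A minimal sketch of such an invocation (the log filename is an assumption, as in the usage example):

# Hypothetical run: in-memory sqlite, plain-text output, aligned with column as the --no-html help suggests
python nginxLogReport.py --sqlite memory --no-html sample.log | column -t -s $'\t'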
#!/bin/bash
reportScript="/root/sysadmin/ny-back01/scripts/logreport/haproxyLogReport.py"
universalOptions="-i 30 --use-mysql --db-name log_reports --db-user log_reports -p PASSWORD --no-drop"
date=$(date "+%Y_%m_%d" --date yesterday)
logDir="/var/log/web"
soLog='stackoverflow.log.1'
restLog='catchall.log.1'
emailHeader='/root/sysadmin/ny-back01/scripts/logreport/email_header'
otherSites="serverfault.com superuser.com"
#test='test3_'
test=''
emails='[email protected], [email protected]'
#emails='[email protected]'
(
echo "To: [email protected]
Cc: $emails
Subject: Stackoverflow.com Log Report for $date
Content-Type: text/html; charset='us-ascii'"
#cat $emailHeader
python $reportScript $universalOptions --db-table "${test}stackoverflow_${date}" ${logDir}/$soLog --site stackoverflow.com
) | sendmail -t
for site in $otherSites; do
(
echo "To: [email protected]
Cc: $emails
Subject: $site Log Report for $date
Content-Type: text/html; charset='us-ascii'"
python $reportScript $universalOptions --db-table "${test}${site%%.com}_${date}" ${logDir}/$restLog --site $site
) | sendmail -t
done
(
echo "To: [email protected]
Cc: $emails
Subject: API Log Report for $date
Content-Type: text/html; charset='us-ascii'"
python $reportScript $universalOptions --db-table "${test}api_${date}" ${logDir}/$restLog --site-contains api
) | sendmail -t
To: [email protected]
Cc: [email protected]
Subject: LogParser Report
Content-Type: text/html; charset="us-ascii"
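The four-line header above looks intended for the commented-out `cat $emailHeader` line in the driver script; a minimal sketch of using it on its own (report.html is a hypothetical, already-generated report file):

# Hypothetical: stock header, a blank line to end the headers, then the HTML report body
( cat email_header; echo; cat report.html ) | sendmail -t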
#!/usr/bin/python
#KMB March 11, 2011.
#Python script to be used as a filter to get various stats from SO's haproxy logs. For example:
#tail -f /var/log/web/stackoverflow.log | python logFilter.py -f HTTP_HAPROXY_TR -t stats
import re, sys, signal, optparse
from operator import itemgetter
from numpy import mean, amin, amax, std
from time import time
cmd_parser = optparse.OptionParser()
cmd_parser.add_option('--field', '-f', help='Log field to operate on. Default is HTTP_HAPROXY_TR. Can also be a comma-separated list of fields when using the top type', dest='field', default='HTTP_HAPROXY_TR')
cmd_parser.add_option('--print-fields', help='Print out the various fields that the regex parses', dest='print_fields', default=False, action='store_true')
cmd_parser.add_option('--limit-host', '-l', help='Only operate on values for the specified host. Default is no limiting', dest='limit_host', default=False)
cmd_parser.add_option('--type', '-t', help='Type of information to show. Can be one of the following: top, stats. Default is top', dest='info_type', default='top')
#cmd_parser.add_option('--n-lines', '-n', help='Print out results for every n lines, default is 1000', dest='n_lines', default='1000')
cmd_parser.add_option('--top-x', '-x', help='When running the "top" type, show the top X results each time. Default: 10', dest='top_x', default='10')
cmd_parser.add_option('--seconds', '-s', help='Print out every X seconds. With this you are showing the data for the last X seconds as well. Default is 10', dest='x_seconds', default=10)
options, arguments = cmd_parser.parse_args()
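#Single-line version of the haproxy httplog regex; the group names are the field names listed by --print-fields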
line_regex = re.compile('ny-lb[0-9]{2} (?P<HTTP_CLIENT_IP>[^ ]+) - - \[[^\]]*\] "(?P<HTTP_METHOD>[^ ]+) (?P<HTTP_URI>[^ ]+) (?P<HTTP_VERSION>\w+/\d+\.\d+)" (?P<HTTP_RESPONSE_CODE>\d{3}) (?P<HTTP_HAPROXY_BYTES_READ>\d+) "[^"]*" "[^"]*" [^ ]+ [^ ]+ "(?P<HTTP_HAPROXY_FRONTEND>[^"]+)" "(?P<HTTP_HAPROXY_BACKEND>[^"]+)" "(?P<HTTP_SERVER>[^"]+)" (?P<HTTP_HAPROXY_TQ>-?\d*) (?P<HTTP_HAPROXY_TW>-?\d*) (?P<HTTP_HAPROXY_TC>-?\d*) (?P<HTTP_HAPROXY_TR>-?\d*) (?P<HTTP_HAPROXY_TT>-?\d*) "(?P<HTTP_HAPROXY_TERM_STATE>[^"]*)" (?P<HTTP_HAPROXY_ACTCONN>-?\d*) (?P<HTTP_HAPROXY_FECONN>-?\d*) (?P<HTTP_HAPROXY_BECONN>-?\d*) (?P<HTTP_HAPROXY_SRV_CONN>-?\d*) (?P<HTTP_HAPROXY_RETRIES>-?\d*) (?P<HTTP_HAPROXY_SRV_QUEUE>-?\d*) (?P<HTTP_HAPROXY_BACKEND_QUEUE>-?\d*) "[^"]*" "[^"]*" "(?P<HTTP_REFERER>[^"]*)" "(?P<HTTP_UA>[^"]*)" "(?P<HTTP_HOST>[^"]*)" "(?P<HTTP_X_FORWARD_FOR>[^"]*)"')
#Functions
def top_value(l, value, count):
    '''top [count] of occurrences of [value] in list [l]'''
    value_count = {}
    for line in l:
        v = line[value]
        if v in value_count:
            value_count[v] += 1
        else:
            value_count[v] = 1
    return sorted(value_count.iteritems(), key=itemgetter(1), reverse=True)[:count]
def stats(l, value):
    '''stats of an integer field'''
    m = []
    for line in l:
        if line[value].isdigit():
            m.append(int(line[value]))
    return "Mean: %s Min: %s Max: %s StdDev: %s" % (mean(m), amin(m), amax(m), std(m))
def top_value_multiple(l, value_l, count):
    '''Top [count] of occurrences of list of values in list [l]'''
    value_count = {}
    for line in l:
        #Build list into tuple since a tuple can become a dict key
        temp_l = []
        for value in value_l:
            temp_l.append(line[value])
        tuple_key = tuple(temp_l)
        if tuple_key in value_count:
            value_count[tuple_key] += 1
        else:
            value_count[tuple_key] = 1
    return sorted(value_count.iteritems(), key=itemgetter(1), reverse=True)[:count]
if options.print_fields:
    print '''
HTTP_CLIENT_IP
HTTP_METHOD
HTTP_URI
HTTP_VERSION
HTTP_RESPONSE_CODE
HTTP_HAPROXY_BYTES_READ
HTTP_HAPROXY_FRONTEND
HTTP_HAPROXY_BACKEND
HTTP_SERVER
HTTP_HAPROXY_TQ
HTTP_HAPROXY_TW
HTTP_HAPROXY_TC
HTTP_HAPROXY_TR
HTTP_HAPROXY_TT
HTTP_HAPROXY_TERM_STATE
HTTP_HAPROXY_ACTCONN
HTTP_HAPROXY_FECONN
HTTP_HAPROXY_BECONN
HTTP_HAPROXY_SRV_CONN
HTTP_HAPROXY_RETRIES
HTTP_HAPROXY_SRV_QUEUE
HTTP_HAPROXY_BACKEND_QUEUE
HTTP_REFERER
HTTP_UA
HTTP_HOST
HTTP_X_FORWARD_FOR'''
    exit()
#Core Loop
#chunk_size = int(options.n_lines)
if options.info_type == 'top':
    print "count %s" % ' '.join(options.field.split(','))
while True:
    line_c = 0
    chunk_array = []
    starttime = time()
    while True:
        line = sys.stdin.readline()
        line_c += 1
        m = line_regex.match(line)
        if m:
            dict_fields = m.groupdict()
            if options.limit_host:
                if dict_fields['HTTP_HOST'] != options.limit_host:
                    continue
            chunk_array.append(dict_fields)
        #if line_c >= chunk_size:
        if time() - starttime > int(options.x_seconds):
            if options.info_type == 'top':
                print '-' * 80
                #print top_value(chunk_array, options.field, 10)
                for item in top_value_multiple(chunk_array, options.field.split(','), int(options.top_x)):
                    #Not really sure why I end up with nested tuples, but whatever.
                    print item[-1], ' '.join(item[:-1][0])
            if options.info_type == 'stats':
                print stats(chunk_array, options.field)
            break
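The header comment above shows the stats mode; here is a hedged sketch of the top mode with multiple fields (log path as in that comment, field names from --print-fields, the host filter value is an assumption):

# Hypothetical: top client IP / URI pairs for one host, top 15 rows, refreshed every 5 seconds
tail -f /var/log/web/stackoverflow.log | python logFilter.py -f HTTP_CLIENT_IP,HTTP_URI -t top -x 15 -s 5 -l stackoverflow.com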
import re, sys, signal, optparse, os, tempfile
from operator import itemgetter
from datetime import datetime
from sqlalchemy import *
import pyodbc
import atexit
cmd_parser = optparse.OptionParser()
cmd_parser.add_option('--field', '-f', help='Log field to operate on. Default is HTTP_HAPROXY_TR. Can also be a comma-separated list of fields when using the top type', dest='field', default='HTTP_HAPROXY_TR')
options, arguments = cmd_parser.parse_args()
line_regex = re.compile('ny-lb[0-9]{2} (?P<ClientIp>[^ ]+) - - \[(?P<Time>[^\]]*)\] "(?P<Method>[^ ]+) (?P<Uri>[^ ]+) (?P<HttpVersion>\w+/\d+\.\d+)" (?P<ResponseCode>\d{3}) (?P<Bytes>\d+) "[^"]*" "[^"]*" [^ ]+ [^ ]+ "(?P<FrontEnd>[^"]+)" "(?P<BackEnd>[^"]+)" "(?P<Server>[^"]+)" (?P<Tq>-?\d*) (?P<Tw>-?\d*) (?P<Tc>-?\d*) (?P<Tr>-?\d*) (?P<Tt>-?\d*) "(?P<TermState>[^"]*)" (?P<ActConn>-?\d*) (?P<FeConn>-?\d*) (?P<BeConn>-?\d*) (?P<SrvConn>-?\d*) (?P<Retries>-?\d*) (?P<SrvQueue>-?\d*) (?P<BackEndQueue>-?\d*) "[^"]*" "[^"]*" "(?P<Referer>[^"]*)" "(?P<UserAgent>[^"]*)" "(?P<Host>[^"]*)" "(?P<ForwardFor>[^"]*)"')
#Settings
fifo_filename = '/tmp/logToSql.fifo'
#Custom strptime is faster
month_abbreviations = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
                       'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
                       'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
def custom_strptime(line):
    year = int(line[7:11])
    month = month_abbreviations[line[3:6]]
    day = int(line[0:2])
    hour = int(line[12:14])
    minute = int(line[15:17])
    second = int(line[18:20])
    return datetime(year, month, day, hour, minute, second)
#Set up SQL Connection
def connect():
    conn_string = 'DRIVER={FreeTDS};Server=ny-db02;Database=HaproxyLog;UID=Me;PWD=APassword'
    return pyodbc.connect(conn_string)
metadata = MetaData()
e = create_engine('mssql://', creator=connect)
c = e.connect()
metadata.bind = c
log_table = Table('Log', metadata, autoload=True)
#Setup Fifo
try:
    os.mkfifo(fifo_filename)
except OSError, e:
    print "Failed to create FIFO: %s" % e
    exit(1)
reader = open(fifo_filename, 'r')
#Exit Handling
def cleanup():
    reader.close()
    os.remove(fifo_filename)
    c.close()
atexit.register(cleanup)
#Core Loop
line_c = 0
inserts = []
insert_size = 1000
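#Batch the parsed rows and insert them 1000 at a time; same batching approach as the report script above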
while True:
    #line = sys.stdin.readline()
    line = reader.readline()
    line_c += 1
    m = line_regex.match(line)
    if m:
        fields = m.groupdict()
        fields['Time'] = custom_strptime(fields['Time'])
        fields['TruncTime'] = datetime(fields['Time'].year, fields['Time'].month, fields['Time'].day)
        inserts.append(fields)
        if line_c >= insert_size:
            c.execute(log_table.insert(), inserts)
            line_c = 0
            inserts = []
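The script above reads from the FIFO it creates rather than stdin, so something has to write the log stream into /tmp/logToSql.fifo. A minimal sketch of one way to feed it (the script filename and log path are assumptions; the log path matches the one used in the filter script earlier):

# Hypothetical: start the consumer first so it can create the FIFO, then stream the log into it
python logToSql.py &
tail -F /var/log/web/stackoverflow.log > /tmp/logToSql.fifo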