Skip to content

Instantly share code, notes, and snippets.

@siraj
Forked from nagyv/goaccess.py
Created November 8, 2016 09:11
Show Gist options
  • Save siraj/49b6ce508ff2f861003bc6fb65d389c1 to your computer and use it in GitHub Desktop.
Save siraj/49b6ce508ff2f861003bc6fb65d389c1 to your computer and use it in GitHub Desktop.
A Python script for downloading Amazon S3 access logs and processing them with GoAccess.
#!/bin/python
import argparse
import json
import os
import shutil
import subprocess
import tempfile
from datetime import datetime, date

from boto.s3.connection import S3Connection
# Command line interface. All four positional arguments are required; the
# bucket name and the key prefix are passed separately (not as one s3:// URL)
# because the script hands them to get_bucket() and list(prefix=...) as-is.
parser = argparse.ArgumentParser(description="Downloads logs from S3, and parses them with goaccess.")
parser.add_argument("aws_key", help="Amazon identification key")
parser.add_argument("aws_secret", help="Amazon identification key secret")
parser.add_argument("input_bucket", help="Name of the S3 bucket where the logs are to be found ([BUCKET])")
parser.add_argument("input_path", help="Key prefix inside the bucket under which the logs are to be found ([PATH]/)")
parser.add_argument("-v", "--verbose", help="Verbose output", action="store_true", default=False)
parser.add_argument("-d", "--date", help="The date to run the report on in YYYY-MM-DD format")
def log(*msg):
    """
    Print the given values on one line when verbose output was requested.

    The verbosity flag is read from the module-level ``args`` namespace that
    the command line entry point creates. When that name does not exist (the
    module was imported instead of being run as a script) nothing is printed.

    :param msg: any number of values; they are joined with single spaces.
    """
    cli_args = globals().get("args")
    if cli_args is None or not cli_args.verbose:
        return
    # "%s" formatting (instead of str()) keeps unicode values intact on
    # Python 2, and the parenthesised print works on both Python 2 and 3.
    print(" ".join("%s" % (part,) for part in msg))
class GoAccess(object):
    """
    We download the log files from S3, then concatenate them, and pass the results to goaccess. It gives back a JSON
    that we can handle further.

    Subclasses customise the behaviour by overriding :meth:`is_needed` (which keys are downloaded) and
    :meth:`process_results` (what happens with the parsed report).
    """

    # Content of the temporary goaccess configuration file written by _create_goconfig().
    GOACCESS_CONFIG = """color_scheme 0
date_format %d/%b/%Y
log_format %^ %^ [%d:%^] %h %^ %^ %^ %^ "%^ %r %^" %s %^ %b %^ %^ %^ "%^" "%u" %^
"""

    def __init__(self, input_bucket, input_path, date_filter, aws_keys=None):
        """
        :param input_bucket: name of the S3 bucket that is opened with ``get_bucket``
        :param input_path: key prefix used when listing the bucket
        :param date_filter: a date or datetime; its ``%Y-%m-%d`` form must appear in a key for it to be downloaded
        :param aws_keys: an ``(aws key, secret key)`` pair, or None to create the S3 connection without
            explicit credentials
        """
        self.input_bucket = input_bucket
        self.input_path = input_path
        self.date_filter = date_filter
        self.aws_keys = aws_keys
        self.configfile = None
        # Directory created by download_logs(); remembered so that run() can remove it.
        self._download_dir = None

    def _create_goconfig(self):
        """
        Creates a temporary goaccessrc file with the necessary formatting

        The open file object is stored in ``self.configfile``; the file disappears when that object is closed.
        """
        # Text mode: the configuration is a str, which a binary-mode file rejects on Python 3.
        self.configfile = tempfile.NamedTemporaryFile(mode="w")
        self.configfile.write(self.GOACCESS_CONFIG)
        # Flush so that goaccess, which opens the file by name, sees the content.
        self.configfile.flush()

    def is_needed(self, filename):
        """
        Only files that return true will be processed.
        By default the file name should contain `access_log-` and the filtered date in YYYY-MM-DD form.

        :param filename: the S3 key to check
        """
        day = self.date_filter.strftime("%Y-%m-%d")
        return "access_log-" in filename and day in filename

    def concat_files(self, outfile, filename):
        """
        Appends the content of the file at ``filename`` to ``outfile``.

        :param outfile: an open, writable binary file object
        :param filename: path of the file to append
        """
        # Binary mode avoids any decoding of the log data, and copyfileobj
        # streams it in chunks instead of loading the whole file into memory.
        with open(filename, "rb") as infile:
            shutil.copyfileobj(infile, outfile)

    def download_logs(self):
        """
        Downloads logs from S3 using Boto.

        This is a generator that yields the local path of every downloaded file. The files are stored in a
        fresh temporary directory; run() removes that directory, other callers have to remove it themselves.
        """
        if self.aws_keys:
            conn = S3Connection(*self.aws_keys)
        else:
            conn = S3Connection()
        bucket = conn.get_bucket(self.input_bucket)
        download_dir = tempfile.mkdtemp()
        self._download_dir = download_dir
        for item in bucket.list(prefix=self.input_path):
            if not self.is_needed(item.key):
                continue
            local_file = os.path.join(download_dir, item.key.split("/")[-1])
            log("Downloading %s to %s" % (item.key, local_file))
            item.get_contents_to_filename(local_file)
            yield local_file

    def process_results(self, json):
        """
        This is the main method to be overwritten by implementors.
        :param json: A JSON object result from goaccess to be processed further.
            (The parameter name hides the ``json`` module inside this method.)
        """
        log(json)

    def run(self):
        """
        Just do it!

        Downloads the matching logs, concatenates them, runs goaccess on the result and passes the parsed
        JSON report to process_results(). The temporary configuration file and download directory are
        removed afterwards, even when a step fails.

        :return: True
        :raises subprocess.CalledProcessError: when goaccess exits with a non-zero status
        """
        self._download_dir = None
        self._create_goconfig()
        try:
            with tempfile.NamedTemporaryFile() as temp_log:
                for downloaded in self.download_logs():
                    self.concat_files(temp_log, downloaded)
                log("Creating report")
                temp_log.flush()  # needed to have the temp file written for sure
                command = ["goaccess", "-f", temp_log.name, "-o", "json", "-p", self.configfile.name]
                # check_output raises on a non-zero exit status instead of
                # letting a failed run surface as a JSON decoding error.
                out = subprocess.check_output(command)
                self.process_results(json.loads(out.decode("utf-8")))
        finally:
            # Closing the NamedTemporaryFile deletes the configuration file.
            self.configfile.close()
            if self._download_dir is not None:
                shutil.rmtree(self._download_dir, ignore_errors=True)
                self._download_dir = None
        return True
if __name__ == "__main__":
    # ``args`` is deliberately a module-level name: log() reads it to decide
    # whether verbose output is enabled.
    args = parser.parse_args()
    if args.date:
        try:
            # .date() keeps given_date a plain date on both branches.
            given_date = datetime.strptime(args.date, "%Y-%m-%d").date()
        except ValueError:
            # parser.error() prints the usage message and exits with status 2.
            parser.error("invalid date %r, expected YYYY-MM-DD" % args.date)
    else:
        given_date = date.today()
    # Explicit credentials are only used when both values are non-empty;
    # otherwise the S3 connection is created without arguments.
    if args.aws_key and args.aws_secret:
        aws_keys = (args.aws_key, args.aws_secret)
    else:
        aws_keys = None
    processor = GoAccess(args.input_bucket, args.input_path, given_date, aws_keys)
    processor.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment