-
-
Save siraj/49b6ce508ff2f861003bc6fb65d389c1 to your computer and use it in GitHub Desktop.
A Python script for downloading Amazon S3 access logs and processing them with GoAccess
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/python | |
import os | |
from boto.s3.connection import S3Connection | |
import subprocess | |
from datetime import datetime, date | |
import argparse | |
import tempfile | |
import json | |
# Command line interface. The four positionals are all required and are read in
# this order: key, secret, bucket, prefix.
# NOTE(review): credentials given on the command line are visible to other
# users in the process list; consider an environment based alternative.
parser = argparse.ArgumentParser(description="Downloads logs from S3, and parses them with goaccess.")
parser.add_argument("aws_key", help="Amazon identification key")
parser.add_argument("aws_secret", help="Amazon identification key secret")
# The bucket name and the key prefix are passed to boto separately, so each
# one gets its own description instead of a shared s3 URL.
parser.add_argument("input_bucket", help="Name of the S3 bucket that holds the logs")
parser.add_argument("input_path", help="Key prefix inside the bucket under which the logs are stored")
parser.add_argument("-v", "--verbose", help="Verbose output", action="store_true", default=False)
parser.add_argument("-d", "--date", help="The date to run the report on in YYYY-MM-DD format (defaults to today)")
def log(*msg):
    """
    Print a message when verbose output was requested.

    The verbose flag is read from the module level ``args`` namespace, which
    only exists when this file is executed as a script. When ``args`` has not
    been defined (e.g. the module was imported) nothing is printed instead of
    failing with a ``NameError``.

    :param msg: message parts; they are converted with ``str`` and joined
        with single spaces
    """
    settings = globals().get("args")
    if settings is not None and settings.verbose:
        # A single parenthesised argument keeps this line valid on both
        # Python 2 and Python 3, and prints the text rather than a tuple repr.
        print(" ".join(str(part) for part in msg))
class GoAccess(object):
    """
    Download S3 access logs, concatenate them and pass them to goaccess.

    The selected log files are fetched from S3 with boto, appended to a single
    temporary file and handed to the ``goaccess`` command line tool. The JSON
    it prints is parsed and given to :meth:`process_results`, which
    implementors are expected to override.
    """

    def __init__(self, input_bucket, input_path, date_filter, aws_keys=None):
        """
        :param input_bucket: bucket name handed to ``get_bucket``
        :param input_path: key prefix used when listing the bucket
        :param date_filter: object providing ``strftime`` (a ``date`` or
            ``datetime``); only keys containing its ``YYYY-MM-DD`` form are used
        :param aws_keys: an ``(aws key, secret key)`` pair; when falsy,
            ``S3Connection()`` is called without explicit credentials
        """
        self.input_bucket = input_bucket
        self.input_path = input_path
        self.date_filter = date_filter
        self.aws_keys = aws_keys
        # Temporary resources created while running; released by _cleanup().
        self.configfile = None
        self._download_dir = None

    def _create_goconfig(self):
        """
        Creates a temporary goaccessrc file with the necessary formatting.

        The file object is kept on ``self.configfile``; it disappears from disk
        once that object is closed.
        """
        config_text = (
            "color_scheme 0\n"
            "date_format %d/%b/%Y\n"
            'log_format %^ %^ [%d:%^] %h %^ %^ %^ %^ "%^ %r %^" %s %^ %b %^ %^ %^ "%^" "%u" %^\n'
        )
        # Text mode: the default binary mode rejects ``str`` on Python 3.
        self.configfile = tempfile.NamedTemporaryFile(mode="w")
        self.configfile.write(config_text)
        # goaccess opens the file by name, so the content must be on disk.
        self.configfile.flush()

    def is_needed(self, filename):
        """
        Only files that return true will be processed.

        By default the name must contain ``access_log-`` and the filter date
        formatted as ``YYYY-MM-DD``.

        :param filename: the S3 key (or file name) to test
        """
        wanted_day = self.date_filter.strftime("%Y-%m-%d")
        return "access_log-" in filename and wanted_day in filename

    def concat_files(self, outfile, filename):
        """
        Append the content of ``filename`` to ``outfile``.

        :param outfile: writable file object opened in binary mode
        :param filename: path of the file whose bytes are appended
        """
        # Copy in binary chunks so large logs are not held in memory at once
        # and no text decoding is applied to the log data.
        with open(filename, "rb") as infile:
            for chunk in iter(lambda: infile.read(65536), b""):
                outfile.write(chunk)

    def download_logs(self):
        """
        Downloads logs from S3 using Boto.

        This is a generator: each matching key is downloaded into a fresh
        temporary directory and the local path is yielded. The directory is
        remembered so that :meth:`run` can delete it afterwards.
        """
        if self.aws_keys:
            conn = S3Connection(*self.aws_keys)
        else:
            conn = S3Connection()
        mybucket = conn.get_bucket(self.input_bucket)
        tempdir = tempfile.mkdtemp()
        self._download_dir = tempdir
        for item in mybucket.list(prefix=self.input_path):
            if not self.is_needed(item.key):
                continue
            # Only the last path component of the key is used locally.
            local_file = os.path.join(tempdir, item.key.split("/")[-1])
            log("Downloading %s to %s" % (item.key, local_file))
            item.get_contents_to_filename(local_file)
            yield local_file

    def process_results(self, json):
        """
        This is the main method to be overwritten by implementors.

        :param json: A JSON object result from goaccess to be processed further.
        """
        log(json)

    def _cleanup(self):
        """Release the temporary config file and delete the downloaded logs."""
        configfile = getattr(self, "configfile", None)
        if configfile is not None:
            # Closing a NamedTemporaryFile also removes it from disk.
            configfile.close()
        download_dir = getattr(self, "_download_dir", None)
        if download_dir is not None:
            self._download_dir = None
            # Best effort: a failed removal must not hide an error raised by
            # the report generation itself.
            try:
                for name in os.listdir(download_dir):
                    os.remove(os.path.join(download_dir, name))
                os.rmdir(download_dir)
            except OSError:
                pass

    def run(self):
        """
        Download the logs, build the goaccess report and process it.

        :returns: ``True`` once :meth:`process_results` has been called
        :raises ValueError: if the goaccess output is not valid JSON
        """
        self._create_goconfig()
        try:
            with tempfile.NamedTemporaryFile() as temp_log:
                for downloaded in self.download_logs():
                    self.concat_files(temp_log, downloaded)
                log("Creating report")
                temp_log.flush()  # needed to have the temp file written for sure
                command = ["goaccess", "-f", temp_log.name, "-o", "json", "-p", self.configfile.name]
                server = subprocess.Popen(command, stdout=subprocess.PIPE)
                # Only stdout is piped, so the second item is always None.
                out, _unused = server.communicate()
                self.process_results(json.loads(out))
        finally:
            self._cleanup()
        return True
if __name__ == "__main__":
    # ``args`` stays a module-level name because log() looks it up there.
    args = parser.parse_args()

    # Report on the requested day, falling back to the current day.
    given_date = datetime.strptime(args.date, "%Y-%m-%d") if args.date else date.today()

    # Explicit credentials are only used when both parts are non-empty.
    have_credentials = bool(args.aws_key and args.aws_secret)
    aws_keys = (args.aws_key, args.aws_secret) if have_credentials else None

    processor = GoAccess(args.input_bucket, args.input_path, given_date, aws_keys)
    processor.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment