-
-
Save jweyrich/8d53a7bf5bad7b5958423cb4e538ab20 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3 | |
# coding=utf8 | |
# | |
# AUTHOR: Jardel Weyrich <jweyrich at gmail dot com> | |
# | |
from __future__ import print_function | |
import re, sys | |
def parse_alb_log_file(file_path):
    """Parse an AWS ALB access log file and print one formatted line per entry.

    Each log line that matches the expected ALB access-log format is printed
    as a comma-separated list of field="value" pairs, e.g.:
        type="http", timestamp="...", alb="...", ...
    Lines that do not match the format are silently skipped.

    :param file_path: path to an (uncompressed) ALB access log file.
    """
    # Field names in the same order as the regex capture groups below.
    fields = [
        "type",
        "timestamp",
        "alb",
        "client_ip",
        "client_port",
        "backend_ip",
        "backend_port",
        "request_processing_time",
        "backend_processing_time",
        "response_processing_time",
        "alb_status_code",
        "backend_status_code",
        "received_bytes",
        "sent_bytes",
        "request_verb",
        "request_url",
        "request_proto",
        "user_agent",
        "ssl_cipher",
        "ssl_protocol",
        "target_group_arn",
        "trace_id",
        "domain_name",
        "chosen_cert_arn",
        "matched_rule_priority",
        "request_creation_time",
        "actions_executed",
        "redirect_url",
        "new_field",
    ]
    # Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
    # REFERENCE: https://docs.aws.amazon.com/athena/latest/ug/application-load-balancer-logs.html#create-alb-table
    # Compile once, outside the per-line loop, instead of re-searching with a
    # raw pattern string on every line.
    pattern = re.compile(r"([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*):([0-9]*) ([^ ]*)[:-]([0-9]*) ([-.0-9]*) ([-.0-9]*) ([-.0-9]*) (|[-0-9]*) (-|[-0-9]*) ([-0-9]*) ([-0-9]*) \"([^ ]*) ([^ ]*) (- |[^ ]*)\" \"([^\"]*)\" ([A-Z0-9-\_]+) ([A-Za-z0-9.-]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^\"]*)\" ([-.0-9]*) ([^ ]*) \"([^\"]*)\" ($|\"[^ ]*\")(.*)")
    with open(file_path, 'r') as file:
        for line in file:
            matches = pattern.search(line)
            if matches:
                # zip() pairs each field name with its capture group; any
                # extra trailing groups beyond len(fields) are ignored, as in
                # the original index-based loop.
                print(", ".join('%s="%s"' % (name, value)
                                for name, value in zip(fields, matches.groups())))
if __name__ == '__main__':
    # Expect exactly one positional argument: the log file path.
    args = sys.argv
    if len(args) != 2:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit("usage: %s <log_file_path>" % args[0])
    parse_alb_log_file(args[1])
@starkers you're welcome! Please, use it without restrictions from my part! The most important piece of the code is the regex IMO, and I borrowed it from the AWS docs (linked there), so you might want to read their terms - IIRC AWS is pretty open to non-commercial use of their content. For commercial use you need a written consent.
If you really want to mention it, a thank you note is more than enough.
And let us know when you publish it, so we can use it as well :-)
The regex will break for a few logs where the TLS cipher is logged. Update it with this:
regex = r"([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*):([0-9]*) ([^ ]*)[:-]([0-9]*) ([-.0-9]*) ([-.0-9]*) ([-.0-9]*) (|[-0-9]*) (-|[-0-9]*) ([-0-9]*) ([-0-9]*) \"([^ ]*) ([^ ]*) (- |[^ ]*)\" \"([^\"]*)\" ([A-Z0-9-\_]+) ([A-Za-z0-9.-]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^\"]*)\" ([-.0-9]*) ([^ ]*) \"([^\"]*)\" ($|\"[^ ]*\")(.*)"
For future readers: if you fail to parse logs after enabling TLS 1.3, you should update the cipher-related regex field, because an underscore character may appear in cipher names such as TLS_AES_128_GCM_SHA256.
@Shubhamnegi and @dongho-jung Thanks for the info. I did update the gist with your suggestion @Shubhamnegi
@jweyrich Using this regex, I have built a utility that can pull logs from S3 and push the parsed logs to different destinations, such as Elasticsearch or InfluxDB, for further analysis.
Parsing can be triggered in 3 ways:
- Cli option to pass logs dir in local system
- lambda handler which gets triggered on s3 notifications (link s3 notification to lambda)
- SQS consumer which consumes notification from SQS using s3 notifications (link s3 notifications to SQS)
Thanks for this, it's the neatest thing I've found. I'll be open-sourcing a little tool I made using this — do you have a license in mind?