Last active
June 11, 2024 14:39
-
-
Save jweyrich/8d53a7bf5bad7b5958423cb4e538ab20 to your computer and use it in GitHub Desktop.
AWS ALB Log Parser written in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# coding=utf8 | |
# | |
# AUTHOR: Jardel Weyrich <jweyrich at gmail dot com> | |
# | |
from __future__ import print_function | |
import re, sys | |
def parse_alb_log_file(file_path):
    """Parse an AWS ALB access log file and print each entry's fields.

    Each line is matched against the ALB access-log format; every matching
    line is printed as a single comma-separated list of field="value" pairs.
    Lines that do not match the pattern are silently skipped.

    :param file_path: path to an (uncompressed) ALB access log file.
    """
    # Field names, in the same order as the regex capture groups below.
    fields = [
        "type",
        "timestamp",
        "alb",
        "client_ip",
        "client_port",
        "backend_ip",
        "backend_port",
        "request_processing_time",
        "backend_processing_time",
        "response_processing_time",
        "alb_status_code",
        "backend_status_code",
        "received_bytes",
        "sent_bytes",
        "request_verb",
        "request_url",
        "request_proto",
        "user_agent",
        "ssl_cipher",
        "ssl_protocol",
        "target_group_arn",
        "trace_id",
        "domain_name",
        "chosen_cert_arn",
        "matched_rule_priority",
        "request_creation_time",
        "actions_executed",
        "redirect_url",
        "new_field",
    ]
    # Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
    # REFERENCE: https://docs.aws.amazon.com/athena/latest/ug/application-load-balancer-logs.html#create-alb-table
    # NOTE: the ssl_cipher group allows underscores because TLS 1.3 cipher
    # names (e.g. TLS_AES_128_GCM_SHA256) contain them.
    # Compile once, outside the per-line loop, instead of paying the
    # re-cache lookup on every iteration.
    pattern = re.compile(r"([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*):([0-9]*) ([^ ]*)[:-]([0-9]*) ([-.0-9]*) ([-.0-9]*) ([-.0-9]*) (|[-0-9]*) (-|[-0-9]*) ([-0-9]*) ([-0-9]*) \"([^ ]*) ([^ ]*) (- |[^ ]*)\" \"([^\"]*)\" ([A-Z0-9-\_]+) ([A-Za-z0-9.-]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^\"]*)\" ([-.0-9]*) ([^ ]*) \"([^\"]*)\" ($|\"[^ ]*\")(.*)")
    with open(file_path, 'r') as file:
        for line in file:
            matches = pattern.search(line)
            if matches:
                # One output line per log entry: field="value", field="value", ...
                print(", ".join("%s=\"%s\"" % (field, value)
                                for field, value in zip(fields, matches.groups())))
if __name__ == '__main__':
    # Command-line entry point: expects exactly one argument, the log file path.
    args = sys.argv
    if len(args) == 2:
        parse_alb_log_file(args[1])
    else:
        sys.exit("usage: %s <log_file_path>" % args[0])
The regex will break for some logs where a TLS cipher is logged. Update it with this:
regex = r"([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*):([0-9]*) ([^ ]*)[:-]([0-9]*) ([-.0-9]*) ([-.0-9]*) ([-.0-9]*) (|[-0-9]*) (-|[-0-9]*) ([-0-9]*) ([-0-9]*) \"([^ ]*) ([^ ]*) (- |[^ ]*)\" \"([^\"]*)\" ([A-Z0-9-\_]+) ([A-Za-z0-9.-]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^\"]*)\" ([-.0-9]*) ([^ ]*) \"([^\"]*)\" ($|\"[^ ]*\")(.*)"
For future readers: if parsing fails after enabling TLS 1.3, update the cipher-related regex field to allow underscores, since TLS 1.3 cipher names such as TLS_AES_128_GCM_SHA256 contain them.
@Shubhamnegi and @dongho-jung Thanks for the info. I did update the gist with your suggestion @Shubhamnegi
@jweyrich Using this regex, I have built a utility that can pull logs from S3 and push the parsed logs to different destinations, such as Elasticsearch or InfluxDB, for further analysis.
Parsing can be triggered in 3 ways
- Cli option to pass logs dir in local system
- lambda handler which gets triggered on s3 notifications (link s3 notification to lambda)
- SQS consumer which consumes notification from SQS using s3 notifications (link s3 notifications to SQS)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@starkers you're welcome! Please, use it without restrictions from my part! The most important piece of the code is the regex IMO, and I borrowed it from the AWS docs (linked there), so you might want to read their terms - IIRC AWS is pretty open to non-commercial use of their content. For commercial use you need a written consent.
If you really want to mention it, a thank you note is more than enough.
And let us know when you publish it, so we can use it as well :-)