Created
December 20, 2016 19:56
-
-
Save duckworth/adc47d00bcfcbcafc616ca51b5946bbc to your computer and use it in GitHub Desktop.
AWS ALB Logs Athena RegexSerDe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE EXTERNAL TABLE IF NOT EXISTS alb_logs ( | |
type string, | |
timestamp string, | |
elb string, | |
client_ip string, | |
client_port int, | |
target_ip string, | |
target_port int, | |
request_processing_time double, | |
target_processing_time double, | |
response_processing_time double, | |
elb_status_code string, | |
target_status_code string, | |
received_bytes bigint, | |
sent_bytes bigint, | |
request_verb string, | |
url string, | |
protocol string, | |
user_agent string, | |
ssl_cipher string, | |
ssl_protocol string, | |
target_group_arn string, | |
trace_id string ) | |
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' | |
WITH SERDEPROPERTIES ( | |
'serialization.format' = '1','input.regex' = '([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*):([0-9]*) ([^ ]*):([0-9]*) ([.0-9]*) ([.0-9]*) ([.0-9]*) (-|[0-9]*) (-|[0-9]*) ([-0-9]*) ([-0-9]*) \\\"([^ ]*) ([^ ]*) (- |[^ ]*)\\\" (\"[^\"]*\") ([A-Z0-9-]+) ([A-Za-z0-9.-]*) ([^ ]*) ([^ ]*)$' ) | |
LOCATION 's3://somebucket/AWSLogs/'; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It seems like logs have recently started ending with a trailing space which breaks most of the parsers Ive seen. That should be easy to fix. I also added a little regex and couple fields for separating the http route and query params out: https://gist.github.com/szinck/d456fbf691483ab77d2453c316db3371