Last active
August 22, 2024 20:34
-
-
Save MarkRoddy/563b9194f5d196545eef4a59c913d8a6 to your computer and use it in GitHub Desktop.
DuckDB: Query S3 Access Logs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Usage: you'll want to search for the strings <bucket> and <prefix>, and insert the S3 bucket where your access | |
logs are being delivered. Use (or delete) <prefix> to filter to a subset of your logs. | |
*/ | |
/* | |
These commented out configuration settings you can either run yourself in the REPL and source this file using | |
`.read parse_s3_access_logs.sql`, or you can uncomment them and supply values for yourself. | |
*/ | |
-- install https; | |
-- load https; | |
-- SET s3_region='us-west-2'; | |
-- SET s3_access_key_id=''; | |
-- SET s3_secret_access_key=''; | |
WITH parsed_logs AS ( | |
SELECT | |
regexp_extract(col1, '^([0-9a-zA-Z]+)\s+([a-z0-9.\-]+)\s+\[([0-9/A-Za-z: +]+)\] ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ("[^"]*"|-) ([^ ]+) ([^ ]+) (\d+|-) (\d+|-) (\d+|-) (\d+|-) ("[^"]*"|-) ("[^"]*"|-) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+)(.*)$', | |
[ | |
'bucket_owner', 'bucket', 'timestamp', 'remote_ip', 'request', 'request_id', | |
'operation', 's3_key', 'request_uri', 'http_status', 's3_errorcode', 'bytes_sent', | |
'object_size', 'total_time', 'turn_around_time', 'referer', 'user_agent', | |
'version_id', 'host_id', 'sigver', 'cyphersuite', 'auth_type', 'host_header', | |
'tls_version', 'access_point_arn', 'acl_required', 'extra' | |
]) AS log_struct | |
FROM | |
-- Trick the CSV reader into reading as a single column | |
read_csv( | |
's3://<bucket>/<prefix>/*', | |
columns={'col1': 'VARCHAR'}, | |
-- Use a *hopefully* nonsensical deliminator, so no ',' chars screw us up | |
delim='\0' | |
) | |
) | |
SELECT | |
-- Grab everything from the struct that we want as strings, exclude stuff we'll coersce to diff types | |
log_struct.* exclude (timestamp, bytes_sent, object_size, total_time, turn_around_time), | |
strptime(log_struct.timestamp, '%d/%b/%Y:%H:%M:%S %z') AS timestamp, | |
CASE | |
WHEN log_struct.bytes_sent = '-' THEN NULL | |
ELSE CAST(log_struct.bytes_sent AS INTEGER) | |
END AS bytes_sent, | |
CASE | |
WHEN log_struct.object_size = '-' THEN NULL | |
ELSE CAST(log_struct.object_size AS INTEGER) | |
END AS object_size, | |
CASE | |
WHEN log_struct.total_time = '-' THEN NULL | |
ELSE CAST(log_struct.total_time AS INTEGER) | |
END AS total_time, | |
CASE | |
WHEN log_struct.turn_around_time = '-' THEN NULL | |
ELSE CAST(log_struct.turn_around_time AS INTEGER) | |
END AS turn_around_time | |
FROM parsed_logs; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Couldn't the integer casting be simplified with
try_cast
function here? I.e. instead of:it would look like