Created
September 4, 2023 19:23
-
-
Save purcell/6e72406f77340e6d9bc451c72a316b8f to your computer and use it in GitHub Desktop.
Use DuckDB to convert a compressed web access log in Combined Log Format to Parquet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
COPY ( | |
WITH | |
-- Read the raw log line by line by abusing CSV parser | |
raw_log AS ( | |
FROM read_csv_auto('/tmp/log/access.log-20230904.gz', header=false, delim='\0') | |
) | |
, combined_log AS ( | |
SELECT regexp_extract(column0 | |
, '^(\S+) (\S+) (\S+) \[(.*?)\] "([A-Z]+?) (.*?) HTTP/(.*?)" (\d+) (\d+) "(.*?)" "(.*?)"$' | |
, [ 'ip', 'identity', 'userid', 'timestamp', 'method' | |
, 'request', 'proto', 'status', 'bytes', 'referrer', 'agent']) AS fields | |
FROM raw_log | |
) | |
SELECT fields.ip | |
, NULLIF(fields.identity, '-') AS identity | |
, NULLIF(fields.userid, '-') as userid | |
, strptime(fields.timestamp, '%d/%b/%Y:%H:%M:%S %z') as timestamp | |
, fields.method | |
, fields.request | |
, fields.proto::DECIMAL(3,2) as proto | |
, NULLIF(fields.status, '-')::USMALLINT AS status | |
, NULLIF(fields.bytes, '-')::UINTEGER AS bytes | |
, NULLIF(fields.referrer, '-') AS referrer | |
, fields.agent | |
FROM combined_log | |
) | |
TO '/tmp/log/access.log-20230904.gz.parquet'; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment