- See the docs for reference
- Replace the values `BUCKET_NAME`, `AWS_ACCOUNT_NUMBER`, `AWS_REGION`, `WEB_ACL_NAME` accordingly
- Replace `'projection.date.range' = '2023/01/01/00/00,NOW',` with the date your WAF logging begins (find it in S3)

Query
CREATE EXTERNAL TABLE `waf_logs`(
  `timestamp` bigint,
  `formatversion` int,
  `webaclid` string,
  `terminatingruleid` string,
  `terminatingruletype` string,
  `action` string,
  `terminatingrulematchdetails` array < struct < conditiontype: string, sensitivitylevel: string, location: string, matcheddata: array < string > > >,
  `httpsourcename` string,
  `httpsourceid` string,
  `rulegrouplist` array < struct <
    rulegroupid: string,
    terminatingrule: struct < ruleid: string, action: string, rulematchdetails: array < struct < conditiontype: string, sensitivitylevel: string, location: string, matcheddata: array < string > > > >,
    nonterminatingmatchingrules: array < struct < ruleid: string, action: string, rulematchdetails: array < struct < conditiontype: string, sensitivitylevel: string, location: string, matcheddata: array < string > > > > >,
    excludedrules: string > >,
  `ratebasedrulelist` array < struct < ratebasedruleid: string, limitkey: string, maxrateallowed: int > >,
  `nonterminatingmatchingrules` array < struct < ruleid: string, action: string, rulematchdetails: array < struct < conditiontype: string, sensitivitylevel: string, location: string, matcheddata: array < string > > >, captcharesponse: struct < responsecode: string, solvetimestamp: string > > >,
  `requestheadersinserted` array < struct < name: string, value: string > >,
  `responsecodesent` string,
  `httprequest` struct < clientip: string, country: string, headers: array < struct < name: string, value: string > >, uri: string, args: string, httpversion: string, httpmethod: string, requestid: string >,
  `labels` array < struct < name: string > >,
  `captcharesponse` struct < responsecode: string, solvetimestamp: string, failureReason: string >
)
PARTITIONED BY (
  `region` string,
  `date` string)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://BUCKET_NAME/AWSLogs/AWS_ACCOUNT_NUMBER/WAFLogs/AWS_REGION/WEB_ACL_NAME/'
TBLPROPERTIES(
  'projection.enabled' = 'true',
  'projection.region.type' = 'enum',
  'projection.region.values' = 'AWS_REGION',
  'projection.date.type' = 'date',
  'projection.date.range' = '2023/01/01/00/00,NOW',
  'projection.date.format' = 'yyyy/MM/dd/HH/mm',
  'projection.date.interval' = '1',
  'projection.date.interval.unit' = 'HOURS',
  'storage.location.template' = 's3://BUCKET_NAME/AWSLogs/AWS_ACCOUNT_NUMBER/WAFLogs/${region}/WEB_ACL_NAME/${date}')
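Once the table exists, a quick aggregation is a handy way to confirm the partition projection resolves. This is only a sketch: the region and date literals below mirror the placeholders in the DDL above and need to be replaced with real values from your setup.

Query
-- Example only: count blocked requests per terminating rule for one day
SELECT terminatingruleid, count(*) AS blocked_requests
FROM waf_logs
WHERE region = 'AWS_REGION' -- placeholder, e.g. us-east-1
AND date >= '2023/01/01/00/00' -- partition values use the yyyy/MM/dd/HH/mm format
AND date < '2023/01/02/00/00'
AND action = 'BLOCK'
GROUP BY 1
ORDER BY 2 DESC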
- See the docs here
- Replace the values `BUCKET_NAME`, `AWS_ACCOUNT_NUMBER`, `AWS_REGION` accordingly
- Replace `'projection.date.range' = '2022/01/01,NOW',` with the date your ALB logging begins (find it in S3)

Query
CREATE EXTERNAL TABLE IF NOT EXISTS alb_logs_projected (
  type string,
  time string,
  elb string,
  client_ip string,
  client_port int,
  target_ip string,
  target_port int,
  request_processing_time double,
  target_processing_time double,
  response_processing_time double,
  elb_status_code int,
  target_status_code string,
  received_bytes bigint,
  sent_bytes bigint,
  request_verb string,
  request_url string,
  request_proto string,
  user_agent string,
  ssl_cipher string,
  ssl_protocol string,
  target_group_arn string,
  trace_id string,
  domain_name string,
  chosen_cert_arn string,
  matched_rule_priority string,
  request_creation_time string,
  actions_executed string,
  redirect_url string,
  lambda_error_reason string,
  target_port_list string,
  target_status_code_list string,
  classification string,
  classification_reason string
)
PARTITIONED BY (
  `date` STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  'serialization.format' = '1',
  'input.regex' = '([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*):([0-9]*) ([^ ]*)[:-]([0-9]*) ([-.0-9]*) ([-.0-9]*) ([-.0-9]*) (|[-0-9]*) (-|[-0-9]*) ([-0-9]*) ([-0-9]*) \"([^ ]*) (.*) (- |[^ ]*)\" \"([^\"]*)\" ([A-Z0-9-_]+) ([A-Za-z0-9.-]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^\"]*)\" ([-.0-9]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^ ]*)\" \"([^\\s]+?)\" \"([^\\s]+)\" \"([^ ]*)\" \"([^ ]*)\" ?([^ ]*)?'
)
LOCATION 's3://BUCKET_NAME/AWSLogs/AWS_ACCOUNT_NUMBER/elasticloadbalancing/AWS_REGION/'
TBLPROPERTIES (
  "projection.enabled" = "true",
  "projection.date.type" = "date",
  "projection.date.range" = "2022/01/01,NOW",
  "projection.date.format" = "yyyy/MM/dd",
  "projection.date.interval" = "1",
  "projection.date.interval.unit" = "DAYS",
  "storage.location.template" = "s3://BUCKET_NAME/AWSLogs/AWS_ACCOUNT_NUMBER/elasticloadbalancing/AWS_REGION/${date}"
)
Query
-- Get the IP address of a crashed task from the Stopped Tasks widget on the Production-API-Segmented CloudWatch dashboard
SELECT *
FROM alb_logs_projected
WHERE date = '2022/10/17'
AND target_processing_time > 10 -- seconds; requests that take a long time to process can be problematic
AND received_bytes > 8000 -- bytes; large payloads can typically cause OutOfMemory crashes on ECS tasks
-- AND request_url LIKE '%some_url'
-- AND regexp_like(request_verb, 'PUT')
AND regexp_like(target_ip, '172.31.29.76') -- check the IP of the failed task
-- AND regexp_like(target_ip, '172.31.29.76|172.31.14.108|172.31.18.14|172.31.11.49|172.31.27.153') -- check multiple IPs
ORDER BY time DESC
Query
SELECT
request_verb,
request_url,
count(*) as count,
elb_status_code,
count(*)/(12 * 60) as count_per_min,
ceil(avg(received_bytes/1024)) as avg_received_kb,
ceil(avg(sent_bytes/1024)) as avg_sent_kb,
ceil(avg(target_processing_time)) as avg_target_processing_time_secs,
ceil(max(received_bytes/1024)) as max_received_kb,
ceil(max(sent_bytes/1024)) as max_sent_kb,
ceil(max(target_processing_time)) as max_target_processing_time_secs,
ceil((ceil(avg(target_processing_time)) * count(*))/3600) as total_processing_time_hrs
FROM alb_logs_projected
WHERE date like '2023/01/%'
GROUP BY 1, 2, 4
ORDER BY 3 DESC
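Another variation worth keeping around (a sketch only; the date filter below is a placeholder) surfaces which URLs are returning 5xx responses:

Query
SELECT request_verb, request_url, elb_status_code, count(*) AS errors
FROM alb_logs_projected
WHERE date LIKE '2023/01/%' -- placeholder date range
AND elb_status_code >= 500
GROUP BY 1, 2, 3
ORDER BY 4 DESC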
Query
CREATE EXTERNAL TABLE cloudtrail_logs_pp(
eventVersion STRING,
userIdentity STRUCT<
type: STRING,
principalId: STRING,
arn: STRING,
accountId: STRING,
invokedBy: STRING,
accessKeyId: STRING,
userName: STRING,
sessionContext: STRUCT<
attributes: STRUCT<
mfaAuthenticated: STRING,
creationDate: STRING>,
sessionIssuer: STRUCT<
type: STRING,
principalId: STRING,
arn: STRING,
accountId: STRING,
userName: STRING>,
ec2RoleDelivery:string,
webIdFederationData:map<string,string>
>
>,
eventTime STRING,
eventSource STRING,
eventName STRING,
awsRegion STRING,
sourceIpAddress STRING,
userAgent STRING,
errorCode STRING,
errorMessage STRING,
requestparameters STRING,
responseelements STRING,
additionaleventdata STRING,
requestId STRING,
eventId STRING,
readOnly STRING,
resources ARRAY<STRUCT<
arn: STRING,
accountId: STRING,
type: STRING>>,
eventType STRING,
apiVersion STRING,
recipientAccountId STRING,
serviceEventDetails STRING,
sharedEventID STRING,
vpcendpointid STRING,
tlsDetails struct<
tlsVersion:string,
cipherSuite:string,
clientProvidedHostHeader:string>
)
PARTITIONED BY (
`account` STRING,
`region` STRING,
`timestamp` STRING)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS INPUTFORMAT 'com.amazon.emr.cloudtrail.CloudTrailInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://CLOUDTRAIL_BUCKET_NAME/AWSLogs/o-l09l6xxxx'
TBLPROPERTIES (
'projection.enabled'='true',
'projection.account.type'='enum',
'projection.account.values'='ACCOUNT_1,ACCOUNT_2,ACCOUNT_3,ACCOUNT_4,ACCOUNT_5,ACCOUNT_6,ACCOUNT_7',
'projection.region.type'='enum',
'projection.region.values'='ca-central-1,us-east-1,us-east-2,eu-central-1',
'projection.timestamp.format'='yyyy/MM/dd',
'projection.timestamp.interval'='1',
'projection.timestamp.interval.unit'='DAYS',
'projection.timestamp.range'='2023/01/01,NOW',
'projection.timestamp.type'='date',
'storage.location.template'='s3://CLOUDTRAIL_BUCKET_NAME/AWSLogs/o-l09l6xxxx/${account}/CloudTrail/${region}/${timestamp}')
- Choose the right projection type based on the S3 location. Since I was logging CloudTrail across an organization, the S3 location turned out different from the one in the AWS docs: an ID like `o-l09l6xxxx` is added to the S3 path, and I needed to account for that.
- It's super important to note that even though SQL is case insensitive, everything in `storage.location.template` has to be in lower case, no exceptions.
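With the table in place, queries should filter on the projected partition columns (`account`, `region`, `timestamp`) so Athena only scans the relevant S3 prefixes. A minimal sketch, assuming placeholder account and date values:

Query
-- Example only: recent AccessDenied errors in one account and region
SELECT eventtime, useridentity.arn, eventname, errorcode, errormessage
FROM cloudtrail_logs_pp
WHERE account = 'ACCOUNT_1' -- placeholder account ID from projection.account.values
AND region = 'us-east-1'
AND "timestamp" >= '2023/01/01' -- partition values use the yyyy/MM/dd format
AND errorcode = 'AccessDenied'
ORDER BY eventtime DESC
LIMIT 100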