Skip to content

Instantly share code, notes, and snippets.

@hmain
Created November 10, 2017 16:07
Show Gist options
  • Select an option

  • Save hmain/2aeab201aea1b247452e25891a93176f to your computer and use it in GitHub Desktop.

Select an option

Save hmain/2aeab201aea1b247452e25891a93176f to your computer and use it in GitHub Desktop.
Query S3 Access Logs with AWS Athena
/* Create a database in Athena */
CREATE DATABASE s3_AccessLogsDB;
/*
Create a table within the Athena database
Replace s3://MY_S3_BUCKET/ with your S3 bucket
which has the access logs you want to query
*/
CREATE EXTERNAL TABLE IF NOT EXISTS s3_AccessLogs.Accesslogs(
BucketOwner string,
Bucket string,
RequestDateTime string,
RemoteIP string,
Requester string,
RequestID string,
Operation string,
Key string,
RequestURI_operation string,
RequestURI_key string,
RequestURI_httpProtoversion string,
HTTPstatus string,
ErrorCode string,
BytesSent string,
ObjectSize string,
TotalTime string,
TurnAroundTime string,
Referrer string,
UserAgent string,
VersionId string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
'serialization.format' = '1',
'input.regex' = '([^ ]*) ([^ ]*) \\[(.*?)\\] ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) \\\"([^ ]*) ([^ ]*) (- |[^ ]*)\\\" (-|[0-9]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) (\"[^\"]*\") ([^ ]*)$'
) LOCATION 's3://MY_S3_BUCKET/'
/*
Query for access outside of AWS and your VPC
Replace 172.31% with your VPC CIDR
*/
SELECT Key,
UserAgent,
RemoteIp,
count(*) AS cnt
FROM Accesslogs
WHERE regexp_like(RequestURI_operation, 'GET|HEAD')
AND Requester LIKE '-'
AND NOT regexp_like(UserAgent, 'Elastic|aws')
AND RemoteIp NOT LIKE '172.31%'
GROUP BY RemoteIp, Key, UserAgent, RemoteIp
ORDER BY cnt DESC LIMIT 10
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment