Created
November 10, 2017 16:07
-
-
Save hmain/2aeab201aea1b247452e25891a93176f to your computer and use it in GitHub Desktop.
Query S3 Access Logs with AWS Athena
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* Create a database in Athena */ | |
| CREATE DATABASE s3_AccessLogsDB; | |
| /* | |
| Create a table within the Athena database | |
| Replace s3://MY_S3_BUCKET/ with your S3 bucket | |
| which has the access logs you want to query | |
| */ | |
| CREATE EXTERNAL TABLE IF NOT EXISTS s3_AccessLogs.Accesslogs( | |
| BucketOwner string, | |
| Bucket string, | |
| RequestDateTime string, | |
| RemoteIP string, | |
| Requester string, | |
| RequestID string, | |
| Operation string, | |
| Key string, | |
| RequestURI_operation string, | |
| RequestURI_key string, | |
| RequestURI_httpProtoversion string, | |
| HTTPstatus string, | |
| ErrorCode string, | |
| BytesSent string, | |
| ObjectSize string, | |
| TotalTime string, | |
| TurnAroundTime string, | |
| Referrer string, | |
| UserAgent string, | |
| VersionId string) | |
| ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' | |
| WITH SERDEPROPERTIES ( | |
| 'serialization.format' = '1', | |
| 'input.regex' = '([^ ]*) ([^ ]*) \\[(.*?)\\] ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) \\\"([^ ]*) ([^ ]*) (- |[^ ]*)\\\" (-|[0-9]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) (\"[^\"]*\") ([^ ]*)$' | |
| ) LOCATION 's3://MY_S3_BUCKET/' | |
| /* | |
| Query for access outside of AWS and your VPC | |
| Replace 172.31% with your VPC CIDR | |
| */ | |
| SELECT Key, | |
| UserAgent, | |
| RemoteIp, | |
| count(*) AS cnt | |
| FROM Accesslogs | |
| WHERE regexp_like(RequestURI_operation, 'GET|HEAD') | |
| AND Requester LIKE '-' | |
| AND NOT regexp_like(UserAgent, 'Elastic|aws') | |
| AND RemoteIp NOT LIKE '172.31%' | |
| GROUP BY RemoteIp, Key, UserAgent, RemoteIp | |
| ORDER BY cnt DESC LIMIT 10 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment