Created
December 28, 2022 22:03
-
-
Save maiconbaum/0a2a98901e0577262de918843fe94a3a to your computer and use it in GitHub Desktop.
AWS Glue Catalog Table for AWS VPC Flow Logs using Apache Hive Compatible Format and Partition Projection.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Glue Catalog table for VPC Flow Logs delivered to S3 in Parquet with
-- Hive-compatible prefixes, queried from Athena via partition projection
-- (no crawler or MSCK REPAIR TABLE needed — partitions are computed from
-- the TBLPROPERTIES below).
CREATE EXTERNAL TABLE IF NOT EXISTS aws_vpc_flow_logs (
    `version` int,
    `account_id` string,
    `interface_id` string,
    `srcaddr` string,
    `dstaddr` string,
    `srcport` int,
    `dstport` int,
    `protocol` bigint,
    `packets` bigint,
    `bytes` bigint,
    `start` bigint,
    `end` bigint,
    `action` string,
    `log_status` string,
    `vpc_id` string,
    `subnet_id` string,
    `instance_id` string,
    `tcp_flags` int,
    `type` string,
    `pkt_srcaddr` string,
    `pkt_dstaddr` string,
    `az_id` string,
    `sublocation_type` string,
    `sublocation_id` string,
    `pkt_src_aws_service` string,
    `pkt_dst_aws_service` string,
    `flow_direction` string,
    `traffic_path` int
)
-- Partition columns mirror the Hive-compatible S3 prefix layout:
-- .../aws-region=<r>/year=<y>/month=<m>/day=<d>/hour=<h>/
PARTITIONED BY (
    `region` string,
    `year` string,
    `month` int,
    `day` int,
    `hour` int
)
ROW FORMAT SERDE
    'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
    'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
    'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
    's3://<BUCKET-NAME>/AWSLogs/aws-account-id=012345678912/aws-service=vpcflowlogs/'
TBLPROPERTIES (
    -- NOTE: removed "skip.header.line.count"="1" — that property only
    -- applies to text-format SerDes (CSV/TSV); Parquet files carry their
    -- schema internally and have no header row to skip.
    "projection.enabled" = "true",
    -- Extend this enum with every region you deliver flow logs from.
    "projection.region.type" = "enum",
    "projection.region.values" = "sa-east-1,us-east-1",
    -- Open-ended date range: new yearly partitions appear automatically.
    "projection.year.type" = "date",
    "projection.year.format" = "yyyy",
    "projection.year.range" = "2022,NOW",
    "projection.year.interval" = "1",
    "projection.year.unit" = "YEARS",
    -- "digits" zero-pads the projected values to match the S3 key names
    -- (month=01, day=05, hour=09, ...).
    "projection.month.type" = "integer",
    "projection.month.range" = "01,12",
    "projection.month.digits" = "2",
    "projection.day.type" = "integer",
    "projection.day.range" = "01,31",
    "projection.day.digits" = "2",
    "projection.hour.type" = "integer",
    "projection.hour.range" = "00,23",
    "projection.hour.digits" = "2",
    -- Template maps partition values back onto the S3 prefix; note the
    -- "aws-region=" prefix used by Hive-compatible flow-log delivery.
    "storage.location.template" = "s3://<BUCKET-NAME>/AWSLogs/aws-account-id=012345678912/aws-service=vpcflowlogs/aws-region=${region}/year=${year}/month=${month}/day=${day}/hour=${hour}/"
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is exactly what I was after!!! Thanks for the work @maiconbaum
Your TF write up will also come in handy @ryangraham
I was wondering why one would go to all the effort of using the Hive partitioning style in the first place — am I making things more complicated than they need to be? What does Hive-style partitioning provide that the standard partitioning style AWS seems to use in many of its other services doesn't?