Created
December 28, 2022 22:03
-
-
Save maiconbaum/0a2a98901e0577262de918843fe94a3a to your computer and use it in GitHub Desktop.
AWS Glue Catalog Table for AWS VPC Flow Logs using Apache Hive Compatible Format and Partition Projection.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Glue Catalog table for VPC Flow Logs delivered to S3 in Parquet with
-- Hive-compatible prefixes, queried from Athena via partition projection
-- (no crawler or MSCK REPAIR TABLE needed — partitions are computed from
-- the TBLPROPERTIES below).
CREATE EXTERNAL TABLE IF NOT EXISTS aws_vpc_flow_logs (
    `version` int,
    `account_id` string,
    `interface_id` string,
    `srcaddr` string,
    `dstaddr` string,
    `srcport` int,
    `dstport` int,
    `protocol` bigint,
    `packets` bigint,
    `bytes` bigint,
    `start` bigint,
    `end` bigint,
    `action` string,
    `log_status` string,
    `vpc_id` string,
    `subnet_id` string,
    `instance_id` string,
    `tcp_flags` int,
    `type` string,
    `pkt_srcaddr` string,
    `pkt_dstaddr` string,
    `az_id` string,
    `sublocation_type` string,
    `sublocation_id` string,
    `pkt_src_aws_service` string,
    `pkt_dst_aws_service` string,
    `flow_direction` string,
    `traffic_path` int
)
-- Partition columns mirror the Hive-compatible S3 prefix layout:
-- .../aws-region=<r>/year=<y>/month=<m>/day=<d>/hour=<h>/
PARTITIONED BY (
    `region` string,
    `year` string,
    `month` int,
    `day` int,
    `hour` int
)
ROW FORMAT SERDE
    'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
    'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
    'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
    's3://<BUCKET-NAME>/AWSLogs/aws-account-id=012345678912/aws-service=vpcflowlogs/'
TBLPROPERTIES (
    -- NOTE: removed "skip.header.line.count"="1" — that property only
    -- applies to text-format SerDes (CSV/TSV); Parquet files carry their
    -- schema internally and have no header row to skip.
    "projection.enabled" = "true",
    -- Extend this enum with every region you deliver flow logs from.
    "projection.region.type" = "enum",
    "projection.region.values" = "sa-east-1,us-east-1",
    -- Open-ended date range: new yearly partitions appear automatically.
    "projection.year.type" = "date",
    "projection.year.format" = "yyyy",
    "projection.year.range" = "2022,NOW",
    "projection.year.interval" = "1",
    "projection.year.unit" = "YEARS",
    -- "digits" zero-pads the projected values to match the S3 key names
    -- (month=01, day=05, hour=09, ...).
    "projection.month.type" = "integer",
    "projection.month.range" = "01,12",
    "projection.month.digits" = "2",
    "projection.day.type" = "integer",
    "projection.day.range" = "01,31",
    "projection.day.digits" = "2",
    "projection.hour.type" = "integer",
    "projection.hour.range" = "00,23",
    "projection.hour.digits" = "2",
    -- Template maps partition values back onto the S3 prefix; note the
    -- "aws-region=" prefix used by Hive-compatible flow-log delivery.
    "storage.location.template" = "s3://<BUCKET-NAME>/AWSLogs/aws-account-id=012345678912/aws-service=vpcflowlogs/aws-region=${region}/year=${year}/month=${month}/day=${day}/hour=${hour}/"
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is exactly what I was after!!! Thanks for the work @maiconbaum
Your TF write up will also come in handy @ryangraham
I was wondering why one would go to all the effort of using the Hive partitioning style in the first place — am I making things more complicated than they need to be? What does Hive-style partitioning provide that the standard partitioning style AWS seems to use in many of its other services doesn't?