serverless siem fixes
LambdaSortGDFindings:
  Type: 'AWS::Lambda::Function'
  Condition: AuditAccountPrimary  # only in the primary audit region, next to the S3 bucket
  Properties:
    Code:
      ZipFile: |
        import boto3, csv, os, json

        s3_resource = boto3.resource('s3')
        s3_client = boto3.client('s3')

        def sanitize_key_names(finding):
            """AWS Athena does not allow column names to contain special characters
            other than underscores. Some GuardDuty findings contain JSON dicts with
            '::' in the key names. AWS Glue will crawl these findings and create
            schemas that cause Athena queries to fail with HIVE_METASTORE errors.
            As of July 2019, the 5 GuardDuty finding types that contain these fields are:
                stealth_iamuser_cloudtrailloggingdisabled
                recon_iamuser_maliciousipcaller
                stealth_iamuser_loggingconfigurationmodified
                unauthorizedaccess_iamuser_consolelogin
                privilegeescalation_iamuser_administrativepermissions
            To view the problematic fields, you can use jq:
                `cat finding.json | jq '.detail.service.action.awsApiCallAction.affectedResources'`
            This will show output like:
                {
                  "AWS::IAM::User": "GeneratedFindingIAMUser",
                  "AWS::IAM::Role": "GeneratedFindingIAMRole"
                }
            This function recurses through a dictionary, replacing any '::' in key
            names with '_'.
            """
            if type(finding) is dict:
                # Iterate over a copy of the keys so renaming does not mutate the
                # dict while it is being iterated.
                for key in list(finding.keys()):
                    if type(finding[key]) is dict or type(finding[key]) is list:
                        finding[key] = sanitize_key_names(finding[key])
                    if "::" in key:
                        new_key = key.replace("::", "_")
                        finding[new_key] = finding.pop(key)
            elif type(finding) is list:
                # Write the sanitized entry back by index so replacements stick.
                for i, entry in enumerate(finding):
                    finding[i] = sanitize_key_names(entry)
            return finding

        def lambda_handler(event, context):
            print(event)
            record_count = 0
            for record in event['Records']:
                bucket = record['s3']['bucket']['name']
                object_key = record['s3']['object']['key']
                partition = '/'.join(object_key.split('/')[2:-1])
                response = s3_client.get_object(Bucket=bucket, Key=object_key)
                # GuardDuty delivers concatenated JSON objects; wrap them into a JSON array.
                findings = '[' + response['Body'].read().decode('utf-8').replace('}{', '},\n{') + ']'
                findings_list = json.loads(findings)
                record_count += len(findings_list)
                output = {}
                for item in findings_list:
                    fixed_item = sanitize_key_names(item)
                    if fixed_item['detail']['type'] not in output:
                        output[fixed_item['detail']['type']] = [fixed_item]
                    else:
                        output[fixed_item['detail']['type']].append(fixed_item)
                for finding_type in output:
                    print(object_key.split('/')[-1])
                    s3_path = ('raw/by_finding_type/' + '_'.join(finding_type.split('/')) + '/' +
                               partition + '/' + object_key.split('/')[-1] + '.json')
                    body = ''
                    for version in output[finding_type]:
                        body += json.dumps(version) + '\n'
                    s3_resource.Bucket(bucket).put_object(Key=s3_path, Body=body)
            return 'Processed: ' + str(record_count) + ' logs'
    Handler: index.lambda_handler
    Runtime: python3.6
    Description: 'Function sorts findings by type and places them under the appropriate S3 prefix'
    MemorySize: 128
    Timeout: 300
    Role:
      Fn::GetAtt:
        - LambdaSortRole
        - Arn
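As a quick local sanity check (not part of the template), the key renaming can be exercised against a finding shaped like the docstring's jq example. This is a minimal sketch: it copies sanitize_key_names out of the ZipFile above so it runs with only the standard library, and the sample finding fragment is hypothetical except for the affectedResources keys taken from the docstring.

# sanitize_demo.py -- standalone sketch; copies sanitize_key_names from the
# ZipFile above so the snippet runs without boto3 or AWS credentials.
import json

def sanitize_key_names(finding):
    if type(finding) is dict:
        for key in list(finding.keys()):
            if type(finding[key]) is dict or type(finding[key]) is list:
                finding[key] = sanitize_key_names(finding[key])
            if "::" in key:
                finding[key.replace("::", "_")] = finding.pop(key)
    elif type(finding) is list:
        for i, entry in enumerate(finding):
            finding[i] = sanitize_key_names(entry)
    return finding

# Hypothetical finding fragment mirroring the docstring's jq output.
sample = {
    "detail": {
        "service": {
            "action": {
                "awsApiCallAction": {
                    "affectedResources": {
                        "AWS::IAM::User": "GeneratedFindingIAMUser",
                        "AWS::IAM::Role": "GeneratedFindingIAMRole"
                    }
                }
            }
        }
    }
}

print(json.dumps(sanitize_key_names(sample), indent=2))
# The affectedResources keys come back as "AWS_IAM_User" and "AWS_IAM_Role",
# which Glue can crawl and Athena can query without HIVE_METASTORE errors.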