AWS Lambda function to gzip-compress files when they are uploaded to S3 (the original file is replaced with the gzipped version)
###
### This gist contains 2 files: settings.json and lambda_function.py
###
### settings.json
{
    "extensions" : ["*.hdr", "*.glb", "*.wasm"]
}
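Note: the entries are standard fnmatch glob patterns matched against the full object key. A quick illustration (the key below is only an example):

import fnmatch
key = "assets/scene.glb"                                  # example object key
patterns = ["*.hdr", "*.glb", "*.wasm"]
print any(fnmatch.fnmatch(key, p) for p in patterns)      # prints True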
### lambda_function.py
'''
This script converts an uncompressed S3 file into a gzip-compressed file. The object is replaced in place: the original file is deleted (overwritten by the gzipped version).
Create a role with S3 (read/write) and CloudWatch Logs access:
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "VisualEditor0",
            "Effect": "Allow",
            "Action": [
                "logs:CreateLogStream",
                "s3:*",
                "logs:PutLogEvents"
            ],
            "Resource": "*"
        },
        {
            "Sid": "VisualEditor1",
            "Effect": "Allow",
            "Action": "logs:CreateLogGroup",
            "Resource": "*"
        }
    ]
}
Install the Lambda in the same region as the bucket, with the Python 2.7 runtime and a 1-minute max execution time.
Change "settings.json" to add or remove the extensions you want to compress.
The trigger is an S3 PUT event (select the bucket the Lambda applies to); output goes to S3 and CloudWatch Logs.
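The notification can also be set up with boto3; a rough sketch (the bucket name and function ARN are placeholders, and the function must already allow s3.amazonaws.com to invoke it):

import boto3
s3 = boto3.client('s3')
s3.put_bucket_notification_configuration(
    Bucket='my-bucket',                # placeholder bucket name
    NotificationConfiguration={
        'LambdaFunctionConfigurations': [{
            'LambdaFunctionArn': 'arn:aws:lambda:eu-west-1:123456789012:function:gzip-on-upload',  # placeholder ARN
            'Events': ['s3:ObjectCreated:Put']
        }]
    })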
How it works:
- on each PUT event (a new file is uploaded to the bucket), an event is sent to the lambda function (note: this does not work with multipart uploads).
- the lambda wakes up and analyzes the incoming file
- reads the metadata of the incoming file
- if the file already has the "gzip" ContentEncoding HTTP header, it is already compressed, so there is no need to recompress it
- if the file is too small (hard-coded: 1024 bytes): no compression
- if the file does not have a recognized extension (see settings.json): no compression
- if the file passes all the previous checks, it is downloaded locally (to /tmp)
- gzip the local version using the OS "gzip" tool (could be improved by using Python's built-in gzip module - TODO, see the sketch below)
- overwrite the file in the bucket with the locally gzipped version
- update the metadata with the previous values plus ContentEncoding set to "gzip"
- delete the locally gzipped version
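
A possible pure-python replacement for the external "gzip" call mentioned in the TODO above (the function name and arguments are illustrative, not part of the current code):

import gzip
import shutil

def gzip_file(src, dst, level=9):
    # stream-copy src into a gzip-compressed dst using only the standard library
    with open(src, 'rb') as f_in, gzip.open(dst, 'wb', level) as f_out:
        shutil.copyfileobj(f_in, f_out)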
'''
import json
import pprint
import boto3
import botocore
import tempfile
import os
import subprocess
import fnmatch
import urllib

def lambda_handler(event, context):
    with open("settings.json") as json_data:
        settings = json.load(json_data)
    #print "EVENT :"
    client = boto3.client('s3')
    s3 = boto3.resource('s3')
    for r in event.get('Records'):
        # pprint.pprint(r)
        bucketName = r.get('s3').get('bucket').get('name')
        objectKey = r.get('s3').get('object').get('key')
        etag = r.get('s3').get('object').get('eTag')
        # object keys in S3 event notifications are URL-encoded (e.g. spaces become '+')
        objectKey = urllib.unquote_plus(objectKey.encode('utf8'))
        print "Retrieving object:"
        print "   bucketName = " + bucketName
        print "   objectKey  = " + objectKey
        uploadedMeta = client.head_object(Bucket=bucketName, Key=objectKey, IfMatch=etag)
        contentEncoding = uploadedMeta.get('ContentEncoding', None)
        size = uploadedMeta.get('ContentLength', 0)
        print "   Current encoding = " + str(contentEncoding)
        print "   Size = " + str(size)
        if (contentEncoding == 'gzip'):
            print(" ==> File is already compressed")
            return True
        match = False
        for ext in settings['extensions']:
            if fnmatch.fnmatch(objectKey, ext):
                match = True
                break
        if (match == False):
            print(" ==> File extension is not activated for compression. See settings.json")
            return True
        if (size < 1024):
            print(" ==> File is too small to be compressed")
            return True
        tmp_in = tempfile.mkdtemp() + '.orig'
        tmp_out = tmp_in + '.gz'  # must be .gz because that is the default gzip output name
        new_objectKey = objectKey + '.gz'  # not used below: the object is overwritten under its original key
        print("Download content to " + tmp_in + " and gzip it to " + tmp_out)
        s3.Bucket(bucketName).download_file(objectKey, tmp_in)
        print("GZipping file")
        # the gzip command replaces tmp_in with tmp_in + '.gz'; its -v report goes to stderr
        print subprocess.check_output(['gzip', '-v', '-f', '-9', tmp_in], stderr=subprocess.STDOUT)
        statinfo = os.stat(tmp_out)
        newsize = statinfo.st_size
        print "New gzipped file size = " + str(newsize)
        if (size - newsize < 1024):
            print "Compression is not efficient, keep original file"
            os.remove(tmp_out)
            return True
        print "Overwriting S3 file with gzipped version"
        # Recreate metadata from original file (including http headers)
        # Todo : keep original upload date
        extraArgs = {
            'ContentEncoding': "gzip"
        }
        for m in ['Metadata', 'CacheControl', 'ContentDisposition', 'ContentLanguage', 'ContentType', 'Expires']:
            if (uploadedMeta.get(m, None) != None):
                extraArgs[m] = uploadedMeta.get(m)
        extraArgs.setdefault('Metadata', {})
        extraArgs['Metadata']['lambda'] = os.environ.get('AWS_LAMBDA_FUNCTION_NAME', '')
        extraArgs['Metadata']['original-size'] = str(size)
        # overwrite the object under its original key with the gzipped version
        s3.Object(bucketName, objectKey).upload_file(
            Filename=tmp_out,
            ExtraArgs=extraArgs)
        # remove local file
        os.remove(tmp_out)
    return {
        'statusCode': 200,
        'body': 'It works'
    }
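
# A minimal sketch for exercising the handler locally (not part of the Lambda deployment).
# The bucket, key and eTag below are placeholders: the object must exist, the eTag must match
# for the head_object IfMatch check to pass, AWS credentials must be configured, and the
# script must be run from the directory containing settings.json.
if __name__ == '__main__':
    fake_event = {
        'Records': [{
            's3': {
                'bucket': {'name': 'my-bucket'},
                'object': {'key': 'models/scene.glb', 'eTag': '00000000000000000000000000000000'}
            }
        }]
    }
    print lambda_handler(fake_event, None)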