Runs dHash on a CSV of image paths using Dask, deletes duplicates (preferring to keep larger images), renames the kept images to their dHash, and creates AWS presigned URLs.
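A minimal data.csv might look like the following (the paths are illustrative; as noted in the script, they should be relative to your S3 bucket so the presigned URLs line up):

    image_path
    relative/path/to/data/cat.jpg
    relative/path/to/data/dog.jpg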
import logging
import os
import shutil
from pathlib import Path

import boto3
import imagehash
import pandas as pd
from botocore.exceptions import ClientError
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, visualize
from PIL import Image

def create_presigned_url(bucket_name, object_name, expiration=3600):
    """Generate a presigned URL to share an S3 object

    :param bucket_name: string
    :param object_name: string
    :param expiration: Time in seconds for the presigned URL to remain valid
    :return: Presigned URL as string. If error, returns None.
    """
    # Generate a presigned URL for the S3 object
    s3_client = boto3.client('s3')
    try:
        response = s3_client.generate_presigned_url('get_object',
                                                    Params={'Bucket': bucket_name,
                                                            'Key': object_name},
                                                    ExpiresIn=expiration)
    except ClientError as e:
        logging.error(e)
        return None

    # The response contains the presigned URL
    return response
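
# Usage sketch (hypothetical bucket and key; assumes boto3 can find your AWS
# credentials, e.g. via environment variables or ~/.aws/credentials):
#   url = create_presigned_url('my-bucket', 'images/cat.jpg', expiration=3600)
#   # url is a time-limited GET URL string, or None if the request failed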

def rename(image_path, name):
    '''Return image_path with its filename replaced by name, keeping the original suffix'''
    path = Path(image_path)
    new_path = path.with_name(name).with_suffix(path.suffix)
    return new_path
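
# Illustrative example: rename('images/cat.jpg', 'ab12cd34')
# returns Path('images/ab12cd34.jpg').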

def image_size(image_path):
    '''Return image width, height, and number of pixels'''
    image = Image.open(image_path)
    return image.width, image.height, image.width * image.height

def calc_image_hash(image_path):
    '''Calculate the dhash of an image and return its hex string representation'''
    image = Image.open(image_path)
    image_dhash = imagehash.dhash(image)
    return str(image_dhash)
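
# imagehash.dhash uses hash_size=8 by default, i.e. a 64-bit difference hash,
# so str() yields a 16-character hex string that is stable across runs and
# safe to use as a filename (see rename() above).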

with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof, ProgressBar():
    dask_df = dd.read_csv('data.csv')  # only needs an image_path column
    dask_df = dask_df.repartition(npartitions=os.cpu_count())
    image_size_df = dask_df.image_path.apply(image_size, meta=('image_size', object))
    dask_df['width'] = image_size_df.apply(lambda i: i[0], meta=('width', int))
    dask_df['height'] = image_size_df.apply(lambda i: i[1], meta=('height', int))
    dask_df['num_pixels'] = image_size_df.apply(lambda i: i[2], meta=('num_pixels', int))
    dask_df['image_dhash'] = dask_df.image_path.apply(calc_image_hash, meta=('image_dhash', str))
    df = dask_df.compute()

visualize([prof, rprof, cprof], file_path='/tmp/profile_dhash.html')

# Sort so the largest image of each dhash group comes first, then mark the rest
df = df.sort_values(by=['image_dhash', 'num_pixels'], ascending=False)
df['duplicated'] = df.duplicated(subset=['image_dhash'], keep='first')
remove_df = df.loc[df['duplicated']]
remove_df.image_path.apply(os.remove)
print('All images removed: {}'.format((~remove_df.image_path.apply(os.path.exists)).all()))
keep_df = df.loc[~df['duplicated']].copy()

# Rename files to their dhash
keep_df['image_dhash_path'] = keep_df.apply(lambda row: rename(row.image_path, row.image_dhash), axis=1).astype(str)
keep_df.apply(lambda row: shutil.move(row.image_path, row.image_dhash_path), axis=1)
print('All images moved: {}'.format(
    (~keep_df.image_path.apply(os.path.exists)
     & keep_df.image_dhash_path.apply(os.path.exists)).all()))
keep_df.image_path = keep_df.image_dhash_path.astype(str)  # astype(str) because rename() returns pathlib.Path
del keep_df['image_dhash_path']
del keep_df['duplicated']
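
# At this point keep_df has one row per unique dhash, with columns including
# image_path (now the dhash-named path), width, height, num_pixels, image_dhash.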

# Create signed urls
bucket_name = 'my-bucket-name-here'
seconds_per_year = 31540000
# Note: S3 may cap presigned URL lifetime well below a year (SigV4 allows at
# most 7 days), so expect the effective expiration to be shorter.
# This assumes your image_paths are relative to your s3 bucket.
keep_df['image_signed_url'] = keep_df.image_path.apply(lambda p: create_presigned_url(bucket_name, p, seconds_per_year))

# Save our work
keep_df.to_csv('data_dhash.csv', index=False)

'''
You will need to copy the files to the path we signed in the urls.
AWS CLI Example:
aws s3 sync relative/path/to/data s3://my-bucket-name-here/relative/path/to/data
'''