Runs dHash on a CSV of image paths using Dask, deletes duplicates (preferring to keep larger images), renames the kept images to their dHash, and creates AWS presigned URLs.
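A minimal data.csv might look like the following (the paths are illustrative; as noted in the script, they should be relative to your S3 bucket so the presigned URLs line up):

    image_path
    relative/path/to/data/cat.jpg
    relative/path/to/data/dog.jpg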
import logging
import os
import shutil
from pathlib import Path

import boto3
import imagehash
import pandas as pd
from botocore.exceptions import ClientError
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, visualize
from PIL import Image

def create_presigned_url(bucket_name, object_name, expiration=3600):
    """Generate a presigned URL to share an S3 object

    :param bucket_name: string
    :param object_name: string
    :param expiration: Time in seconds for the presigned URL to remain valid
    :return: Presigned URL as string. If error, returns None.
    """
    # Generate a presigned URL for the S3 object
    s3_client = boto3.client('s3')
    try:
        response = s3_client.generate_presigned_url('get_object',
                                                    Params={'Bucket': bucket_name,
                                                            'Key': object_name},
                                                    ExpiresIn=expiration)
    except ClientError as e:
        logging.error(e)
        return None

    # The response contains the presigned URL
    return response
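
# Usage sketch (hypothetical bucket and key; assumes boto3 can find your AWS
# credentials, e.g. via environment variables or ~/.aws/credentials):
#   url = create_presigned_url('my-bucket', 'images/cat.jpg', expiration=3600)
#   # url is a time-limited GET URL string, or None if the request failed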

def rename(image_path, name):
    '''Return image_path with its filename replaced by name, keeping the original suffix'''
    path = Path(image_path)
    new_path = path.with_name(name).with_suffix(path.suffix)
    return new_path
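
# Illustrative example: rename('images/cat.jpg', 'ab12cd34')
# returns Path('images/ab12cd34.jpg').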

def image_size(image_path):
    '''Return image width, height, and number of pixels'''
    image = Image.open(image_path)
    return image.width, image.height, image.width * image.height

def calc_image_hash(image_path):
    '''Calculate the dhash of an image and return its hex string representation'''
    image = Image.open(image_path)
    image_dhash = imagehash.dhash(image)
    return str(image_dhash)
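
# imagehash.dhash uses hash_size=8 by default, i.e. a 64-bit difference hash,
# so str() yields a 16-character hex string that is stable across runs and
# safe to use as a filename (see rename() above).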

with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof, ProgressBar():
    dask_df = dd.read_csv('data.csv')  # only needs an image_path column
    dask_df = dask_df.repartition(npartitions=os.cpu_count())
    image_size_df = dask_df.image_path.apply(image_size, meta=('image_size', object))
    dask_df['width'] = image_size_df.apply(lambda i: i[0], meta=('width', int))
    dask_df['height'] = image_size_df.apply(lambda i: i[1], meta=('height', int))
    dask_df['num_pixels'] = image_size_df.apply(lambda i: i[2], meta=('num_pixels', int))
    dask_df['image_dhash'] = dask_df.image_path.apply(calc_image_hash, meta=('image_dhash', str))
    df = dask_df.compute()

visualize([prof, rprof, cprof], file_path='/tmp/profile_dhash.html')

# Sort so the largest image of each dhash group comes first, then mark the rest
df = df.sort_values(by=['image_dhash', 'num_pixels'], ascending=False)
df['duplicated'] = df.duplicated(subset=['image_dhash'], keep='first')
remove_df = df.loc[df['duplicated']]
remove_df.image_path.apply(os.remove)
print('All images removed: {}'.format((~remove_df.image_path.apply(os.path.exists)).all()))
keep_df = df.loc[~df['duplicated']].copy()

# Rename files to their dhash
keep_df['image_dhash_path'] = keep_df.apply(lambda row: rename(row.image_path, row.image_dhash), axis=1).astype(str)
keep_df.apply(lambda row: shutil.move(row.image_path, row.image_dhash_path), axis=1)
print('All images moved: {}'.format(
    (~keep_df.image_path.apply(os.path.exists)
     & keep_df.image_dhash_path.apply(os.path.exists)).all()))
keep_df.image_path = keep_df.image_dhash_path.astype(str)  # astype(str) because rename() returns pathlib.Path
del keep_df['image_dhash_path']
del keep_df['duplicated']
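
# At this point keep_df has one row per unique dhash, with columns including
# image_path (now the dhash-named path), width, height, num_pixels, image_dhash.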

# Create signed urls
bucket_name = 'my-bucket-name-here'
seconds_per_year = 31540000
# Note: S3 may cap presigned URL lifetime well below a year (SigV4 allows at
# most 7 days), so expect the effective expiration to be shorter.
# This assumes your image_paths are relative to your s3 bucket.
keep_df['image_signed_url'] = keep_df.image_path.apply(lambda p: create_presigned_url(bucket_name, p, seconds_per_year))

# Save our work
keep_df.to_csv('data_dhash.csv', index=False)

'''
You will need to copy the files to the path we signed in the urls.
AWS CLI Example:
aws s3 sync relative/path/to/data s3://my-bucket-name-here/relative/path/to/data
'''