Created
April 21, 2017 16:56
-
-
Save fedeisas/31fb64f5a82ab52054e479224c31f728 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from glob import glob | |
import piexif | |
import pandas as pd | |
from PIL import Image | |
import datetime, time | |
import numpy as np | |
from multiprocessing import Pool, Process, Value, Lock, current_process | |
import logging | |
# `reload` is a builtin only on Python 2; on Python 3 it lives in importlib.
# Resolve it once so the statement below works on both.
try:
    reload
except NameError:
    from importlib import reload

# Re-initialise the logging module before it is configured further down.
# NOTE(review): presumably this resets logging state left over from an earlier
# run in the same interpreter session - confirm it is still needed.
reload(logging)
class Counter(object):
    """Lock-protected integer counter backed by a ``multiprocessing.Value``.

    The count is stored in ``self.val`` (typecode ``'i'``) and every read or
    update is performed while holding ``self.lock``.
    """

    def __init__(self, initval=0):
        """Create the counter with a starting count of *initval*."""
        self.lock = Lock()
        self.val = Value('i', initval)

    def increment(self):
        """Add one to the count while holding the lock."""
        self.lock.acquire()
        try:
            self.val.value = self.val.value + 1
        finally:
            self.lock.release()

    def value(self):
        """Return the current count, read while holding the lock."""
        self.lock.acquire()
        try:
            return self.val.value
        finally:
            self.lock.release()
# Prefix every log line with a timestamp and show everything from DEBUG up.
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG)

# Walk the current directory tree and collect the path of every *.jpg file.
result = []
for walk_entry in os.walk('.'):
    result.extend(glob(os.path.join(walk_entry[0], '*.jpg')))

#################
# Progress state read by get_image_data: total file count and shared counter.
length = len(result)
counter = Counter()

# One row per image file; the metric columns are added later by more_columns.
data = pd.DataFrame({'filename': result})
logging.info('Done with loading dataset...')
def get_brigtness(im):
    """Compute four average brightness measures for a 3-channel image.

    *im* is anything ``np.array`` can turn into a ``(rows, cols, 3)`` array,
    e.g. an image object or nested lists of channel triples.

    Returns a tuple ``(brightness, luminance_a, luminance_b, luminance_c)``
    where, averaged over all pixels:

    * ``brightness``  - plain mean of the three channels,
    * ``luminance_a`` - weighted sum with weights 0.2126 / 0.7152 / 0.0722
      (these appear to be the Rec. 709 luma coefficients - confirm),
    * ``luminance_b`` - weighted sum with weights 0.299 / 0.587 / 0.114
      (these appear to be the Rec. 601 luma coefficients - confirm),
    * ``luminance_c`` - Euclidean norm of the channel triple.

    ``(None, None, None, None)`` is returned when the pixel data cannot be
    read (``IOError``), when the data is not a non-empty ``(rows, cols, 3)``
    array (greyscale, RGBA, 1-D input, ...).
    """
    unavailable = (None, None, None, None)

    try:
        # np.int was removed from NumPy; float64 also guarantees that the
        # division below is a true division on Python 2 as well as Python 3.
        pixels = np.array(im, dtype=np.float64)
    except IOError:
        return unavailable

    # Only non-empty three-channel images can be processed. Checking the shape
    # up front also covers 0-D / 1-D input, which has no shape[1] to reshape on.
    if pixels.ndim != 3 or pixels.shape[2] != 3 or pixels.size == 0:
        return unavailable

    # Flatten to one row per pixel: columns are the three channels.
    pixels = pixels.reshape(-1, 3)
    first, second, third = pixels[:, 0], pixels[:, 1], pixels[:, 2]

    brightness = np.average(pixels.sum(axis=1) / 3.0)
    luminance_a = np.average(0.2126 * first + 0.7152 * second + 0.0722 * third)
    luminance_b = np.average(0.299 * first + 0.587 * second + 0.114 * third)
    luminance_c = np.average(
        np.sqrt(np.power(first, 2) + np.power(second, 2) + np.power(third, 2))
    )
    return (brightness, luminance_a, luminance_b, luminance_c)
def get_image_data(f):
    """Open the image at path *f* and collect its basic metrics.

    Returns the tuple ``(dpi, width, height, brightness, luminance_a,
    luminance_b, luminance_c)``:

    * ``dpi`` is the first element of the image's ``'dpi'`` info entry, or
      ``None`` when the image has no such entry,
    * ``width`` / ``height`` come from the image size,
    * the remaining four values are whatever ``get_brigtness`` returns.

    Any exception while opening or analysing the file is logged and the
    values not yet determined stay ``None``; the function never raises for a
    bad file.

    Side effects: increments the module-level ``counter`` once per call
    (successful or not, so the progress figure reflects files processed) and
    logs a progress line whenever the count before incrementing is a multiple
    of 1000.
    """
    dpi = width = height = None
    brightness = luminance_a = luminance_b = luminance_c = None

    try:
        # The context manager releases the file handle even when the pixel
        # analysis fails part-way through.
        with Image.open(f) as im:
            if 'dpi' in im.info:
                dpi = im.info['dpi'][0]
            width, height = im.size
            brightness, luminance_a, luminance_b, luminance_c = get_brigtness(im)
    except Exception as e:
        logging.error("Can't process {0} | {1}".format(f, current_process()))
        logging.error(e)

    # Read the shared counter once so the three numbers in the message agree.
    processed = counter.value()
    if processed % 1000 == 0:
        logging.info(
            "Progress {0}% ({1} / {2}) | {3}".format(
                float(processed) / float(length) * 100,
                processed,
                length,
                current_process()
            )
        )
    counter.increment()

    return (
        dpi,
        width,
        height,
        brightness,
        luminance_a,
        luminance_b,
        luminance_c
    )
def parallelize_dataframe(df, func):
    """Apply *func* to chunks of *df* in a process pool and concatenate.

    *df* is split into ``num_partitions`` pieces with ``np.array_split``;
    each piece is passed to *func* in a ``Pool`` of ``num_cores`` workers and
    the returned frames are joined with ``pd.concat``. ``num_partitions`` and
    ``num_cores`` are module-level globals that must be set before calling.

    If mapping fails, the pool is terminated before the exception propagates
    so that no worker processes are left behind.
    """
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    try:
        processed_parts = pool.map(func, df_split)
    except BaseException:
        # Stop the workers immediately; the error is re-raised unchanged.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # Wait for the workers to exit after either close() or terminate().
        pool.join()
    return pd.concat(processed_parts)
def more_columns(data):
    """Add the per-image metric columns to *data* and return it.

    ``get_image_data`` is applied to every value of ``data.filename`` and the
    resulting 7-tuples are transposed into the columns ``dpi``, ``width``,
    ``height``, ``brightness``, ``luminance_a``, ``luminance_b`` and
    ``luminance_c``. *data* is modified in place and also returned.

    A frame without rows receives the same seven columns (empty) instead of
    failing, so its result can still be concatenated with the others.
    """
    column_names = (
        'dpi',
        'width',
        'height',
        'brightness',
        'luminance_a',
        'luminance_b',
        'luminance_c',
    )
    worker = current_process()
    logging.info("Batch started | {0}".format(worker))

    rows = list(data.filename.map(get_image_data))
    if rows:
        # Transpose: one sequence per output column.
        columns = zip(*rows)
    else:
        # zip(*[]) yields nothing, so supply one empty sequence per column.
        columns = [()] * len(column_names)
    for name, values in zip(column_names, columns):
        data[name] = list(values)

    logging.info("Batch completed | {0}".format(worker))
    return data
# Aim for roughly 1000 files per partition. Floor division keeps the count an
# integer on Python 3, and the lower bound of 1 avoids asking np.array_split
# for zero sections when there are fewer than 1000 files.
num_partitions = max(1, len(data) // 1000)
# Number of worker processes used by parallelize_dataframe.
num_cores = 16
data_full = parallelize_dataframe(data, more_columns)
# NOTE: 'created' is filled from os.path.getmtime, i.e. the modification time.
data_full['created'] = data_full.filename.map(lambda f: datetime.datetime.fromtimestamp(os.path.getmtime(f)))
# File size in bytes as reported by os.stat.
data_full['size'] = data_full.filename.map(lambda f: os.stat(f).st_size)
data_full.to_csv('output.csv', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment