Skip to content

Instantly share code, notes, and snippets.

@yongjun823
Last active June 30, 2018 15:22
Show Gist options
  • Select an option

  • Save yongjun823/cc4b0a518e6048d7603376abfa96a3b6 to your computer and use it in GitHub Desktop.

Select an option

Save yongjun823/cc4b0a518e6048d7603376abfa96a3b6 to your computer and use it in GitHub Desktop.
Read the open image CSV, download the images using a ThreadPool, and remove blank images.
import csv
import requests
import shutil
import os
import argparse
import numpy as np
import time
from tqdm import tqdm
from PIL import Image, ImageOps
from concurrent.futures import ThreadPoolExecutor
# Reference "blank" placeholder images that blank_check compares downloads
# against. Both paths are relative, so they resolve against the current
# working directory when this module is loaded.
blank_data = Image.open('blank.jpg')
blank2_data = Image.open('blank2.jpg')
# Pixel arrays of the two reference images (used for the actual comparison;
# the Image objects above are kept for their .size).
blank_arr = np.array(blank_data)
blank2_arr = np.array(blank2_data)
# Directory the downloaded images are written to.
original_path = '/root/original/'
# Directory the 28x28 copies are written to.
resize_path = '/root/resize/'
# CSV file listing the images to download.
csv_path = 'main.csv'
# Progress bar read by the download worker; assigned in the __main__ section.
pbar = None
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices of at most ``n`` items.

    Slices are produced in order; the last one holds the remainder and may be
    shorter than ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[begin:begin + n] for begin in starts)
def blank_check(name, pil_data):
    """Report whether an image is one of the known blank placeholders.

    The image is converted to mode ``'L'`` and compared element-for-element
    with the reference array whose image has the same dimensions
    (``blank_arr`` or ``blank2_arr``). On a match the file
    ``original_path + name`` is deleted.

    Args:
        name: File name of the image inside ``original_path``.
        pil_data: The opened PIL image to test.

    Returns:
        True if the image equals a reference placeholder (its file has then
        been removed), otherwise False.

    Raises:
        OSError: If the matching file cannot be removed.
    """
    gray = pil_data.convert('L')

    # Only a reference with identical dimensions can possibly match.
    if gray.size == blank_data.size:
        reference = blank_arr
    elif gray.size == blank2_data.size:
        reference = blank2_arr
    else:
        return False

    # array_equal states the intent directly. The previous
    # "sum of differences == 0" test depended on unsigned wraparound of the
    # pixel dtype and raised when the array shapes were not broadcastable.
    # NOTE(review): assumes the reference JPEGs are single-channel; a
    # multi-channel reference simply never matches here -- confirm.
    if not np.array_equal(np.array(gray), reference):
        return False

    os.remove('{}{}'.format(original_path, name))
    return True
def resize_image(image_name):
    """Shrink a downloaded image in place and write its small copy.

    Blank placeholder images (as judged by ``blank_check``, which also deletes
    them) are skipped. Otherwise the file in ``original_path`` is overwritten
    with a 256x256 fitted version and a 28x28 fitted version is written under
    the same name in ``resize_path``.

    Args:
        image_name: File name of the image inside ``original_path``.
    """
    source_path = original_path + image_name

    # The context manager releases the file handle; the original left every
    # opened image unclosed.
    with Image.open(source_path) as img:
        if blank_check(image_name, img):
            return
        # Image.LANCZOS is the same filter as the Image.ANTIALIAS alias, which
        # newer Pillow releases no longer provide.
        large = ImageOps.fit(img, (256, 256), Image.LANCZOS)
        small = ImageOps.fit(img, (28, 28), Image.LANCZOS)

    # Both results are independent images, so the source can be overwritten
    # now that it is closed.
    large.save(source_path)
    small.save(resize_path + image_name)
def read_csv_data(csv_path):
    """Load every data row of a CSV file that has a header row.

    Args:
        csv_path: Path of the CSV file; its first row supplies the keys.

    Returns:
        A list with one mapping (column name -> string value) per data row,
        in file order.
    """
    # newline='' hands raw line endings to the csv module, as its
    # documentation requires; without it, newlines embedded in quoted fields
    # are altered by the text layer.
    with open(csv_path, 'r', newline='') as csv_file:
        return list(csv.DictReader(csv_file))
def squareNumber(item, timeout=30):
    """Download one image, post-process it, and advance the progress bar.

    Despite its name, this is the per-item download worker that
    ``calculateParallel`` maps over the CSV rows.

    Args:
        item: Mapping with the keys ``'original_url'`` (where to fetch the
            image) and ``'image_id'`` (used to build ``<image_id>.jpg``).
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block a worker thread indefinitely.

    Any exception raised by the download or by ``resize_image`` propagates to
    the caller; the progress bar is advanced either way.
    """
    try:
        # Using the response as a context manager releases the streamed
        # connection even when copying fails part-way.
        with requests.get(item['original_url'], stream=True,
                          timeout=timeout) as r:
            if r.status_code != 200:
                return
            file_name = '{}.jpg'.format(item['image_id'])
            with open(original_path + file_name, 'wb') as f:
                # Have urllib3 undo gzip/deflate so raw image bytes are saved.
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
        # The file is closed (fully flushed) before it is re-opened for resizing.
        resize_image(file_name)
    finally:
        # Count the item whether it succeeded, was skipped, or failed, so the
        # bar always reaches its total.
        pbar.update(1)
def calculateParallel(arr, threads=2):
    """Run ``squareNumber`` on every item of ``arr`` using a thread pool.

    Blocks until all items have been processed. A failure on one item does not
    stop the others; it is reported through ``tqdm.write``.

    Args:
        arr: Iterable of items to hand to ``squareNumber``.
        threads: Maximum number of worker threads.
    """
    from concurrent.futures import as_completed

    with ThreadPoolExecutor(max_workers=threads) as executor:
        pending = {executor.submit(squareNumber, item): item for item in arr}
        # executor.map() never had its results consumed, so worker exceptions
        # vanished silently; inspect each future and surface the failures.
        for future in as_completed(pending):
            error = future.exception()
            if error is not None:
                tqdm.write('download failed for {!r}: {!r}'.format(
                    pending[future], error))
def pre_image_check():
    """Delete files in ``original_path`` that have no counterpart in ``resize_path``.

    A file present in ``original_path`` but missing from ``resize_path`` did
    not get its small copy written, so it is removed; the ``__main__`` section
    then treats it as not yet downloaded.
    """
    # A set makes each membership test O(1) instead of scanning a list.
    finished = set(os.listdir(resize_path))

    for name in os.listdir(original_path):
        if name not in finished:
            print(name + 'delete')
            os.remove('{}{}'.format(original_path, name))
            # NOTE(review): the original nesting of this print was lost with
            # the file's indentation; kept alongside the delete -- confirm.
            print(name)
if __name__ == "__main__":
    # Drop half-finished downloads first so they are fetched again below.
    pre_image_check()

    # Set lookup keeps the "already downloaded?" filter linear in the number
    # of CSV rows instead of rows x files.
    original_files = set(os.listdir(original_path))
    unique_arr = [
        item for item in read_csv_data(csv_path)
        if '{}.jpg'.format(item['image_id']) not in original_files
    ]
    print(len(unique_arr))

    # Work through the backlog 1000 items at a time, one progress bar each.
    for tt in chunks(unique_arr, 1000):
        # Binding the module-level ``pbar`` here is what the download worker
        # reads; the ``with`` closes each bar once its batch is done.
        with tqdm(total=len(tt)) as pbar:
            calculateParallel(tt, 200)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment