Last active
January 1, 2022 22:07
-
-
Save ashishrana160796/3462546c55e6e2dd71d9dd65c571bcb2 to your computer and use it in GitHub Desktop.
Load images from the BBBC dataset (here, https://data.broadinstitute.org/bbbc/BBBC005/), resize them to a third of their size with the PIL Python library, and convert them to JPEG. After that, split them into test and train directories; adjust the conditions for the test/train sets depending on your requirements.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Best way to run: copy and paste portions into an ipython3 shell
# 1. Import statements
import operator
from os import listdir
from os.path import isfile, join

# 2. Select files with the matching "_w1" TIFF suffix.
import re

# Raw string with the dot escaped and the end anchored: the original
# ".+_w1.TIF" let "." match any character, so e.g. "x_w1xTIF" or
# "x_w1.TIFF" would also have matched.
pattern = re.compile(r".+_w1\.TIF$")

# 3. Currently the ground-truth images are loaded. To load the complete
#    dataset, uncomment the alternative "mypath" assignment below.
mypath = "BBBC005_v1_ground_truth/BBBC005_v1_ground_truth/"
# mypath = "BBBC005_v1_images/"
# for the 19,200-image dataset

# Keep only regular files whose names match the pattern.
file_names = [f for f in listdir(mypath)
              if isfile(join(mypath, f)) and pattern.match(f)]
# Sanity check: number of files selected.
len(file_names)

# 4. Extract the count embedded in each image name (the "_C<n>_" token)
#    and store the counts in a list parallel to file_names.
file_count = [int(re.search('_C(.+?)_', f).group(1)) for f in file_names]

# 5. Final dictionary data structure: file name -> count.
file_dict = dict(zip(file_names, file_count))

# 6. Get the (file, count) pair with the maximum count among the images.
max(file_dict.items(), key=operator.itemgetter(1))
# Important
# 7. Convert to JPEG and shrink each loaded image to a third of its size.
import os
from PIL import Image

# Make sure "data_set/" does not exist beforehand, otherwise os.mkdir
# raises FileExistsError.
data_dest = "data_set/"
os.mkdir(data_dest)

# Loop over the selected TIFF files, converting each to a down-scaled JPEG.
for file_name in file_names:
    full_file_name = os.path.join(mypath, file_name)
    print(full_file_name)
    outfile = os.path.splitext(file_name)[0] + ".jpg"
    im = Image.open(full_file_name)
    print("Generating jpeg for %s" % file_name)
    # Fix: Image.ANTIALIAS was deprecated and removed in Pillow 10;
    # Image.LANCZOS is the same filter. thumbnail() expects integer
    # dimensions, hence floor division instead of "/".
    im.thumbnail((im.size[0] // 3, im.size[1] // 3), Image.LANCZOS)
    im.save(data_dest + outfile, "JPEG", quality=72)
# 8. Split into a shuffled 75/25 train/test ratio and create the train and
#    test dataset directories respectively.
# Shuffle the file list and consume it in that random order.
import random
import re

# Raw string with the dot escaped and the end anchored; the original
# ".+.jpg" let "." match any character, so "photoXjpg" would also match.
new_pattern = re.compile(r".+\.jpg$")

# Similar script portion as above, now listing the generated JPEG images.
new_path = "data_set/"
new_file_names = [f for f in listdir(new_path)
                  if isfile(join(new_path, f)) and new_pattern.match(f)]

# Randomness added so the split is not alphabetical.
random.shuffle(new_file_names)

# Make the directories where the train & test datasets will be created.
# Make sure they do not exist beforehand, otherwise os.mkdir raises
# FileExistsError.
import shutil
train_dest = "train_set/"
os.mkdir(train_dest)
test_dest = "test_set/"
os.mkdir(test_dest)

# Splitting into the two datasets.
# Fix: the original compared against the literal 0.75 instead of the
# split_point variable defined for exactly this purpose; the threshold is
# also hoisted out of the loop, and enumerate replaces the manual counter.
split_point = 0.75
threshold = split_point * len(new_file_names)
for i, file_name in enumerate(new_file_names):
    full_file_name = os.path.join(new_path, file_name)
    if i <= threshold:
        shutil.copy(full_file_name, train_dest)
    else:
        shutil.copy(full_file_name, test_dest)
# ---------------------------------------------------------------------------- | |
# END | |
# ---------------------------------------------------------------------------- | |
# 9. Extra preprocessing step for afterwards analysis, split on the basis of count of nucleis. | |
# Split to test/train-50/50 ratio with lesser count half for training and rest for testing. | |
# Only (1/2) the dataset for training: the lower half of counts of nucleis. | |
# The higher half of the counts for prediction. | |
# import os | |
# import shutil | |
# make sure following directory is not there before, otherwise statement will result in error | |
# train_dest="train_set/" | |
# os.mkdir(train_dest) | |
# test_dest="test_set/" | |
# os.mkdir(test_dest) | |
# for file_name in file_names: | |
# full_file_name = os.path.join(mypath, file_name) | |
# if (os.path.isfile(full_file_name) and file_dict.get(file_name)<=50): | |
# # count being the criterion of separation, not the number of entries
# shutil.copy(full_file_name, train_dest) | |
# else: | |
# shutil.copy(full_file_name, test_dest) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Threads issue addressed with this gist: