Created
November 24, 2022 13:08
-
-
Save prerakmody/9237b618c804ca9b99c1fd21e30de496 to your computer and use it in GitHub Desktop.
Histopathology Image Reading
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
CAMELYON 16 DATASET | |
- Whole Slide Images (WSI) containing histopathological information on breast cancer | |
1. Download | |
- To view the list of AWS | |
- Link: https://aws.amazon.com/marketplace/pp/prodview-exkvqrznup6vc?sr=0-1&ref_=beagle&applicationId=AWSMPContessa#resources | |
- Click on Resources on AWS --> View Resources | |
- Single Sample | |
- aws s3 cp --no-sign-request s3://camelyon-dataset/CAMELYON16/images/tumor_032.tif ./raw/tumor_032.tif | |
- aws s3 cp --no-sign-request s3://camelyon-dataset/CAMELYON16/masks/tumor_032_mask.tif ./raw/tumor_032_mask.tif | |
- aws s3 cp --no-sign-request s3://camelyon-dataset/CAMELYON16/annotations/tumor_032.xml ./raw/tumor_032.xml | |
- Full dataset | |
- aws s3 cp --recursive --no-sign-request s3://camelyon-dataset/CAMELYON16/images/ ./raw/ # 700GB | |
- aws s3 cp --recursive --no-sign-request s3://camelyon-dataset/CAMELYON16/masks/ ./raw/ # 8.76GB | |
2. To view | |
- Download ASAP: https://github.com/computationalpathologygroup/ASAP/releases | |
- Ensure that it corresponds with your python version if you want to do programmatic access | |
- Make sure that the ASAP bin path (e.g. C:\Program Files\ASAP 2.1\bin) is either in your sys.path | |
""" | |
# Import ASAP lib first! | |
import sys | |
sys.path.append('C:\\Program Files\\ASAP 2.1\\bin') | |
import multiresolutionimageinterface as mir | |
reader = mir.MultiResolutionImageReader() | |
# Import public libs | |
import pdb | |
import tqdm | |
import time | |
import json | |
import shutil | |
import traceback | |
import numpy as np | |
from pathlib import Path | |
import matplotlib.pyplot as plt | |
# Init keys - paths (everything lives next to this script)
DATASET_NAME = 'CAMELYON16'
DIRNAME_RAW = 'raw'
DIRNAME_TMP = '_tmp'
DIR_RAW = Path(__file__).parent.absolute() / DIRNAME_RAW
DIR_TMP = Path(__file__).parent.absolute() / DIRNAME_TMP
DIR_TMP.mkdir(parents=True, exist_ok=True)  # scratch dir is created at import time

# Init keys - filenames
EXT_TIF = '.tif'
POSTFIX_MASK = 'mask.tif'
POSTFIX_INFO_JSON = 'info.json'
FILENAME_IMAGES = '{}_{:03d}.tif'       # (patient_type, patient_id)
FILENAME_MASKS = '{}_{:03d}_mask.tif'   # (patient_type, patient_id)
# (patient_type, patient_id, level, patch_width, patch_height, patch_pad, min_tissue_perc)
DESCRIPTOR_PATCH = '{}_{:03d}__Lvl{}__P{}x{}-pad{}__perc{:.2f}'
FILENAME_INFO = ''.join((DESCRIPTOR_PATCH, '__info.json'))
FILETYPE_TRAIN_NORMAL = 'normal'
FILETYPE_TRAIN_TUMOR = 'tumor'
FILETYPE_TEST = 'test'

# Init keys - classes (pixel values expected in the mask files)
CLASS_BACKGROUND = 0
CLASS_NORMAL = 1
CLASS_TUMOR = 2

# Init keys - miscellaneous (.json fields)
KEY_POINTS = 'points'
KEY_POINTS_TOTAL = 'points_total'
KEY_POINTS_TISSUE_TOTAL = 'points_tissue_total'
KEY_POINTS_TUMOR_TOTAL = 'points_tumor_total'
KEY_POINTS_TISSUE = 'points_tissue'
KEY_POINTS_TUMOR = 'points_tumor'
KEY_TOTAL_LEVELS = 'total_levels'
KEY_MAX_IMG_W = 'max_img_w'
KEY_MAX_IMG_H = 'max_img_h'

# Init keys - miscellaneous (values of KEY_MODE)
KEY_TRAIN = 'train'
KEY_EVAL = 'eval'

# Init keys - miscellaneous (entries of the params dict)
KEY_PATIENT_ID = 'patient_id'
KEY_PATIENT_TYPE = 'patient_type'
KEY_PATIENT_LEVEL = 'patient_level'
KEY_PATIENT_MINTISSUE_PERC = 'patient_tissue_perc'
KEY_PATCH_WIDTH = 'patch_width'
KEY_PATCH_HEIGHT = 'patch_height'
KEY_PATCH_PAD = 'patch_pad'
KEY_MIN_TISSUE_PERC = 'min_tissue_perc'
KEY_SAVE_IMGS = 'save_imgs'
KEY_MODE = 'mode'
KEY_PATCHES_TRAIN = 'key_patches_train'
KEY_TUMOR_PERC_TRAIN = 'tumor_perc_train'
KEY_DATASET_TYPE = 'dataset_type'

# Init keys - miscellaneous (values of KEY_DATASET_TYPE)
KEY_DATASET_TRAIN = 'dataset_train'
KEY_DATASET_TEST = 'dataset_test'
######### coordinate convention
# Patch points are stored as (w, h) pairs; the origin (0,0) is the top-left corner.
# | (0,0)
# | ----------------------> (w)
# |
# |
# |
# V (h)
def _sort_patch_points(points):
    """
    Return `points` (a list of [w, h] pairs) as an (N, 2) integer array with rows ordered by w, then h.

    An empty list gives an array of shape (0, 2), so len(), .shape and .tolist() always work.
    """
    points = np.array(points, dtype=int).reshape(-1, 2)
    if len(points):
        # Reorder whole rows. Sorting along axis=1 would sort *inside* each row and
        # silently swap w with h for every point where w > h.
        points = points[np.lexsort((points[:, 1], points[:, 0]))]
    return points


def _save_patch_preview(path_png, wsi_patch_image, wsi_patch_mask, title):
    """
    Save a side-by-side figure (image patch | mask patch) to `path_png`.
    """
    f, axarr = plt.subplots(1, 2)
    axarr[0].imshow(wsi_patch_image)
    op = axarr[1].imshow(wsi_patch_mask, cmap='magma', alpha=1.0, vmin=CLASS_BACKGROUND, vmax=CLASS_TUMOR)
    plt.colorbar(op, ax=axarr.ravel().tolist())
    plt.suptitle(title)
    plt.savefig(str(path_png))
    plt.close()


def parse_patient(params):
    """
    Grid one whole-slide image (WSI) at a given level and save a .json file with the (w, h) patch points
    whose mask patch has at least KEY_MIN_TISSUE_PERC non-background pixels.

    Params (dict keys)
     - KEY_PATIENT_ID, KEY_PATIENT_TYPE : identify <type>_<id>.tif and <type>_<id>_mask.tif inside DIR_RAW
     - KEY_PATIENT_LEVEL                : WSI level that is gridded
     - KEY_PATCH_WIDTH, KEY_PATCH_HEIGHT: patch size (pixels at that level)
     - KEY_PATCH_PAD                    : overlap; the grid step is (patch size - pad)
     - KEY_MIN_TISSUE_PERC              : min fraction of non-zero mask pixels for a patch to be kept
     - KEY_SAVE_IMGS                    : if truthy, save preview .png files under DIR_TMP and print a summary

    Output
     - Writes DIR_RAW/<FILENAME_INFO> (see Step 5). Returns None.
     - Errors are printed (with traceback) and not re-raised.
    """
    t0 = time.time()

    # Read the identifiers outside the try-block so that the error handler can always report them
    patient_id = params.get(KEY_PATIENT_ID)
    patient_type = params.get(KEY_PATIENT_TYPE)

    try:

        # Step 0 - Params
        patient_level = params[KEY_PATIENT_LEVEL]
        patch_width = params[KEY_PATCH_WIDTH]
        patch_height = params[KEY_PATCH_HEIGHT]
        patch_pad = params[KEY_PATCH_PAD]
        min_tissue_perc = params[KEY_MIN_TISSUE_PERC]
        save_imgs = params[KEY_SAVE_IMGS]
        total_patch_pixels = patch_width * patch_height
        if patch_pad >= patch_width or patch_pad >= patch_height:
            raise ValueError('patch_pad ({}) must be smaller than the patch size ({}x{})'.format(patch_pad, patch_width, patch_height))

        # Step 1 - Read Data
        filename_img = FILENAME_IMAGES.format(patient_type, patient_id)
        path_img = Path(DIR_RAW).joinpath(filename_img)
        path_mask = Path(DIR_RAW).joinpath(FILENAME_MASKS.format(patient_type, patient_id))
        if not (path_img.exists() and path_mask.exists()):
            print (' - \n [ERROR][parse_patient()] Path issues: patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
            print (' -- path_img : ', path_img)
            print (' -- path_mask: ', path_mask)
            print ('')
            return

        wsi_img = reader.open(str(path_img))
        wsi_mask = reader.open(str(path_mask))
        ds_factor = wsi_mask.getLevelDownsample(patient_level)

        # Step 2 - Grid the WSI and save the (wmin, hmin) coords; grid step ~ (patch size - pad)
        img_max_w, img_max_h = wsi_img.getLevelDimensions(patient_level)
        grid_w = np.linspace(0, img_max_w, int(img_max_w // (patch_width - patch_pad)) + 1)
        grid_h = np.linspace(0, img_max_h, int(img_max_h // (patch_height - patch_pad)) + 1)
        points_w, points_h = np.meshgrid(grid_w, grid_h)
        points_w, points_h = points_w.astype(int).flatten(), points_h.astype(int).flatten()
        patches_total = len(points_w)

        # Step 3 - Prep for saving
        points_tissue = []
        points_tumor = []
        DIR_TMP_PATIENT = Path(DIR_TMP).joinpath((DESCRIPTOR_PATCH + '__N{}').format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc, patches_total))
        if save_imgs:
            # The preview folder is only needed (and only created) when previews are requested
            DIR_TMP_PATIENT.mkdir(exist_ok=True, parents=True)
        preview_title = 'Level={} \n (patch=({}, {})(pad={}) from img=({},{})) \n Clases=(Bgd={}, Normal={}, Tumor={})'.format(patient_level, patch_width, patch_height, patch_pad, img_max_w, img_max_h, CLASS_BACKGROUND, CLASS_NORMAL, CLASS_TUMOR)

        with tqdm.tqdm(total=patches_total, leave=False, desc=' - [{}] '.format(filename_img)) as pbar_patient:
            for point_w, point_h in zip(points_w, points_h):

                # Step 3.1 - Get mask patch (getUCharPatch() is called with the level-0 coords of the top-left corner)
                patch_x0 = int((point_w - patch_pad//2) * ds_factor)
                patch_y0 = int((point_h - patch_pad//2) * ds_factor)
                wsi_patch_mask = np.array(wsi_mask.getUCharPatch(patch_x0, patch_y0, patch_width, patch_height, patient_level))

                # Step 3.2 - Check if mask patch contains >= min_tissue_perc
                wsi_patch_mask_tumor_bool = CLASS_TUMOR in wsi_patch_mask
                wsi_patch_mask_tissueperc = np.count_nonzero(wsi_patch_mask) / total_patch_pixels
                if wsi_patch_mask_tissueperc >= min_tissue_perc:
                    points_tissue.append([point_w, point_h])

                    if save_imgs:
                        # Always preview tumor patches, but only ~5% of the tumor-free ones
                        tumor_str = ''
                        show_perc = 0.05
                        if wsi_patch_mask_tumor_bool:
                            tumor_str = '__Tumor'
                            show_perc = 1.0
                        if np.random.random() < show_perc:
                            wsi_patch_image = np.array(wsi_img.getUCharPatch(patch_x0, patch_y0, patch_width, patch_height, patient_level))
                            path_png = DIR_TMP_PATIENT.joinpath('{}-{:03d}__{:06d}-{:06d}__{:.3f}{}.png'.format(patient_type, patient_id, int(point_w), int(point_h), wsi_patch_mask_tissueperc, tumor_str))
                            _save_patch_preview(path_png, wsi_patch_image, wsi_patch_mask, preview_title)

                    # NOTE(review): tumor points are kept as a subset of the tissue points - confirm this nesting is intended
                    if wsi_patch_mask_tumor_bool:
                        points_tumor.append([point_w, point_h])

                pbar_patient.update(1)

        # Step 4 - Finalize
        points_tissue = _sort_patch_points(points_tissue)
        points_tumor = _sort_patch_points(points_tumor)

        if save_imgs:
            print (' - Total Patches = ', patches_total)
            print (' - Total patches(tissue) = ', points_tissue.shape)
            print (' - Total patches(tumor) = ', points_tumor.shape)
            print (' - Total time taken : {:.2f}'.format(time.time() - t0) )

            # Rename the preview folder so that its name also carries the tissue/tumor counts
            DIR_TMP_PATIENT2 = Path(DIR_TMP).joinpath((DESCRIPTOR_PATCH + '__N{}-{}-{}').format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc, patches_total, len(points_tissue), len(points_tumor)))
            if DIR_TMP_PATIENT2.exists():
                shutil.rmtree(DIR_TMP_PATIENT2)
            shutil.move(src=str(DIR_TMP_PATIENT), dst=str(DIR_TMP_PATIENT2))

        # Step 5 - Save level-based .json containing (wmin,hmin) on the basis of KEY_MIN_TISSUE_PERC for a WSI (whole-slide-image)
        path_json = Path(DIR_RAW).joinpath(FILENAME_INFO.format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc))
        data_json = {
            KEY_PATIENT_TYPE          : patient_type
            , KEY_PATIENT_ID          : patient_id
            , KEY_PATIENT_LEVEL       : patient_level
            , KEY_TOTAL_LEVELS        : wsi_img.getNumberOfLevels()
            , KEY_MAX_IMG_W           : img_max_w
            , KEY_MAX_IMG_H           : img_max_h
            , KEY_PATCH_WIDTH         : patch_width
            , KEY_PATCH_HEIGHT        : patch_height
            , KEY_PATCH_PAD           : patch_pad
            , KEY_MIN_TISSUE_PERC     : min_tissue_perc
            , KEY_POINTS_TOTAL        : patches_total
            , KEY_POINTS_TISSUE_TOTAL : len(points_tissue)
            , KEY_POINTS_TUMOR_TOTAL  : len(points_tumor)
            , KEY_POINTS_TISSUE       : points_tissue.tolist()
            , KEY_POINTS_TUMOR        : points_tumor.tolist()
        }
        with open(str(path_json), 'w') as fp:
            json.dump(data_json, fp, indent=4)

    except Exception:
        # Best-effort: report and carry on with the next patient (Ctrl-C is deliberately not swallowed)
        print ('\n - [ERROR][parse_patient()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
        traceback.print_exc()
        print ('\n - [ERROR][parse_patient()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
def get_patient_patches(params):
    """
    Return a list of [w, h] patch points for one patient, read from its level-based .json file.

    The .json file is created via parse_patient(params) if it does not exist yet.

    Params (dict keys)
     - KEY_PATIENT_ID, KEY_PATIENT_TYPE, KEY_PATIENT_LEVEL, KEY_MIN_TISSUE_PERC,
       KEY_PATCH_WIDTH, KEY_PATCH_HEIGHT, KEY_PATCH_PAD : select the .json file
     - KEY_MODE
        - KEY_TRAIN: draw KEY_PATCHES_TRAIN random points; if the patient has tumor points, each draw
                     comes from them with probability KEY_TUMOR_PERC_TRAIN, else from the tissue points.
                     Every drawn point is shifted by a random offset in [0, KEY_PATCH_PAD) per axis.
        - KEY_EVAL : return all tissue points
     - KEY_PATCHES_TRAIN, KEY_TUMOR_PERC_TRAIN : only read when KEY_MODE == KEY_TRAIN

    Output
     - list of [w, h]; empty (or partially filled) if the .json is unavailable or an error occurred.
       Errors are printed and not re-raised.
    """
    res = []

    # Read the identifiers outside the try-block so that the error handler can always report them
    patient_id = params.get(KEY_PATIENT_ID)
    patient_type = params.get(KEY_PATIENT_TYPE)

    try:

        # Step 1 - Params - Patient
        patient_level = params[KEY_PATIENT_LEVEL]
        min_tissue_perc = params[KEY_MIN_TISSUE_PERC]
        patch_width = params[KEY_PATCH_WIDTH]
        patch_height = params[KEY_PATCH_HEIGHT]
        patch_pad = params[KEY_PATCH_PAD]
        mode = params[KEY_MODE]  # [KEY_TRAIN, KEY_EVAL]

        # Step 2 - Read .json file (create it first if needed)
        path_json = Path(DIR_RAW).joinpath(FILENAME_INFO.format(patient_type, patient_id, patient_level, patch_width, patch_height, patch_pad, min_tissue_perc))
        if not path_json.exists():
            parse_patient(params)

        if not path_json.exists():
            print (' - \n [ERROR][get_patient_patches()] Path issues: patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
            print (' -- path_json : ', path_json)
            print ('')
            return res

        with open(str(path_json), 'r') as fp:
            json_data = json.load(fp)

        # Step 2.2 - Extract vals from .json file
        points_tissue = json_data[KEY_POINTS_TISSUE]
        points_tumor = json_data[KEY_POINTS_TUMOR]

        # Step 2.3 - Extract random patches if in training mode
        if mode == KEY_TRAIN:
            patches_train = params[KEY_PATCHES_TRAIN]
            patches_perc_tumor_train = params[KEY_TUMOR_PERC_TRAIN]

            if not points_tissue and not points_tumor:
                # Nothing to sample from (np.random.randint(0, 0) would raise)
                return res

            for _ in range(patches_train):
                if len(points_tumor) and (not len(points_tissue) or np.random.random() < patches_perc_tumor_train):
                    pool = points_tumor
                else:
                    pool = points_tissue
                points = np.array(pool[np.random.randint(0, len(pool))])

                # Random shift within the padding; np.random.randint() needs low < high, so skip it for pad <= 0
                if patch_pad > 0:
                    points = points + np.random.randint(0, patch_pad, 2)
                res.append(points.tolist())

        elif mode == KEY_EVAL:
            res = points_tissue

        else:
            print (' - [ERROR][get_patient_patches()] Unknown mode: {} (expected {} or {})'.format(mode, KEY_TRAIN, KEY_EVAL))

    except Exception:
        # Best-effort: report and return whatever was collected so far
        print ('\n - [ERROR][get_patient_patches()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))
        traceback.print_exc()
        print ('\n - [ERROR][get_patient_patches()] patient_type: {} | patient_id: {} \n'.format(patient_type, patient_id))

    return res
def generator(params):
    """
    Yield (image_patch, mask_patch) numpy arrays for every patch point of every selected WSI in DIR_RAW.

    Params (dict keys)
     - KEY_DATASET_TYPE : KEY_DATASET_TRAIN (KEY_TRAIN is accepted too) selects the normal_* / tumor_* slides,
                          anything else selects the test_* slides
     - KEY_PATIENT_LEVEL, KEY_PATCH_WIDTH, KEY_PATCH_HEIGHT, KEY_PATCH_PAD : patch geometry
     - plus everything get_patient_patches() needs (KEY_MODE, KEY_MIN_TISSUE_PERC, ...)

    `params` itself is not modified; each patient gets its own copy with KEY_PATIENT_ID / KEY_PATIENT_TYPE set.

    On an error the traceback is printed and the debugger is started (pdb.set_trace()).
    """
    try:

        # Step 0 - Init
        res = {}

        # Step 1 - Get (wmin,hmin) for patient patches
        # Step 1.1 - Get paths as per KEY_DATASET_TYPE
        dataset_type = params[KEY_DATASET_TYPE]
        if dataset_type in (KEY_DATASET_TRAIN, KEY_TRAIN):
            filetypes = (FILETYPE_TRAIN_NORMAL, FILETYPE_TRAIN_TUMOR)
        else:
            filetypes = (FILETYPE_TEST,)
        # Only <type>_<id>.tif images: skips *_mask.tif, *__info.json and the .xml annotation files
        patient_paths_imgs = sorted(
            each for each in Path(DIR_RAW).glob('*' + EXT_TIF)
            if not each.name.endswith(POSTFIX_MASK) and each.name.startswith(filetypes)
        )

        # Step 1.2 - Loop over the paths and get (wmin,hmin) for patches
        with tqdm.tqdm(total=len(patient_paths_imgs)) as pbar:
            for patient_path_img in patient_paths_imgs:
                pbar.update(1)
                try:
                    patient_type = patient_path_img.name.split('_')[0]
                    patient_id = int(patient_path_img.name.split('_')[1].split(EXT_TIF)[0])
                except (IndexError, ValueError):
                    print (' - [ERROR][generator()] Skipping file with unexpected name: ', patient_path_img)
                    continue

                patient_params = dict(params)  # do not leak per-patient keys into the caller's dict
                patient_params[KEY_PATIENT_ID] = patient_id
                patient_params[KEY_PATIENT_TYPE] = patient_type
                patient_points = get_patient_patches(patient_params)
                res[FILENAME_IMAGES.format(patient_type, patient_id)] = {
                    KEY_POINTS: patient_points
                    , KEY_PATIENT_ID: patient_id
                    , KEY_PATIENT_TYPE: patient_type
                    , KEY_POINTS_TOTAL: len(patient_points)
                }

        # Step 2 - Loop over the patch points
        patch_pad = params[KEY_PATCH_PAD]
        patch_width = params[KEY_PATCH_WIDTH]
        patch_height = params[KEY_PATCH_HEIGHT]
        patient_level = params[KEY_PATIENT_LEVEL]
        SAMPLES_TOTAL = sum(patient_obj[KEY_POINTS_TOTAL] for patient_obj in res.values())

        with tqdm.tqdm(total=SAMPLES_TOTAL) as pbar_generator:
            for patient_obj in res.values():

                # Nothing to read for this patient (e.g. its .json could not be created)
                if not patient_obj[KEY_POINTS_TOTAL]:
                    continue

                patient_id = patient_obj[KEY_PATIENT_ID]
                patient_type = patient_obj[KEY_PATIENT_TYPE]
                path_img = Path(DIR_RAW).joinpath(FILENAME_IMAGES.format(patient_type, patient_id))
                path_mask = Path(DIR_RAW).joinpath(FILENAME_MASKS.format(patient_type, patient_id))
                wsi_img = reader.open(str(path_img)) if path_img.exists() else None
                wsi_mask = reader.open(str(path_mask)) if path_mask.exists() else None
                if wsi_img is None or wsi_mask is None:
                    print (' - [ERROR][generator()] Could not open image/mask for patient_type: {} | patient_id: {}'.format(patient_type, patient_id))
                    pbar_generator.update(patient_obj[KEY_POINTS_TOTAL])
                    continue

                ds_factor = wsi_mask.getLevelDownsample(patient_level)
                for point in patient_obj[KEY_POINTS]:
                    # getUCharPatch() is called with the level-0 coords of the patch's top-left corner
                    patch_x0 = int((point[0] - patch_pad//2) * ds_factor)
                    patch_y0 = int((point[1] - patch_pad//2) * ds_factor)
                    wsi_patch_mask = np.array(wsi_mask.getUCharPatch(patch_x0, patch_y0, patch_width, patch_height, patient_level))
                    wsi_patch_img = np.array(wsi_img.getUCharPatch(patch_x0, patch_y0, patch_width, patch_height, patient_level))

                    pbar_generator.update(1)
                    yield (wsi_patch_img, wsi_patch_mask)

    except Exception:
        # `except Exception` (not a bare except) so that GeneratorExit - raised at the yield when the
        # consumer stops early - is not reported as an error and does not start the debugger
        print ('\n - [ERROR][generator()] ')
        traceback.print_exc()
        pdb.set_trace()  # NOTE(review): interactive debugging hook kept from the original
if __name__ == "__main__":
    # Script entry point: build the params dict, optionally pre-compute the per-patient .json files (Step 1),
    # then iterate once over the generator to get a feel for the reading speed (Step 2).
    # The `if 0:` / `if 1:` lines are manual on/off switches for the individual steps.

    try:

        params = {
            KEY_PATIENT_LEVEL      : 2
            , KEY_MIN_TISSUE_PERC  : 0.1
            , KEY_PATCH_WIDTH      : 512
            , KEY_PATCH_HEIGHT     : 512
            , KEY_PATCH_PAD        : 32
            , KEY_SAVE_IMGS        : False
            , KEY_DATASET_TYPE     : KEY_DATASET_TRAIN # [KEY_DATASET_TRAIN->[KEY_TRAIN, KEY_EVAL], KEY_DATASET_TEST->[KEY_EVAL]]
            , KEY_MODE             : KEY_TRAIN # [KEY_TRAIN, KEY_EVAL] # in train we have 270 WSIs, in test we have 129WSIs
            , KEY_PATCHES_TRAIN    : 1000
            , KEY_TUMOR_PERC_TRAIN : 0.5
        }

        # Step 1 - Extract a level-based .json containing (wmin,hmin) on the basis of KEY_MIN_TISSUE_PERC for a WSI (whole-slide-image)
        if 0:
            # Only <type>_<id>.tif images: skips *_mask.tif, *__info.json and the .xml annotation files,
            # whose names would break the int() parsing of the patient id below
            patient_paths_imgs = [each for each in Path(DIR_RAW).glob('*' + EXT_TIF) if not each.name.endswith(POSTFIX_MASK)]
            with tqdm.tqdm(total=len(patient_paths_imgs)) as pbar:
                for patient_path_img in patient_paths_imgs:
                    params[KEY_PATIENT_ID] = int(Path(patient_path_img).parts[-1].split('_')[1].split(EXT_TIF)[0])
                    params[KEY_PATIENT_TYPE] = Path(patient_path_img).parts[-1].split('_')[0]
                    parse_patient(params)
                    pbar.update(1)

        # Step 2 - Use the level-based .json file and loop over the samples to understand the speed
        if 1:
            for (X, Y) in generator(params):
                # print (X.shape, Y.shape)
                # pdb.set_trace()
                pass

    except Exception:
        print ('\n - [__main__] ')
        traceback.print_exc()
        pdb.set_trace()

    # NOTE(review): assumed to sit at the end of the __main__ block - keeps the session open for inspection after the run
    pdb.set_trace()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment