ahmedhosny · December 18, 2017 22:43
diff --git a/nrrd_to_hdf5.py b/nrrd_to_hdf5.py
 # assumes CT image nrrds with minimum value of -1024
 # assumes CT mask nrrds with 0's and 1's
 # assumes the shapes of the patients' data (image and mask) differ from one another - therefore,
 # this will pad all images and masks to the size of the largest
 # does not perform any interpolation to isotropic voxels or any normalization
 # only saves the image and mask, therefore the metadata and pixel spacing is lost

 import nrrd # pip install pynrrd # probably better performance with sitk
 import numpy as np
 import glob
 import h5py

 # input
 # both folders should contain the same number of files, listed in the same
 # order, because images and masks are paired up by position further down
 # folder with image nrrd files
 image_nrrd_folder = '/path/..'
 # folder with mask nrrd files
 mask_nrrd_folder = '/path/..'
 # output: path of the hdf5 file that gets written (and read back at the end)
 output = 'output.hdf5'
 # dataset name
 # NOTE(review): not referenced anywhere else in the visible script - confirm
 # whether it is still needed
 dataset = "someName"

 # substitute value if larger
 def addLargest(value,variable):
     """Return the larger of ``value`` and the running maximum ``variable``.

     ``variable`` is kept unless ``value`` is strictly greater than it.
     """
     return value if value > variable else variable

 # globs
 # glob.glob() returns paths in arbitrary (filesystem-dependent) order, so both
 # listings are sorted to make the positional image/mask pairing deterministic.
 # This still assumes matching image and mask file names sort the same way.
 images = sorted(glob.glob(image_nrrd_folder + '/*'))
 masks = sorted(glob.glob(mask_nrrd_folder + '/*'))
 # sanity - explicit raises instead of assert so the checks survive `python -O`
 if not images:
     raise ValueError('no image files found in %s' % image_nrrd_folder)
 if len(images) != len(masks):
     raise ValueError('found %d image files but %d mask files'
                      % (len(images), len(masks)))

 #
 # suggest interpolation and normalization to happen here
 #

 # get largest dims: running maximum of every volume's extent along each axis
 largest_dim_0 = 0
 largest_dim_1 = 0
 largest_dim_2 = 0
 # loop through both folders to get the largest shape
 for image,mask in zip(images,masks):
     # nrrd.read returns (data, header); only the array shape is needed here
     image_shape = nrrd.read(image)[0].shape
     mask_shape = nrrd.read(mask)[0].shape
     # sanity - an image and its mask must cover the same voxel grid
     # (explicit raise instead of assert so the check survives `python -O`
     # and the error names the offending files)
     if image_shape != mask_shape:
         raise ValueError('shape mismatch: %s is %s but %s is %s'
                          % (image, image_shape, mask, mask_shape))
     # add largest
     largest_dim_0 = addLargest(image_shape[0],largest_dim_0)
     largest_dim_1 = addLargest(image_shape[1],largest_dim_1)
     largest_dim_2 = addLargest(image_shape[2],largest_dim_2)
 # single-argument print() gives the same output on Python 2 and Python 3
 print('largest shape:  %s %s %s' % (largest_dim_0, largest_dim_1, largest_dim_2))

 X = []
 Y = []
 # every volume is padded up to this common shape
 padded_shape = (largest_dim_0, largest_dim_1, largest_dim_2)
 # loop through both folders to populate the h5
 for image,mask in zip(images,masks):
     # read image and mask nrrd; nrrd.read returns (data, header)
     image_data = nrrd.read(image)[0]
     mask_data = nrrd.read(mask)[0]
     shape = image_data.shape
     # background-filled targets: -1024 for the image, 0 for the mask
     image_arr = np.full(padded_shape,-1024,dtype=np.float32)
     mask_arr = np.zeros(padded_shape,dtype=np.int16)
     # copy the original voxels into the low-index corner; the rest stays padding
     image_arr[0:shape[0],0:shape[1],0:shape[2]] = image_data
     mask_arr[0:shape[0],0:shape[1],0:shape[2]] = mask_data
     X.append(image_arr)
     Y.append(mask_arr)
     # single-argument print() gives the same output on Python 2 and Python 3
     print('%s  shape is  %s' % (image, shape))

 #
 # write h5
 #
 # the with-block closes the file even if a create_dataset call raises
 with h5py.File(output, "w") as h5:
     # highest level hierarchy: one dataset for the images, one for the masks
     h5.create_dataset('X', dtype=np.float32, data=X)
     h5.create_dataset('Y', dtype=np.int16, data=Y)

 #
 # read h5
 #
 # convert into list of arrays + add a single channel
 obj = {'X':[],'Y':[]}
 # np.array() copies each dataset into memory, so the file can be closed as
 # soon as both have been loaded; the with-block guarantees that
 with h5py.File(output, "r") as h5:
     obj['X']= list( np.expand_dims(np.array(h5['X']), axis=4).astype(np.float32) )
     obj['Y']= list( np.expand_dims(np.array(h5['Y']), axis=4).astype(np.int16) )
 # single-argument print() gives the same output on Python 2 and Python 3
 print('%s %s %s %s' % (len(obj['X']), obj['X'][0].shape, len(obj['Y']), obj['Y'][0].shape))
 # 2 (512, 512, 658, 1) 2 (512, 512, 658, 1)
	# assumes CT image nrrds with minimum value of -1024
	# assumes CT mask nrrds with 0's and 1's
	# assumes the shapes of the patients' data (image and mask) differ from one another - therefore,
	# this will pad all images and masks to the size of the largest
	# does not perform any interpolation to isotropic voxels or any normalization
	# only saves the image and mask, therefore the metadata and pixel spacing is lost

	import nrrd # pip install pynrrd # probably better performance with sitk
	import numpy as np
	import glob
	import h5py

	# input
	# both folders should contain the same number of files, listed in the same
	# order, because images and masks are paired up by position further down
	# folder with image nrrd files
	image_nrrd_folder = '/path/..'
	# folder with mask nrrd files
	mask_nrrd_folder = '/path/..'
	# output: path of the hdf5 file that gets written (and read back at the end)
	output = 'output.hdf5'
	# dataset name
	# NOTE(review): not referenced anywhere else in the visible script - confirm
	# whether it is still needed
	dataset = "someName"

	# substitute value if larger
	def addLargest(value,variable):
	    """Return the larger of ``value`` and the running maximum ``variable``.

	    ``variable`` is kept unless ``value`` is strictly greater than it.
	    """
	    # max() returns its first argument on ties, so `variable` wins when equal
	    return max(variable, value)

	# globs
	# glob.glob() returns paths in arbitrary (filesystem-dependent) order, so both
	# listings are sorted to make the positional image/mask pairing deterministic.
	# This still assumes matching image and mask file names sort the same way.
	images = sorted(glob.glob(image_nrrd_folder + '/*'))
	masks = sorted(glob.glob(mask_nrrd_folder + '/*'))
	# sanity - explicit raises instead of assert so the checks survive `python -O`
	if not images:
	    raise ValueError('no image files found in %s' % image_nrrd_folder)
	if len(images) != len(masks):
	    raise ValueError('found %d image files but %d mask files'
	                     % (len(images), len(masks)))

	#
	# suggest interpolation and normalization to happen here
	#

	# get largest dims: running maximum of every volume's extent along each axis
	largest_dim_0 = 0
	largest_dim_1 = 0
	largest_dim_2 = 0
	# loop through both folders to get the largest shape
	for image,mask in zip(images,masks):
	    # nrrd.read returns (data, header); only the array shape is needed here
	    image_shape = nrrd.read(image)[0].shape
	    mask_shape = nrrd.read(mask)[0].shape
	    # sanity - an image and its mask must cover the same voxel grid
	    # (explicit raise instead of assert so the check survives `python -O`
	    # and the error names the offending files)
	    if image_shape != mask_shape:
	        raise ValueError('shape mismatch: %s is %s but %s is %s'
	                         % (image, image_shape, mask, mask_shape))
	    # add largest
	    largest_dim_0 = addLargest(image_shape[0],largest_dim_0)
	    largest_dim_1 = addLargest(image_shape[1],largest_dim_1)
	    largest_dim_2 = addLargest(image_shape[2],largest_dim_2)
	# single-argument print() gives the same output on Python 2 and Python 3
	print('largest shape:  %s %s %s' % (largest_dim_0, largest_dim_1, largest_dim_2))

	X = []
	Y = []
	# every volume is padded up to this common shape
	padded_shape = (largest_dim_0, largest_dim_1, largest_dim_2)
	# loop through both folders to populate the h5
	for image,mask in zip(images,masks):
	    # read image and mask nrrd; nrrd.read returns (data, header)
	    image_data = nrrd.read(image)[0]
	    mask_data = nrrd.read(mask)[0]
	    shape = image_data.shape
	    # background-filled targets: -1024 for the image, 0 for the mask
	    image_arr = np.full(padded_shape,-1024,dtype=np.float32)
	    mask_arr = np.zeros(padded_shape,dtype=np.int16)
	    # copy the original voxels into the low-index corner; the rest stays padding
	    image_arr[0:shape[0],0:shape[1],0:shape[2]] = image_data
	    mask_arr[0:shape[0],0:shape[1],0:shape[2]] = mask_data
	    X.append(image_arr)
	    Y.append(mask_arr)
	    # single-argument print() gives the same output on Python 2 and Python 3
	    print('%s  shape is  %s' % (image, shape))

	#
	# write h5
	#
	# the with-block closes the file even if a create_dataset call raises
	with h5py.File(output, "w") as h5:
	    # highest level hierarchy: one dataset for the images, one for the masks
	    h5.create_dataset('X', dtype=np.float32, data=X)
	    h5.create_dataset('Y', dtype=np.int16, data=Y)

	#
	# read h5
	#
	# convert into list of arrays + add a single channel
	obj = {'X':[],'Y':[]}
	# np.array() copies each dataset into memory, so the file can be closed as
	# soon as both have been loaded; the with-block guarantees that
	with h5py.File(output, "r") as h5:
	    obj['X']= list( np.expand_dims(np.array(h5['X']), axis=4).astype(np.float32) )
	    obj['Y']= list( np.expand_dims(np.array(h5['Y']), axis=4).astype(np.int16) )
	# single-argument print() gives the same output on Python 2 and Python 3
	print('%s %s %s %s' % (len(obj['X']), obj['X'][0].shape, len(obj['Y']), obj['Y'][0].shape))
	# 2 (512, 512, 658, 1) 2 (512, 512, 658, 1)