Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save arcaduf/860ae7d2fef2a8b4f9e806a65efd514b to your computer and use it in GitHub Desktop.
Save arcaduf/860ae7d2fef2a8b4f9e806a65efd514b to your computer and use it in GitHub Desktop.
Prepare Kaggle Ultrasound Nerve Segmentation dataset for DL
'''
Link image data to ground truth
'''
from __future__ import print_function
import glob , os
import pandas as pd
import numpy as np
# User input: folder holding the training .tif files, output folder, CSV separator
path_in = '<path to>/ultrasound_data/train/'
path_out = '<output path>'
SEP = ','

# Collect all images and masks in given path
# Every .tif in the folder, in sorted order
list_all = glob.glob( os.path.join( path_in , '*.tif' ) )
list_all.sort()

# Masks are the .tif files whose name contains "mask"
list_masks = glob.glob( os.path.join( path_in , '*mask*.tif' ) )
list_masks.sort()

# Images are whatever remains once the masks are removed
list_imgs = sorted( set( list_all ).difference( list_masks ) )

print( '\nFound ', len( list_imgs ),' images' )
print( 'Found ', len( list_masks ) , ' masks' )
# Collect IDs and image acquisition numbers from all images and masks
# One accumulator per parsed field, kept separately for images and for masks
list_ids_imgs = []
list_nums_imgs = []
list_ids_masks = []
list_nums_masks = []
def get_id( file_name ):
    '''
    Return the integer ID encoded in a file name.

    The ID is the part of the base name that precedes the first
    underscore, e.g. '12_3.tif' and '12_3_mask.tif' both give 12.
    Any directory components of file_name are ignored.

    Raises ValueError if that leading part is not an integer.
    '''
    # Built-in int replaces np.int, which was a plain alias for it and
    # has been removed from NumPy (AttributeError on current releases)
    return int( os.path.basename( file_name ).split( '_' )[0] )
def get_num( file_name ):
    '''
    Return the integer acquisition number encoded in a file name.

    The number is the second underscore-separated field of the base
    name, with any extension stripped, so '12_3.tif' and
    '12_3_mask.tif' both give 3.

    Raises IndexError if the base name contains no underscore and
    ValueError if the field is not an integer.
    '''
    # Work on the base name only: testing the full path for 'mask' would
    # misclassify images stored under a directory whose name contains it
    field = os.path.basename( file_name ).split( '_' )[1]

    # For images the field still carries the extension ('3.tif'); for masks
    # it is already bare ('3'), so stripping after the first dot covers both
    # Built-in int replaces np.int, which has been removed from NumPy
    return int( field.split( '.' )[0] )
# Parse images and masks in two independent passes: the two lists are not
# guaranteed to have the same length, so indexing the masks with the image
# counter could raise IndexError or leave masks unparsed
for img_file in list_imgs:
    list_ids_imgs.append( get_id( img_file ) )
    list_nums_imgs.append( get_num( img_file ) )

for mask_file in list_masks:
    list_ids_masks.append( get_id( mask_file ) )
    list_nums_masks.append( get_num( mask_file ) )
# Create a data frame for images and one for masks, then merge
# Each frame has one row per file: its id, its path and its acquisition number
df_imgs = pd.DataFrame( dict( id=list_ids_imgs ,
                              image=list_imgs ,
                              number=list_nums_imgs ) )
df_masks = pd.DataFrame( dict( id=list_ids_masks ,
                               mask=list_masks ,
                               number=list_nums_masks ) )

# Pair every image with the mask sharing the same ( id , number ) key
df = df_imgs.merge( df_masks , on=[ 'id' , 'number' ] )
print( '\nMerge data frame shape: ', df.shape )
# Save merged data frame
# The index CSV is written inside path_out, without the row-index column
file_out = os.path.join( path_out , 'master_index_imgs_and_masks.csv' )
df.to_csv( path_or_buf=file_out , index=False , sep=SEP )
print( '\n\n' )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment