Last active
June 9, 2021 01:12
-
-
Save wassname/a2cdf0b9b511f8a4769cbbe040a87900 to your computer and use it in GitHub Desktop.
dicoms2df.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| read dicom header and cache | |
| url https://gist.github.com/wassname/a2cdf0b9b511f8a4769cbbe040a87900 | |
| """ | |
| from diskcache import Cache, JSONDisk | |
| import pandas as pd | |
| from tqdm.contrib.concurrent import thread_map, process_map | |
| import logging | |
| from pathlib import Path | |
| from functools import partial | |
| from . dcm2json import header2json | |
# on-disk cache for parsed headers; JSONDisk keeps entries readable and compressed
cache_dir = Path('data/.cache')  # was f'data/.cache' — the f-string had no placeholders
cache = Cache(cache_dir / 'dcm2dict.jsdb', disk=JSONDisk, disk_compress_level=6)
@cache.memoize()
def header2json_cache(f: str, px_summ=False):
    """Read a dicom header as a json-able dict, memoized on (f, px_summ) in the disk cache.

    Args:
        f: path to the dicom file.
        px_summ: also compute pixel-array summary statistics (slower).
    """
    # forward px_summ — it was accepted but dropped, so pixel stats were silently never computed
    return header2json(f, px_summ=px_summ)
def h2j(f, px_summ=False):
    """Best-effort wrapper around header2json_cache: log and return None on failure.

    Returning None (instead of raising) lets a bulk run over thousands of
    dicoms continue past individual corrupt files.
    """
    try:
        return header2json_cache(f, px_summ=px_summ)
    except Exception as e:
        # was `logger.error(...)` but no `logger` was ever defined in this file,
        # so the except path raised NameError and masked the real error
        logging.getLogger(__name__).error(f'error header2json on {f}: {e}')
        return None
def dicoms2df(files: list, px_summ=False, n_workers=None) -> pd.DataFrame:
    """
    Convert a list of dicom file paths to a dataframe of their headers.

    Args:
        files: list of paths (str) to .dcm files.
        px_summ: also compute pixel-array summary statistics (slower).
        n_workers: thread count for thread_map; 0 runs synchronously (easier debugging).

    Returns:
        pd.DataFrame with one row per file plus a 'Filename' column.
        Failed files produce a row of NaNs (h2j returns None for them).

    Usage:
        data_dir = Path('../data/raw/ProstateX/dicom/')
        fs = [str(p) for p in data_dir.glob('**/*.dcm')]
        df_dicom = dicoms2df(fs, n_workers=4)
    """
    # bind px_summ once so both execution paths honour it (it was previously dropped)
    fn = partial(h2j, px_summ=px_summ)
    if n_workers == 0:
        # synchronous path for debugging; tqdm was used here but never imported (NameError)
        from tqdm.auto import tqdm
        headers = [fn(f) for f in tqdm(files)]
    else:
        headers = thread_map(fn, files, max_workers=n_workers)
    df_dicom = pd.DataFrame.from_records(headers)
    df_dicom['Filename'] = files
    return df_dicom
| # and make a version cached to csv.gz | |
| import pickle | |
| import hashlib | |
def md5hash(s: bytes) -> str:
    """Hex md5 digest of *s*.

    Takes bytes, not str — hashlib.md5 rejects str, and the only caller
    passes pickled args (the old `s: str` annotation was wrong).
    """
    return hashlib.md5(s).hexdigest()
def _pd_cache(func):
    """Decorator: cache *func*'s DataFrame result to a csv.gz keyed on a hash of its args.

    The result is always read back from csv, so cached and fresh calls return
    identical dtypes (everything csv round-trips to).
    """
    def wrapper(*args, **kw):
        # the file name embeds a hash of the pickled args and kwargs
        key = pickle.dumps(args, 1) + pickle.dumps(kw, 1)
        hsh = md5hash(key)[:6]  # NOTE: only 6 hex chars — small collision risk across distinct arg sets
        f = cache_dir / f"{func.__name__}_{hsh}.csv.gz"
        if not f.exists():
            # ensure the cache dir exists before writing (to_csv does not create parents)
            cache_dir.mkdir(parents=True, exist_ok=True)
            df = func(*args, **kw)
            logging.debug(f'writing to {f}')
            df.to_csv(f, index=False)
        return pd.read_csv(f)
    return wrapper

# csv.gz-cached version of dicoms2df
dicoms2df_cached = _pd_cache(dicoms2df)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| modified from https://github.com/fastai/fastai/blob/dded785401/fastai/medical/imaging.py#L363 | |
| url https://gist.github.com/wassname/a2cdf0b9b511f8a4769cbbe040a87900 | |
| to fail with logging then continue | |
| and to only read header (much faster) | |
| """ | |
import json
import logging
from pathlib import Path

import numpy as np
import pandas as pd
import pydicom
import scipy.stats
from pydicom.dataset import Dataset as DcmDataset
from pydicom.multival import MultiValue as DcmMultiValue
from pydicom.tag import BaseTag as DcmTag
log = logging.getLogger('dicomheader')
cache_dir = Path('.cache')  # was f'.cache' — the f-string had no placeholders
# debug toggle: uncomment to see per-tag parse failures
# log.setLevel(logging.DEBUG)
def _split_elem(k, v):
    """Yield (key, value) pairs, expanding a dicom MultiValue into numbered keys.

    A scalar yields itself under *k*; a MultiValue yields one pair per item,
    the first under the bare key and the rest suffixed with their index.
    """
    if isinstance(v, DcmMultiValue):
        for idx, item in enumerate(v):
            suffix = '' if idx == 0 else str(idx)
            yield f'{k}{suffix}', item
    else:
        yield k, v
| def _cast_dicom_special(x): | |
| """Attempt to convert pydicom types to primitives.""" | |
| cls = type(x) | |
| if not cls.__module__.startswith('pydicom'): return x | |
| if cls.__base__ == object: return x | |
| if hasattr(x, '__len__') and len(x) == 0: return [] | |
| try: | |
| # sometimes this requires extract args | |
| return cls.__base__(x) | |
| except Exception as e: | |
| log.debug(f"Error while processing tag x={x}, e={e}") | |
| return str(cls) | |
def as_dict(self: DcmDataset, px_summ: bool = False) -> dict:
    """Convert the header of a dicom dataset into a json-serialisable dict.

    Args:
        self: the pydicom dataset.
        px_summ: if True, also compute summary statistics of the pixel array
            (requires the pixel data to have been read, i.e. the file was
            opened without stop_before_pixels).

    Returns:
        dict of tag keyword -> primitive value; with px_summ, also 'img_*'
        statistics (zero-filled if the pixel data could not be decoded).
    """
    pxdata = (0x7FE0, 0x0010)  # the PixelData tag — skipped here, handled via px_summ below
    res = {}
    for k in self.keys():
        if k != pxdata:
            try:
                tag = self[k]
                value = tag.value
                keyword = tag.keyword
                for keyword, value in _split_elem(keyword, value):
                    value = _cast_dicom_special(value)
                    # if it's not json serialisable, just take the string representation
                    try:
                        pd.io.json.dumps([keyword, value])
                    except Exception:
                        # was a bare `except:` — narrowed so Ctrl-C etc. still propagate
                        value = str(value)
                    # crop huge tags, usually in private or unknown elements
                    if len(str(value)) > 1000:
                        value = str(value)[:1000] + '...'
                    res[keyword] = value
            except Exception as e:
                # best-effort: a single bad tag must not sink the whole header
                log.debug(f"Error while processing tag k={k}, file={self.filename}, e={e}")
    if px_summ:
        # extract a range of stats from the pixel array
        # TODO use pmin, p05, p25, mode, p75, p95, pmax = np.quantile(x.flatten(), [0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0])
        stats = {
            'min': np.min,
            'max': np.max,
            'mean': np.mean,
            'std': np.std,
            'dtype': lambda x: str(x.dtype),
            'mode_bin': lambda x: np.bincount(x.flatten()).argmax(),  # infer from most freq value
            'mode': lambda x: scipy.stats.mode(x.flatten()).mode[0],
            'median': np.median,
            'count': lambda x: np.prod(x.shape),
            'shape_0': lambda x: x.shape[0],
            'shape_1': lambda x: x.shape[1],
            'skew': lambda x: scipy.stats.skew(x.flatten()),
            'p0.05': lambda x: np.quantile(x.flatten(), 0.05),
            'p0.95': lambda x: np.quantile(x.flatten(), 0.95),
            'p0.75': lambda x: np.quantile(x.flatten(), 0.75),
            'p0.25': lambda x: np.quantile(x.flatten(), 0.25),
            'p0.5': lambda x: np.quantile(x.flatten(), 0.5),
        }
        try:
            pxs = self.pixel_array
            for n, f in stats.items():
                res['img_' + n] = f(pxs)
        except Exception as e:
            # zero-fill so downstream dataframe columns stay consistent across files
            for n in stats:
                res['img_' + n] = 0
            log.debug(
                f"Error while processing pixel data {self.filename}, e={e}"
            )
    # round-trip through pandas' json encoder to coerce everything to primitives
    # NOTE(review): pd.io.json.dumps was removed in newer pandas — confirm the pinned version
    res = json.loads(pd.io.json.dumps(res))
    return res
def header2json(f: str, px_summ=False, defer_size='64 KB', force=True):
    """Read a dicom file and return its header as a json-able dict.

    Args:
        f: path to the dicom file.
        px_summ: also read the pixel data and add 'img_*' summary stats (slower).
        defer_size: pydicom defers reading element values larger than this.
        force: read even if the file lacks the DICM preamble.

    Returns:
        dict from as_dict, plus 'path' and the file's st_size/st_ctime/st_mtime.
    """
    # pydicom.read_file was a deprecated alias removed in pydicom 3.x; dcmread is the canonical name
    # NOTE(review): this relies on the dataset being usable as a context manager — confirm for the pinned pydicom version
    with pydicom.dcmread(f, stop_before_pixels=not px_summ, force=force, defer_size=defer_size) as dcm:
        header = as_dict(dcm, px_summ=px_summ)
        header['path'] = str(f)
        # filesystem metadata, handy for change detection / sorting by acquisition time
        s = Path(f).stat()
        header['st_size'] = s.st_size
        header['st_ctime'] = s.st_ctime
        header['st_mtime'] = s.st_mtime
        return header
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment