Last active
June 9, 2021 01:12
-
-
Save wassname/a2cdf0b9b511f8a4769cbbe040a87900 to your computer and use it in GitHub Desktop.
dicoms2df.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| read dicom header and cache | |
| url https://gist.github.com/wassname/a2cdf0b9b511f8a4769cbbe040a87900 | |
| """ | |
| from diskcache import Cache, JSONDisk | |
| import pandas as pd | |
| from tqdm.contrib.concurrent import thread_map, process_map | |
| import logging | |
| from pathlib import Path | |
| from functools import partial | |
| from . dcm2json import header2json | |
# on-disk cache for parsed headers; JSONDisk keeps entries readable and compressed
cache_dir = Path('data/.cache')  # was f'data/.cache' — the f-string had no placeholders
cache = Cache(cache_dir / 'dcm2dict.jsdb', disk=JSONDisk, disk_compress_level=6)
@cache.memoize()
def header2json_cache(f: str, px_summ=False):
    """Read a dicom header as a json-able dict, memoized on (f, px_summ) in the disk cache.

    Args:
        f: path to the dicom file.
        px_summ: also compute pixel-array summary statistics (slower).
    """
    # forward px_summ — it was accepted but dropped, so pixel stats were silently never computed
    return header2json(f, px_summ=px_summ)
def h2j(f, px_summ=False):
    """Best-effort wrapper around header2json_cache: log and return None on failure.

    Returning None (instead of raising) lets a bulk run over thousands of
    dicoms continue past individual corrupt files.
    """
    try:
        return header2json_cache(f, px_summ=px_summ)
    except Exception as e:
        # was `logger.error(...)` but no `logger` was ever defined in this file,
        # so the except path raised NameError and masked the real error
        logging.getLogger(__name__).error(f'error header2json on {f}: {e}')
        return None
def dicoms2df(files: list, px_summ=False, n_workers=None) -> pd.DataFrame:
    """
    Convert a list of dicom file paths to a dataframe of their headers.

    Args:
        files: list of paths (str) to .dcm files.
        px_summ: also compute pixel-array summary statistics (slower).
        n_workers: thread count for thread_map; 0 runs synchronously (easier debugging).

    Returns:
        pd.DataFrame with one row per file plus a 'Filename' column.
        Failed files produce a row of NaNs (h2j returns None for them).

    Usage:
        data_dir = Path('../data/raw/ProstateX/dicom/')
        fs = [str(p) for p in data_dir.glob('**/*.dcm')]
        df_dicom = dicoms2df(fs, n_workers=4)
    """
    # bind px_summ once so both execution paths honour it (it was previously dropped)
    fn = partial(h2j, px_summ=px_summ)
    if n_workers == 0:
        # synchronous path for debugging; tqdm was used here but never imported (NameError)
        from tqdm.auto import tqdm
        headers = [fn(f) for f in tqdm(files)]
    else:
        headers = thread_map(fn, files, max_workers=n_workers)
    df_dicom = pd.DataFrame.from_records(headers)
    df_dicom['Filename'] = files
    return df_dicom
| # and make a version cached to csv.gz | |
| import pickle | |
| import hashlib | |
def md5hash(s: bytes) -> str:
    """Hex md5 digest of *s*.

    Takes bytes, not str — hashlib.md5 rejects str, and the only caller
    passes pickled args (the old `s: str` annotation was wrong).
    """
    return hashlib.md5(s).hexdigest()
def _pd_cache(func):
    """Decorator: cache *func*'s DataFrame result to a csv.gz keyed on a hash of its args.

    The result is always read back from csv, so cached and fresh calls return
    identical dtypes (everything csv round-trips to).
    """
    def wrapper(*args, **kw):
        # the file name embeds a hash of the pickled args and kwargs
        key = pickle.dumps(args, 1) + pickle.dumps(kw, 1)
        hsh = md5hash(key)[:6]  # NOTE: only 6 hex chars — small collision risk across distinct arg sets
        f = cache_dir / f"{func.__name__}_{hsh}.csv.gz"
        if not f.exists():
            # ensure the cache dir exists before writing (to_csv does not create parents)
            cache_dir.mkdir(parents=True, exist_ok=True)
            df = func(*args, **kw)
            logging.debug(f'writing to {f}')
            df.to_csv(f, index=False)
        return pd.read_csv(f)
    return wrapper

# csv.gz-cached version of dicoms2df
dicoms2df_cached = _pd_cache(dicoms2df)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| modified from https://github.com/fastai/fastai/blob/dded785401/fastai/medical/imaging.py#L363 | |
| url https://gist.github.com/wassname/a2cdf0b9b511f8a4769cbbe040a87900 | |
| to fail with logging then continue | |
| and to only read header (much faster) | |
| """ | |
import json
import logging
from pathlib import Path

import numpy as np
import pandas as pd
import pydicom
import scipy.stats
from pydicom.dataset import Dataset as DcmDataset
from pydicom.multival import MultiValue as DcmMultiValue
from pydicom.tag import BaseTag as DcmTag
log = logging.getLogger('dicomheader')
cache_dir = Path('.cache')  # was f'.cache' — the f-string had no placeholders
# debug toggle: uncomment to see per-tag parse failures
# log.setLevel(logging.DEBUG)
def _split_elem(k, v):
    """Yield (key, value) pairs, expanding a dicom MultiValue into numbered keys.

    A scalar yields itself under *k*; a MultiValue yields one pair per item,
    the first under the bare key and the rest suffixed with their index.
    """
    if isinstance(v, DcmMultiValue):
        for idx, item in enumerate(v):
            suffix = '' if idx == 0 else str(idx)
            yield f'{k}{suffix}', item
    else:
        yield k, v
| def _cast_dicom_special(x): | |
| """Attempt to convert pydicom types to primitives.""" | |
| cls = type(x) | |
| if not cls.__module__.startswith('pydicom'): return x | |
| if cls.__base__ == object: return x | |
| if hasattr(x, '__len__') and len(x) == 0: return [] | |
| try: | |
| # sometimes this requires extract args | |
| return cls.__base__(x) | |
| except Exception as e: | |
| log.debug(f"Error while processing tag x={x}, e={e}") | |
| return str(cls) | |
def as_dict(self: DcmDataset, px_summ: bool = False) -> dict:
    """Convert the header of a dicom dataset into a json-serialisable dict.

    Args:
        self: the pydicom dataset.
        px_summ: if True, also compute summary statistics of the pixel array
            (requires the pixel data to have been read, i.e. the file was
            opened without stop_before_pixels).

    Returns:
        dict of tag keyword -> primitive value; with px_summ, also 'img_*'
        statistics (zero-filled if the pixel data could not be decoded).
    """
    pxdata = (0x7FE0, 0x0010)  # the PixelData tag — skipped here, handled via px_summ below
    res = {}
    for k in self.keys():
        if k != pxdata:
            try:
                tag = self[k]
                value = tag.value
                keyword = tag.keyword
                for keyword, value in _split_elem(keyword, value):
                    value = _cast_dicom_special(value)
                    # if it's not json serialisable, just take the string representation
                    try:
                        pd.io.json.dumps([keyword, value])
                    except Exception:
                        # was a bare `except:` — narrowed so Ctrl-C etc. still propagate
                        value = str(value)
                    # crop huge tags, usually in private or unknown elements
                    if len(str(value)) > 1000:
                        value = str(value)[:1000] + '...'
                    res[keyword] = value
            except Exception as e:
                # best-effort: a single bad tag must not sink the whole header
                log.debug(f"Error while processing tag k={k}, file={self.filename}, e={e}")
    if px_summ:
        # extract a range of stats from the pixel array
        # TODO use pmin, p05, p25, mode, p75, p95, pmax = np.quantile(x.flatten(), [0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0])
        stats = {
            'min': np.min,
            'max': np.max,
            'mean': np.mean,
            'std': np.std,
            'dtype': lambda x: str(x.dtype),
            'mode_bin': lambda x: np.bincount(x.flatten()).argmax(),  # infer from most freq value
            'mode': lambda x: scipy.stats.mode(x.flatten()).mode[0],
            'median': np.median,
            'count': lambda x: np.prod(x.shape),
            'shape_0': lambda x: x.shape[0],
            'shape_1': lambda x: x.shape[1],
            'skew': lambda x: scipy.stats.skew(x.flatten()),
            'p0.05': lambda x: np.quantile(x.flatten(), 0.05),
            'p0.95': lambda x: np.quantile(x.flatten(), 0.95),
            'p0.75': lambda x: np.quantile(x.flatten(), 0.75),
            'p0.25': lambda x: np.quantile(x.flatten(), 0.25),
            'p0.5': lambda x: np.quantile(x.flatten(), 0.5),
        }
        try:
            pxs = self.pixel_array
            for n, f in stats.items():
                res['img_' + n] = f(pxs)
        except Exception as e:
            # zero-fill so downstream dataframe columns stay consistent across files
            for n in stats:
                res['img_' + n] = 0
            log.debug(
                f"Error while processing pixel data {self.filename}, e={e}"
            )
    # round-trip through pandas' json encoder to coerce everything to primitives
    # NOTE(review): pd.io.json.dumps was removed in newer pandas — confirm the pinned version
    res = json.loads(pd.io.json.dumps(res))
    return res
def header2json(f: str, px_summ=False, defer_size='64 KB', force=True):
    """Read a dicom file and return its header as a json-able dict.

    Args:
        f: path to the dicom file.
        px_summ: also read the pixel data and add 'img_*' summary stats (slower).
        defer_size: pydicom defers reading element values larger than this.
        force: read even if the file lacks the DICM preamble.

    Returns:
        dict from as_dict, plus 'path' and the file's st_size/st_ctime/st_mtime.
    """
    # pydicom.read_file was a deprecated alias removed in pydicom 3.x; dcmread is the canonical name
    # NOTE(review): this relies on the dataset being usable as a context manager — confirm for the pinned pydicom version
    with pydicom.dcmread(f, stop_before_pixels=not px_summ, force=force, defer_size=defer_size) as dcm:
        header = as_dict(dcm, px_summ=px_summ)
        header['path'] = str(f)
        # filesystem metadata, handy for change detection / sorting by acquisition time
        s = Path(f).stat()
        header['st_size'] = s.st_size
        header['st_ctime'] = s.st_ctime
        header['st_mtime'] = s.st_mtime
        return header
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment