Skip to content

Instantly share code, notes, and snippets.

@tashrifbillah
Last active January 22, 2022 22:37
Show Gist options
  • Save tashrifbillah/24efeec3219ba3c58c92adc419aac7be to your computer and use it in GitHub Desktop.
Save tashrifbillah/24efeec3219ba3c58c92adc419aac7be to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# cleaned up Kevin's work to make DPdash compatible files
# inserted codes for mtime: the latest time when a subject's data was downloaded
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import time, timedelta, datetime, date
flow_test_root = Path('/data/predict/kcho/flow_test')
pronet_phoenix_dir = flow_test_root / 'Pronet/PHOENIX'
prescient_phoenix_dir = flow_test_root / 'Prescient/PHOENIX'
outdir = flow_test_root/ 'ctime_experiment'
from os import stat
from os.path import isfile
from datetime import datetime
def _latest_mtime(p):
latest= -1
for file in p.rglob('*'):
if file.is_file():
mtime= file.stat().st_mtime
if mtime>latest:
latest= mtime
return datetime.fromtimestamp(latest).strftime('%Y-%m-%d')
def get_summary_from_phoenix(phoenix_dir: Path) -> pd.DataFrame:
'''Get summary from the PHOENIX structure'''
subject_paths = list(phoenix_dir.glob('*/*/*/*'))
df = pd.DataFrame({'p': subject_paths})
df['subject'] = df.p.apply(lambda x: x.name)
df['site'] = df.p.apply(lambda x: x.parent.parent.name)
df['level0'] = df.p.apply(lambda x: x.parent.parent.parent.name)
df['level1'] = df.p.apply(lambda x: x.parent.name)
df['surveys'] = df.p.apply(lambda x: len(list((x / 'surveys').glob('*json'))) > 0)
df['eeg'] = df.p.apply(lambda x: len(list((x / 'eeg').glob('*zip'))) > 0)
df['eeg_ss'] = df.p.apply(lambda x: (x / 'eeg' / f'{x.name}.Pronet.Run_sheet_eeg.csv').is_file())
df['actigraphy'] = df.p.apply(lambda x: len(list((x / 'actigraphy').glob('*zip'))) > 0)
df['actigraphy_ss'] = df.p.apply(lambda x: (x / 'actigraphy' / f'{x.name}.Pronet.Run_sheet_actigraphy.csv').is_file())
df['mri'] = df.p.apply(lambda x: len([x for x in (x / 'mri').glob('*') if x.is_dir()]) > 0)
df['mri_ss'] = df.p.apply(lambda x: (x / 'mri' / f'{x.name}.Pronet.Run_sheet_mri.csv').is_file())
df['interviews'] = df.p.apply(lambda x: len([x for x in (x / 'interviews').glob('*') if x.is_dir()]) > 0)
df['interviews_ss'] = df.p.apply(lambda x: (x / 'interviews' / f'{x.name}.Pronet.Run_sheet_interviews.csv').is_file())
df['mtime']= df.p.apply(lambda x: _latest_mtime(x))
return df
df = get_summary_from_phoenix(pronet_phoenix_dir)
df_pivot = pd.pivot_table(df, index=['subject', 'site', 'mtime'], columns=['level0', 'level1'], fill_value=False).astype(int)
for (subject, site, mtime), row in df_pivot.iterrows():
df_tmp = row.reset_index()
df_tmp.columns = ['datatype', 'level0', 'level1', 'count']
df_tmp_pivot = pd.pivot_table(df_tmp, columns=['datatype', 'level0', 'level1']).reset_index()
df_tmp_pivot['col'] = df_tmp_pivot['datatype'] + '_' + df_tmp_pivot['level1'] + '_' + df_tmp_pivot['level0']
subject_series_tmp = df_tmp_pivot.set_index('col')[0]
subject_series_tmp['mtime']= mtime
# https://gist.github.com/tashrifbillah/cea43521588adf127cae79353ae09968
# suggestion from Tashrif to link outputs to DPdash
subject_df_tmp = pd.DataFrame({
'day': [1],
'reftime': '',
'timeofday': '',
'weekday': ''
})
subject_df_tmp = pd.concat([subject_df_tmp, pd.DataFrame(subject_series_tmp).T], axis=1)
out_file_name = f"{site[-2:]}-{subject}-flowcheck-day1to1.csv"
subject_df_tmp.to_csv(outdir/out_file_name, index=False)
@tashrifbillah
Copy link
Author

tashrifbillah commented Jan 21, 2022

Hi @kcho, can you help with this error? I am also trying by the way.

(base) [tb571@eris2n4 flow_test]$ pwd
/data/predict/kcho/flow_test

(base) [tb571@eris2n4 flow_test]$ ./files_status_for_dpdash.py

Traceback (most recent call last):
  File "/data/predict/kcho/flow_test/./files_status_for_dpdash.py", line 51, in <module>
    df_tmp_pivot['col'] = df_tmp_pivot['datatype'] + '_' + df_tmp_pivot['level1'] + '_' + df_tmp_pivot['level0']
  File "/PHShome/tb571/miniconda3/lib/python3.9/site-packages/pandas/core/frame.py", line 3454, in __getitem__
    return self._getitem_multilevel(key)
  File "/PHShome/tb571/miniconda3/lib/python3.9/site-packages/pandas/core/frame.py", line 3505, in _getitem_multilevel
    loc = self.columns.get_loc(key)
  File "/PHShome/tb571/miniconda3/lib/python3.9/site-packages/pandas/core/indexes/multi.py", line 2922, in get_loc
    loc = self._get_level_indexer(key, level=0)
  File "/PHShome/tb571/miniconda3/lib/python3.9/site-packages/pandas/core/indexes/multi.py", line 3204, in _get_level_indexer
    idx = self._get_loc_single_level_index(level_index, key)
  File "/PHShome/tb571/miniconda3/lib/python3.9/site-packages/pandas/core/indexes/multi.py", line 2855, in _get_loc_single_levelex
    return level_index.get_loc(key)
  File "/PHShome/tb571/miniconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 3363, in get_loc
    raise KeyError(key) from err
KeyError: 'datatype'

@tashrifbillah
Copy link
Author

tashrifbillah commented Jan 21, 2022

This is what line 51 df_tmp_pivot looks like but it cannot find any of datatype, level0, or level1:

(Pdb) df_tmp_pivot['datatype']
*** KeyError: 'datatype'

(Pdb) df_tmp_pivot['level0']
*** KeyError: 'level0'

(Pdb) df_tmp_pivot.columns
MultiIndex([(        'index',          '',          ''),
            (   'actigraphy',   'GENERAL', 'processed'),
            (   'actigraphy', 'PROTECTED', 'processed'),
            (   'actigraphy', 'PROTECTED',       'raw'),
            ('actigraphy_ss',   'GENERAL', 'processed'),
            ('actigraphy_ss', 'PROTECTED', 'processed'),
            ('actigraphy_ss', 'PROTECTED',       'raw'),
            (          'eeg',   'GENERAL', 'processed'),
            (          'eeg', 'PROTECTED', 'processed'),
            (          'eeg', 'PROTECTED',       'raw'),
            (       'eeg_ss',   'GENERAL', 'processed'),
            (       'eeg_ss', 'PROTECTED', 'processed'),
            (       'eeg_ss', 'PROTECTED',       'raw'),
            (   'interviews',   'GENERAL', 'processed'),
            (   'interviews', 'PROTECTED', 'processed'),
            (   'interviews', 'PROTECTED',       'raw'),
            ('interviews_ss',   'GENERAL', 'processed'),
            ('interviews_ss', 'PROTECTED', 'processed'),
            ('interviews_ss', 'PROTECTED',       'raw'),
            (          'mri',   'GENERAL', 'processed'),
            (          'mri', 'PROTECTED', 'processed'),
            (          'mri', 'PROTECTED',       'raw'),
            (       'mri_ss',   'GENERAL', 'processed'),
            (       'mri_ss', 'PROTECTED', 'processed'),
            (       'mri_ss', 'PROTECTED',       'raw'),
            (      'surveys',   'GENERAL', 'processed'),
            (      'surveys', 'PROTECTED', 'processed'),
            (      'surveys', 'PROTECTED',       'raw')],
           names=['datatype', 'level0', 'level1'])

I suppose we need to access datatype, level0, level1 in a different way.

@tashrifbillah
Copy link
Author

The above error happened at Tashrif's latest Python+Pandas:

(base) [tb571@eris2n4 flow_test]$ ipython
Python 3.9.5 (default, Jun  4 2021, 12:28:51)

In [1]: import pandas as pd

In [2]: pd.__version__
Out[2]: '1.3.2'

But did not happen with Kevin's old one:

MicrosoftTeams-image

@tashrifbillah
Copy link
Author

tashrifbillah commented Jan 21, 2022

The problem I am having right now is making line 36 appear in the final csv files:

df['mtime']= df.p.apply(lambda x: '2021-01-21')

In reference to gist https://gist.github.com/tashrifbillah/24efeec3219ba3c58c92adc419aac7be

@kcho
Copy link

kcho commented Jan 22, 2022

if you want all the rows for the mtime column to have '2021-01-21', you can simply use df['mtime'] = '2021-01-21'

@tashrifbillah
Copy link
Author

🍔

@tashrifbillah
Copy link
Author

In reference to gist https://gist.github.com/tashrifbillah/24efeec3219ba3c58c92adc419aac7be
mtime has been made to appear in the final data frame through following modifications:

43 df_pivot = pd.pivot_table(df, index=['subject', 'site', 'mtime'], columns=['level0', 'level1'], fill_value=False).astype(int)
49 for (subject, site, mtime), row in df_pivot.iterrows():

And finally inserting after line 54:

subject_series_tmp['mtime']= mtime

@tashrifbillah
Copy link
Author

Updated the gist with _latest_mtime()
https://gist.github.com/tashrifbillah/24efeec3219ba3c58c92adc419aac7be

Now final data frame looks like the following

day,reftime,timeofday,weekday,actigraphy_processed_GENERAL,actigraphy_processed_PROTECTED,actigraphy_raw_PROTECTED,actigraphy_ss_processed_GENERAL,actigraphy_ss_processed_PROTECTED,actigraphy_ss_raw_PROTECTED,eeg_processed_GENERAL,eeg_processed_PROTECTED,eeg_raw_PROTECTED,eeg_ss_processed_GENERAL,eeg_ss_processed_PROTECTED,eeg_ss_raw_PROTECTED,interviews_processed_GENERAL,interviews_processed_PROTECTED,interviews_raw_PROTECTED,interviews_ss_processed_GENERAL,interviews_ss_processed_PROTECTED,interviews_ss_raw_PROTECTED,mri_processed_GENERAL,mri_processed_PROTECTED,mri_raw_PROTECTED,mri_ss_processed_GENERAL,mri_ss_processed_PROTECTED,mri_ss_raw_PROTECTED,surveys_processed_GENERAL,surveys_processed_PROTECTED,surveys_raw_PROTECTED,mtime
1,,,,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2021-12-21

@tashrifbillah
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment