Skip to content

Instantly share code, notes, and snippets.

@tashrifbillah
Last active August 13, 2024 22:28
Show Gist options
  • Save tashrifbillah/557d5564c511e5f35e63ff93a5d4ad97 to your computer and use it in GitHub Desktop.
Save tashrifbillah/557d5564c511e5f35e63ff93a5d4ad97 to your computer and use it in GitHub Desktop.
Troubleshoot omission of all PRESCIENT cases from ampscz_psychs01_{baseline,month_1,month_2}
#!/usr/bin/env python
# cd /data/predict1/to_nda/nda-submissions/network_combined

# Imports needed by the snippets in this file (they were missing, so the
# script failed with NameError on its first statement).
import json
from os.path import basename

import pandas as pd

# Subject list indexed by subject ID; its 'network' column is compared against
# 'PRESCIENT' / 'ProNET' throughout this script.
df = pd.read_csv('/data/predict1/home/dm1447/data/ampscz_all_subjects.csv')
df.set_index('subject_id', inplace=True)

# PRESCIENT form status tracker indexed by subject ID.
dfpre = pd.read_excel('form_status_tracker_PRESCIENT.xlsx')
dfpre.set_index('subject', inplace=True)

# header=1 takes column names from the second line of the file;
# dtype=str keeps every value as text.
dfpsychs = pd.read_csv('original/ampscz_psychs01_baseline.csv', header=1, dtype=str)

# For each PRESCIENT subject in the original baseline file, print the tracker
# values of the two baseline follow-up PSYCHS columns.
for s in dfpsychs['src_subject_id']:
    if df.loc[s, 'network'] == 'PRESCIENT':
        print(s, dfpre.loc[s, ['psychs_p1p8_fu_baseline', 'psychs_p9ac32_fu_baseline']].values)
# how many of the original PRESCIENT subjects have empty psychs_screening columns
count = 0
for s in dfpsychs['src_subject_id']:
    # Only PRESCIENT subjects are of interest here.
    if df.loc[s, 'network'] != 'PRESCIENT':
        continue
    # Skip subjects for whom at least one screening column is filled in.
    if not (pd.isna(dfpre.loc[s, 'psychs_p1p8_screening'])
            and pd.isna(dfpre.loc[s, 'psychs_p9ac32_screening'])):
        continue
    count += 1
    # Show the baseline follow-up columns for the subjects being counted.
    print(s, dfpre.loc[s, ['psychs_p1p8_fu_baseline', 'psychs_p9ac32_fu_baseline']].values)
print(count)
# how many of the original ProNET subjects have empty psychs_screening columns
# ProNET form status tracker indexed by subject ID.
dfpro = pd.read_excel('form_status_tracker_PRONET.xlsx')
dfpro.set_index('subject', inplace=True)

count = 0
for s in dfpsychs['src_subject_id']:
    # Only ProNET subjects are of interest here.
    if df.loc[s, 'network'] != 'ProNET':
        continue
    # Skip subjects for whom at least one screening column is filled in.
    if not (pd.isna(dfpro.loc[s, 'psychs_p1p8_screening'])
            and pd.isna(dfpro.loc[s, 'psychs_p9ac32_screening'])):
        continue
    count += 1
    # Show the baseline follow-up columns for the subjects being counted.
    print(s, dfpro.loc[s, ['psychs_p1p8_fu_baseline', 'psychs_p9ac32_fu_baseline']].values)
print(count)
# files are saved in /tmp/ of dn020
tracker = pd.read_csv('/tmp/combined_tracker.csv')
tracker.set_index('subject', inplace=True)

# Print the combined tracker's psychs_baseline value for every PRESCIENT
# subject present in the original baseline file.
for s in dfpsychs['src_subject_id']:
    if df.loc[s, 'network'] != 'PRESCIENT':
        continue
    print(tracker.loc[s, 'psychs_baseline'])
# output shows that all original/ampscz_psychs01_baseline.csv PRESCIENT cases are "omit"
# next diagnosis should be why all are omit
# =======================================================================================
# working code for loading p1p8 CSV files and generating one combined CSV
# NOTE(review): `p1p8_files` (iterable of CSV paths) and `vars` (iterable of
# column names; it shadows the builtin) are not defined in this file and must
# exist before this section runs.
rows = []
for f1 in p1p8_files:
    # The subject ID is the part of the file name before the first underscore.
    s = basename(f1).split('_')[0]
    df1 = pd.read_csv(f1, dtype=str)

    # extract baseline (rows whose visit is '2')
    baseline = df1[df1['visit'] == '2']
    if baseline.empty:
        # No baseline visit in this file.
        continue
    if len(baseline) > 1:
        # More than one baseline row: report the subject and skip it.
        print(s)
        continue

    record = baseline.iloc[0]
    row = {'src_subject_id': s}
    # Copy only the requested variables that this file actually provides.
    row.update({v: record[v] for v in vars if v in df1.columns})
    rows.append(row)

# Build the frame once instead of concatenating inside the loop (which copies
# the accumulated frame on every iteration). The fallback keeps the
# 'src_subject_id' column available even when no file qualified.
dfcomb = pd.DataFrame(rows) if rows else pd.DataFrame(columns=['src_subject_id'])
print(dfcomb.shape)
# working code for loading JSONs and generating one combined CSV
# NOTE(review): `vars` (iterable of variable names) is not defined in this
# file and must exist before this section runs.
records = []
for s in dfcomb['src_subject_id'].values:
    # The first two characters of the subject ID select the Prescient?? folder.
    f1 = f'Prescient{s[:2]}/raw/{s}/surveys/{s}.Prescient.json'
    with open(f1) as f:
        data = json.load(f)

    row = {'src_subject_id': s}
    # Only the first record of the baseline event is used.
    baseline = next(
        (d for d in data if d['redcap_event_name'] == 'baseline_arm_1'),
        None,
    )
    if baseline is not None:
        # Membership must be tested against the JSON record; testing against
        # the output dict (as before) never matched, so nothing was copied.
        row.update({v: baseline[v] for v in vars if v in baseline})
    records.append(row)

# Build the frame once instead of concatenating inside the loop. The fallback
# keeps the 'src_subject_id' column available even when there were no subjects.
dfcomb2 = pd.DataFrame(records) if records else pd.DataFrame(columns=['src_subject_id'])
print(dfcomb2.shape)
# keep only common columns
# Collect the columns of the JSON-derived frame that the CSV-derived frame
# lacks, then remove them in place.
drop = [c for c in dfcomb2.columns if c not in dfcomb.columns]
dfcomb2.drop(columns=drop, inplace=True)
# issue with this approach is that HC follow-up forms got mixed into the combined df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment