tashrifbillah · August 13, 2024 22:28
diff --git a/debug_psychs01.py b/debug_psychs01.py
 #!/usr/bin/env python

 # cd /data/predict1/to_nda/nda-submissions/network_combined

 # table of all AMP-SCZ subjects, indexed by subject ID
 # (its 'network' column is looked up by the loops below)
 df=pd.read_csv('/data/predict1/home/dm1447/data/ampscz_all_subjects.csv').set_index('subject_id')

 # PRESCIENT form status tracker, indexed by subject ID
 dfpre=pd.read_excel('form_status_tracker_PRESCIENT.xlsx').set_index('subject')

 # psychs01 baseline sheet from original/: column names come from the
 # second line of the file (header=1) and every value is read as a string
 dfpsychs=pd.read_csv('original/ampscz_psychs01_baseline.csv',header=1,dtype=str)

 # for every PRESCIENT subject listed in the psychs01 sheet, print the subject ID
 # together with the two baseline psychs columns of the PRESCIENT tracker
 fu_cols=['psychs_p1p8_fu_baseline','psychs_p9ac32_fu_baseline']
 for s in dfpsychs['src_subject_id']:
     # skip subjects that belong to another network
     if df.loc[s,'network']!='PRESCIENT':
         continue
     print(s,dfpre.loc[s,fu_cols].values)


        
 # count the PRESCIENT subjects of the original sheet whose two psychs screening
 # columns are both empty in the tracker; print their fu_baseline columns as we go
 count=0
 for s in dfpsychs['src_subject_id']:
     if df.loc[s,'network']!='PRESCIENT':
         continue

     # both screening cells must be NaN for the subject to be counted
     screening_empty=(pd.isna(dfpre.loc[s,'psychs_p1p8_screening'])
                      and pd.isna(dfpre.loc[s,'psychs_p9ac32_screening']))
     if not screening_empty:
         continue

     count+=1
     print(s,dfpre.loc[s,['psychs_p1p8_fu_baseline','psychs_p9ac32_fu_baseline']].values)

 print(count)


 # same count for ProNET: subjects of the original sheet whose two psychs
 # screening columns are both empty in the ProNET tracker
 dfpro=pd.read_excel('form_status_tracker_PRONET.xlsx').set_index('subject')

 count=0
 for s in dfpsychs['src_subject_id']:
     if df.loc[s,'network']!='ProNET':
         continue

     # both screening cells must be NaN for the subject to be counted
     screening_empty=(pd.isna(dfpro.loc[s,'psychs_p1p8_screening'])
                      and pd.isna(dfpro.loc[s,'psychs_p9ac32_screening']))
     if not screening_empty:
         continue

     count+=1
     print(s,dfpro.loc[s,['psychs_p1p8_fu_baseline','psychs_p9ac32_fu_baseline']].values)

 print(count)



 # the combined tracker is saved in /tmp/ of dn020
 tracker=pd.read_csv('/tmp/combined_tracker.csv').set_index('subject')

 # print the tracker's psychs_baseline value for every PRESCIENT subject of the sheet
 prescient_ids=(s for s in dfpsychs['src_subject_id'] if df.loc[s,'network']=='PRESCIENT')
 for s in prescient_ids:
     print(tracker.loc[s,'psychs_baseline'])

 # finding: all PRESCIENT cases of original/ampscz_psychs01_baseline.csv are "omit"
 # next step: diagnose why all of them are omit







 # =======================================================================================
 # load every p1p8 CSV file, take its baseline row (visit '2') and
 # stack the selected variables into one combined DataFrame, dfcomb
 # NOTE: p1p8_files, vars and basename must already be defined in the session
 frames=[]
 for f1 in p1p8_files:
     # subject ID is the part of the file name before the first underscore
     s=basename(f1).split('_')[0]
     df1=pd.read_csv(f1,dtype=str)

     # extract baseline: a list indexer always yields a DataFrame, so a single
     # lookup serves both the "missing" and the "duplicated" check
     try:
         base=df1.set_index('visit').loc[['2']]
     except KeyError:
         # this file has no baseline visit
         continue

     if base.shape[0]>1:
         # more than one baseline row: report the subject and skip it
         print(s)
         continue

     row=base.iloc[0]
     dict1={'src_subject_id':s}
     for v in vars:
         if v in df1.columns:
             dict1[v]=row[v]

     frames.append(pd.DataFrame.from_dict([dict1]))

 # concatenate once at the end; growing dfcomb inside the loop re-copies all
 # previous rows on every iteration. With no usable file, fall back to an empty
 # frame so the shape print (and any loop over src_subject_id) does not crash on None.
 if frames:
     dfcomb=pd.concat(frames,axis=0,sort=False)
 else:
     dfcomb=pd.DataFrame(columns=['src_subject_id'])

 print(dfcomb.shape)


 # load the survey JSON of every subject in dfcomb, take its first
 # 'baseline_arm_1' record and stack the selected variables into dfcomb2
 # NOTE: vars and json must already be defined/imported in the session
 frames2=[]
 for s in dfcomb['src_subject_id'].values:
     f1=f'Prescient{s[:2]}/raw/{s}/surveys/{s}.Prescient.json'

     with open(f1) as f:
         data=json.load(f)

     dict1={'src_subject_id':s}
     for d in data:
         if d['redcap_event_name']=='baseline_arm_1':
             for v in vars:
                 # test membership in the record itself: the previous `v in dict1`
                 # checked the output dict, which only holds 'src_subject_id' here,
                 # so no variable was ever copied
                 if v in d:
                     dict1[v]=d[v]
             # only the first baseline record is used
             break

     frames2.append(pd.DataFrame.from_dict([dict1]))

 # concatenate once at the end instead of re-copying dfcomb2 on every iteration;
 # fall back to an empty frame so the shape print does not crash on None
 if frames2:
     dfcomb2=pd.concat(frames2,axis=0,sort=False)
 else:
     dfcomb2=pd.DataFrame(columns=['src_subject_id'])

 print(dfcomb2.shape)


 # keep only common columns: remove from dfcomb2 every column that dfcomb lacks
 drop=[c for c in dfcomb2.columns if c not in dfcomb.columns]
 dfcomb2.drop(drop,axis=1,inplace=True)

 # issue with this approach is that HC follow-up forms got mixed into the combined df
	#!/usr/bin/env python

	# cd /data/predict1/to_nda/nda-submissions/network_combined

	# table of all AMP-SCZ subjects, indexed by subject ID
	# (its 'network' column is looked up by the loops below)
	df=pd.read_csv('/data/predict1/home/dm1447/data/ampscz_all_subjects.csv').set_index('subject_id')

	# PRESCIENT form status tracker, indexed by subject ID
	dfpre=pd.read_excel('form_status_tracker_PRESCIENT.xlsx').set_index('subject')

	# psychs01 baseline sheet from original/: column names come from the
	# second line of the file (header=1) and every value is read as a string
	dfpsychs=pd.read_csv('original/ampscz_psychs01_baseline.csv',header=1,dtype=str)

	# for every PRESCIENT subject listed in the psychs01 sheet, print the subject ID
	# together with the two baseline psychs columns of the PRESCIENT tracker
	fu_cols=['psychs_p1p8_fu_baseline','psychs_p9ac32_fu_baseline']
	for s in dfpsychs['src_subject_id']:
	    # skip subjects that belong to another network
	    if df.loc[s,'network']!='PRESCIENT':
	        continue
	    print(s,dfpre.loc[s,fu_cols].values)



	# count the PRESCIENT subjects of the original sheet whose two psychs screening
	# columns are both empty in the tracker; print their fu_baseline columns as we go
	count=0
	for s in dfpsychs['src_subject_id']:
	    if df.loc[s,'network']!='PRESCIENT':
	        continue

	    # both screening cells must be NaN for the subject to be counted
	    screening_empty=(pd.isna(dfpre.loc[s,'psychs_p1p8_screening'])
	                     and pd.isna(dfpre.loc[s,'psychs_p9ac32_screening']))
	    if not screening_empty:
	        continue

	    count+=1
	    print(s,dfpre.loc[s,['psychs_p1p8_fu_baseline','psychs_p9ac32_fu_baseline']].values)

	print(count)


	# same count for ProNET: subjects of the original sheet whose two psychs
	# screening columns are both empty in the ProNET tracker
	dfpro=pd.read_excel('form_status_tracker_PRONET.xlsx').set_index('subject')

	count=0
	for s in dfpsychs['src_subject_id']:
	    if df.loc[s,'network']!='ProNET':
	        continue

	    # both screening cells must be NaN for the subject to be counted
	    screening_empty=(pd.isna(dfpro.loc[s,'psychs_p1p8_screening'])
	                     and pd.isna(dfpro.loc[s,'psychs_p9ac32_screening']))
	    if not screening_empty:
	        continue

	    count+=1
	    print(s,dfpro.loc[s,['psychs_p1p8_fu_baseline','psychs_p9ac32_fu_baseline']].values)

	print(count)



	# the combined tracker is saved in /tmp/ of dn020
	tracker=pd.read_csv('/tmp/combined_tracker.csv').set_index('subject')

	# print the tracker's psychs_baseline value for every PRESCIENT subject of the sheet
	prescient_ids=(s for s in dfpsychs['src_subject_id'] if df.loc[s,'network']=='PRESCIENT')
	for s in prescient_ids:
	    print(tracker.loc[s,'psychs_baseline'])

	# finding: all PRESCIENT cases of original/ampscz_psychs01_baseline.csv are "omit"
	# next step: diagnose why all of them are omit







	# =======================================================================================
	# load every p1p8 CSV file, take its baseline row (visit '2') and
	# stack the selected variables into one combined DataFrame, dfcomb
	# NOTE: p1p8_files, vars and basename must already be defined in the session
	frames=[]
	for f1 in p1p8_files:
	    # subject ID is the part of the file name before the first underscore
	    s=basename(f1).split('_')[0]
	    df1=pd.read_csv(f1,dtype=str)

	    # extract baseline: a list indexer always yields a DataFrame, so a single
	    # lookup serves both the "missing" and the "duplicated" check
	    try:
	        base=df1.set_index('visit').loc[['2']]
	    except KeyError:
	        # this file has no baseline visit
	        continue

	    if base.shape[0]>1:
	        # more than one baseline row: report the subject and skip it
	        print(s)
	        continue

	    row=base.iloc[0]
	    dict1={'src_subject_id':s}
	    for v in vars:
	        if v in df1.columns:
	            dict1[v]=row[v]

	    frames.append(pd.DataFrame.from_dict([dict1]))

	# concatenate once at the end; growing dfcomb inside the loop re-copies all
	# previous rows on every iteration. With no usable file, fall back to an empty
	# frame so the shape print (and any loop over src_subject_id) does not crash on None.
	if frames:
	    dfcomb=pd.concat(frames,axis=0,sort=False)
	else:
	    dfcomb=pd.DataFrame(columns=['src_subject_id'])

	print(dfcomb.shape)


	# load the survey JSON of every subject in dfcomb, take its first
	# 'baseline_arm_1' record and stack the selected variables into dfcomb2
	# NOTE: vars and json must already be defined/imported in the session
	frames2=[]
	for s in dfcomb['src_subject_id'].values:
	    f1=f'Prescient{s[:2]}/raw/{s}/surveys/{s}.Prescient.json'

	    with open(f1) as f:
	        data=json.load(f)

	    dict1={'src_subject_id':s}
	    for d in data:
	        if d['redcap_event_name']=='baseline_arm_1':
	            for v in vars:
	                # test membership in the record itself: the previous `v in dict1`
	                # checked the output dict, which only holds 'src_subject_id' here,
	                # so no variable was ever copied
	                if v in d:
	                    dict1[v]=d[v]
	            # only the first baseline record is used
	            break

	    frames2.append(pd.DataFrame.from_dict([dict1]))

	# concatenate once at the end instead of re-copying dfcomb2 on every iteration;
	# fall back to an empty frame so the shape print does not crash on None
	if frames2:
	    dfcomb2=pd.concat(frames2,axis=0,sort=False)
	else:
	    dfcomb2=pd.DataFrame(columns=['src_subject_id'])

	print(dfcomb2.shape)


	# keep only common columns: remove from dfcomb2 every column that dfcomb lacks
	drop=[c for c in dfcomb2.columns if c not in dfcomb.columns]
	dfcomb2.drop(drop,axis=1,inplace=True)

	# issue with this approach is that HC follow-up forms got mixed into the combined df