snakers4 · March 16, 2017 09:53
diff --git a/console_python_corr_analysis b/console_python_corr_analysis
 # Unsophisticated correlation analysis to deal with variable bias:
 # find every pair of columns in sDf.corr() whose coefficient is at least
 # `threshold` in magnitude, print the pairs strongest-first and collect
 # them in corrDf (columns 'A', 'B', 'corr').
 data_corr = sDf.corr()
 # NOTE(review): shape[0] - 1 keeps the last column out of every pair --
 # presumably it is the target column; confirm.
 size = data_corr.shape[0] - 1


 # Set the threshold to select only highly correlated attributes
 threshold = 0.5

 # List of [correlation, i, j] entries for the pairs above the threshold
 corr_list = []

 # Search for the highly correlated pairs
 for i in range(0, size):  # for 'size' features
     for j in range(i + 1, size):  # j > i only, to avoid repetition
         value = data_corr.iloc[i, j]  # read the coefficient once per pair
         # A coefficient of exactly 1 is skipped, while -1 is kept.
         if (threshold <= value < 1) or (value < 0 and value <= -threshold):
             corr_list.append([value, i, j])  # store correlation and columns index

 # Sort to show higher ones (by absolute value) first
 s_corr_list = sorted(corr_list, key=lambda x: -abs(x[0]))

 # i and j are positions in data_corr, so the names are taken from data_corr;
 # they can differ from sDf.columns if corr() leaves non-numeric columns out.
 cols = list(data_corr.columns.values)

 # Print correlations and column names, gathering one row per pair.
 # The rows are collected first and turned into a DataFrame in one go:
 # assigning corrDf.loc[i] per pair would reuse the label i and keep only
 # the last pair found for each first column.
 rows = []
 for v, i, j in s_corr_list:
     print("%s and %s = %.2f" % (cols[i], cols[j], v))
     rows.append((cols[i], cols[j], v))
 corrDf = pd.DataFrame(rows, columns=['A', 'B', 'corr'])
	# Unsophisticated correlation analysis to deal with variable bias:
	# find every pair of columns in sDf.corr() whose coefficient is at least
	# `threshold` in magnitude, print the pairs strongest-first and collect
	# them in corrDf (columns 'A', 'B', 'corr').
	data_corr = sDf.corr()
	# NOTE(review): shape[0] - 1 keeps the last column out of every pair --
	# presumably it is the target column; confirm.
	size = data_corr.shape[0] - 1


	# Set the threshold to select only highly correlated attributes
	threshold = 0.5

	# List of [correlation, i, j] entries for the pairs above the threshold
	corr_list = []

	# Search for the highly correlated pairs
	for i in range(0, size):  # for 'size' features
		for j in range(i + 1, size):  # j > i only, to avoid repetition
			value = data_corr.iloc[i, j]  # read the coefficient once per pair
			# A coefficient of exactly 1 is skipped, while -1 is kept.
			if (threshold <= value < 1) or (value < 0 and value <= -threshold):
				corr_list.append([value, i, j])  # store correlation and columns index

	# Sort to show higher ones (by absolute value) first
	s_corr_list = sorted(corr_list, key=lambda x: -abs(x[0]))

	# i and j are positions in data_corr, so the names are taken from data_corr;
	# they can differ from sDf.columns if corr() leaves non-numeric columns out.
	cols = list(data_corr.columns.values)

	# Print correlations and column names, gathering one row per pair.
	# The rows are collected first and turned into a DataFrame in one go:
	# assigning corrDf.loc[i] per pair would reuse the label i and keep only
	# the last pair found for each first column.
	rows = []
	for v, i, j in s_corr_list:
		print("%s and %s = %.2f" % (cols[i], cols[j], v))
		rows.append((cols[i], cols[j], v))
	corrDf = pd.DataFrame(rows, columns=['A', 'B', 'corr'])