Created
March 16, 2017 09:53
-
-
Save snakers4/bd0c9ce8bc823159907be6f28ef2676f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Unsophisticated correlation analysis to deal with variable bias.
# Expects `sDf` (a pandas DataFrame) and `pd` (pandas) to already be in scope.
data_corr = sDf.corr()
# NOTE(review): the "- 1" leaves the last row/column of the correlation
# matrix out of the search below -- presumably a target column; confirm.
size = data_corr.shape[0] - 1

# Set the threshold to select only highly correlated attributes
threshold = 0.5

# List of [correlation, first index, second index] for pairs above threshold
corr_list = []

# Search for the highly correlated pairs
for i in range(size):
    for j in range(i + 1, size):  # upper triangle only, avoids repetition
        value = data_corr.iloc[i, j]
        # Strong positive (but not exactly 1) or strong negative correlation.
        # NOTE(review): the "< 1" bound drops perfectly correlated pairs
        # while -1 is kept -- confirm this asymmetry is intended.
        if (threshold <= value < 1) or (value < 0 and value <= -threshold):
            corr_list.append([value, i, j])  # store correlation and column indices

# Sort to show the strongest correlations (by absolute value) first
s_corr_list = sorted(corr_list, key=lambda x: -abs(x[0]))

# Kept for any later code that reads `cols`.
cols = list(sDf.columns.values)
# i/j are positions in the correlation matrix, so names must come from
# data_corr itself: corr() can drop non-numeric columns, which would shift
# positions relative to sDf.columns.
corr_cols = list(data_corr.columns)

# Print correlations and column names, collecting one record per pair
corr_rows = []
for v, i, j in s_corr_list:
    print("%s and %s = %.2f" % (corr_cols[i], corr_cols[j], v))
    corr_rows.append([corr_cols[i], corr_cols[j], v])

# Build the frame once with a sequential index. Assigning to corrDf.loc[i]
# would overwrite earlier pairs that share the same first column i.
corrDf = pd.DataFrame(corr_rows, columns=['A', 'B', 'corr'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment