Last active
September 19, 2021 11:25
-
-
Save yatharthranjan/d63d8481337a0df5a168c7b45500a7ed to your computer and use it in GitHub Desktop.
Get top correlation pair in a very large number of variables in Pandas Dataframe
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_top_correlations_blog(df, threshold=0.4):
    """Print and return the most strongly correlated column pairs of ``df``.

    The correlation matrix comes from ``df.corr()``. A pair is kept when the
    absolute value of its coefficient is strictly greater than ``threshold``;
    self-correlations and the mirrored duplicate of each pair are dropped.
    Pairs are reported from strongest to weakest absolute correlation, and
    each one is also printed as a row of a Markdown table.

    Args:
        df: The dataframe to get correlations from.
        threshold: Cut-off on the absolute correlation coefficient. For
            example, with 0.4 only pairs having a coefficient greater than
            0.4 or less than -0.4 are included in the results.

    Returns:
        A DataFrame indexed by (``Variable 1``, ``Variable 2``) with a single
        ``Correlation Coefficient`` column holding the signed coefficient.
        The frame is empty (but keeps the same index names and column) when
        no pair passes the threshold.
    """
    columns = ['Variable 1', 'Variable 2', 'Correlation Coefficient']

    orig_corr = df.corr()
    strengths = orig_corr.abs().unstack()
    # Filter before sorting: on a wide frame most of the n*n entries fall
    # below the threshold, so there is no point ordering them. NaN
    # coefficients compare False and are dropped here as well.
    strengths = strengths[strengths > threshold]
    # A stable sort keeps equally strong entries in their original order, so
    # the orientation reported for each pair is reproducible between runs.
    strengths = strengths.sort_values(ascending=False, kind='mergesort')

    print("| Variable 1 | Variable 2 | Correlation Coefficient |")
    print("|------------------|------------------|----------------------------|")

    seen = set()
    rows = []
    # ``Series.items`` replaces ``iteritems``, which was removed in pandas 2.0.
    for pair, _ in strengths.items():
        var_1, var_2 = pair
        # Exclude self-correlations and pairs already reported in either order.
        if var_1 == var_2 or pair in seen or (var_2, var_1) in seen:
            continue
        seen.add(pair)
        # Report the signed coefficient, not the absolute value used to rank.
        coefficient = orig_corr.loc[var_1, var_2]
        print(f'| {var_1} | {var_2} | {coefficient} |')
        rows.append((var_1, var_2, coefficient))

    # Building the frame once avoids per-row reallocation, and naming the
    # columns up front keeps ``set_index`` valid when ``rows`` is empty.
    return pd.DataFrame(rows, columns=columns).set_index(columns[:2])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment