Skip to content

Instantly share code, notes, and snippets.

@yatharthranjan
Last active September 19, 2021 11:25
Show Gist options
  • Save yatharthranjan/d63d8481337a0df5a168c7b45500a7ed to your computer and use it in GitHub Desktop.
Save yatharthranjan/d63d8481337a0df5a168c7b45500a7ed to your computer and use it in GitHub Desktop.
Get the top correlated pairs among a very large number of variables in a pandas DataFrame
def get_top_correlations_blog(df, threshold=0.4):
    """
    Print and return the most strongly correlated pairs of columns in ``df``.

    A markdown-style table is written to stdout (one row per reported pair)
    and the same pairs are returned as a DataFrame, strongest first.

    df: the dataframe to get correlations from. Only numeric and boolean
        columns are considered; other columns are ignored so that text
        columns do not make ``DataFrame.corr`` raise on recent pandas.
    threshold: the maximum and minimum value to include for correlations.
        For example, if this is 0.4, only pairs having a correlation
        coefficient greater than 0.4 or less than -0.4 will be included in
        the results.

    Returns a DataFrame indexed by ('Variable 1', 'Variable 2') with a single
    'Correlation Coefficient' column holding the signed coefficient. Each
    unordered pair appears once, written with the column that comes first in
    ``df`` as 'Variable 1'. The frame is empty (but has the same index names
    and column) when no pair exceeds the threshold.
    """
    # Imported locally so the function is self-contained; no module-level
    # pandas import is visible alongside this definition.
    import pandas as pd

    columns = ['Variable 1', 'Variable 2', 'Correlation Coefficient']

    orig_corr = df.select_dtypes(include=['number', 'bool']).corr()
    # Position of every variable in the correlation matrix. Keeping only the
    # pairs with position[var1] < position[var2] drops self-correlations and
    # mirrored duplicates in one comparison.
    position = {name: pos for pos, name in enumerate(orig_corr.columns)}
    # Rank by absolute strength; a stable sort keeps ties in a reproducible
    # order. NaN coefficients are placed last by sort_values.
    ranked = orig_corr.abs().unstack().sort_values(ascending=False, kind='mergesort')

    print("| Variable 1 | Variable 2 | Correlation Coefficient |")
    print("|------------------|------------------|----------------------------|")
    rows = []
    for (var1, var2), strength in ranked.items():
        if not strength > threshold:
            # Sorted descending with NaN last: nothing further can qualify.
            break
        if position[var1] >= position[var2]:
            continue
        # Report the signed coefficient, not the absolute value used to rank.
        coefficient = orig_corr.loc[var1, var2]
        print(f'| {var1} | {var2} | {coefficient} |')
        rows.append((var1, var2, coefficient))

    # Building the frame once avoids row-by-row growth and guarantees the
    # expected columns exist even when no pair qualified.
    return pd.DataFrame(rows, columns=columns).set_index(columns[:2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment