cameres · November 22, 2022 14:19 · juhotuho10 · Jun 23, 2022
diff --git a/compute_correlation_matrix.py b/compute_correlation_matrix.py
 from pyspark.mllib.stat import Statistics
 import pandas as pd

 # result can be used w/ seaborn's heatmap
 def compute_correlation_matrix(df, method='pearson'):
     """Build a labelled correlation matrix covering every column of ``df``.

     Based on the approach described at
     https://forums.databricks.com/questions/3092/how-to-calculate-correlation-matrix-with-all-colum.html

     Args:
         df: Object exposing ``.rdd`` and ``.columns`` (presumably a Spark
             DataFrame whose columns are all numeric -- confirm with callers).
         method: Correlation method name, forwarded unchanged to
             ``Statistics.corr``. Defaults to ``'pearson'``.

     Returns:
         A pandas DataFrame wrapping the ``Statistics.corr`` result, with
         ``df.columns`` used as both the column labels and the row index.
     """
     # Full slice of each row, so every field of the record is passed along
     # (presumably as a plain sequence rather than a Row object).
     row_values = df.rdd.map(lambda record: record[0:])
     correlations = Statistics.corr(row_values, method=method)
     return pd.DataFrame(correlations, columns=df.columns, index=df.columns)
	from pyspark.mllib.stat import Statistics
	import pandas as pd

	# result can be used w/ seaborn's heatmap
	def compute_correlation_matrix(df, method='pearson'):
		"""Compute the correlation matrix across all columns of ``df``.

		Wrapper around the recipe at
		https://forums.databricks.com/questions/3092/how-to-calculate-correlation-matrix-with-all-colum.html

		Args:
			df: Object exposing ``.rdd`` and ``.columns`` (presumably a Spark
				DataFrame with numeric columns only -- confirm with callers).
			method: Correlation method name, passed through unchanged to
				``Statistics.corr``. Defaults to ``'pearson'``.

		Returns:
			A pandas DataFrame built from the ``Statistics.corr`` result, with
			``df.columns`` as both the column labels and the row index.
		"""
		# Take the full slice of each row so every field is handed on.
		df_rdd = df.rdd.map(lambda row: row[0:])
		corr_mat = Statistics.corr(df_rdd, method=method)
		corr_mat_df = pd.DataFrame(
			corr_mat,
			columns=df.columns,
			index=df.columns,
		)
		return corr_mat_df