dylanjf · August 29, 2015 14:06
diff --git a/gistfile1.txt b/gistfile1.txt
 import numpy as np


 class CorrMatrix():
     """
     correlation based feature selection for a 2-D data set.

     the pairwise correlation matrix of the feature columns is built with
     create_correlation_matrix. every feature whose strongest correlation
     with another feature exceeds the threshold is paired with that partner,
     and of the two the feature with the highest average correlation is
     marked for removal.

     select_features prints the column indices marked for removal and
     returns the column indices that are kept.
     """
     def __init__(self, correlation_threshold=.9):
         # correlations strictly above this value count as "highly correlated"
         self.correlation_threshold = correlation_threshold

     def select_features(self, feature_data_set):
         """
         return the sorted column indices of the features to keep.

         feature_data_set: 2-D data set with one feature per column, in
         whatever form create_correlation_matrix accepts.

         NaN correlations (np.corrcoef yields them for constant columns) are
         ignored. only positive correlations are compared against the
         threshold, so strongly negative correlations are not flagged.
         """
         corr_mat = create_correlation_matrix(feature_data_set)
         print("Performing correlation based feature selection")

         n_features = corr_mat.shape[1]
         # a feature must never be flagged for correlating with itself
         diag = list(range(n_features))
         corr_mat[diag, diag] = 0

         def mean_correlation(col_idx):
             # average of the non-NaN correlations in one column
             column = corr_mat[:, col_idx]
             return float(column[~np.isnan(column)].mean())

         most_correlated = set()
         for i in range(n_features):
             column = corr_mat[:, i]
             valid = ~np.isnan(column)
             if not valid.any() or column[valid].max() <= self.correlation_threshold:
                 continue
             # nanargmax skips NaN entries; a plain argmax would return the
             # position of the first NaN instead of the strongest partner
             j = int(np.nanargmax(column))
             # ties on the mean are broken by index so that both members of a
             # mutually correlated pair agree on which one to drop; otherwise
             # each would drop itself and the pair would vanish entirely
             if (mean_correlation(i), i) >= (mean_correlation(j), j):
                 most_correlated.add(i)
             else:
                 most_correlated.add(j)

         corr_list = sorted(most_correlated)
         print("Highly Correlated Variables (consider removing): %s" % (corr_list,))
         non_correlated = [idx for idx in range(n_features) if idx not in most_correlated]
         return non_correlated


 #helper function
 def create_correlation_matrix(array):
     """
     creates the correlation matrix of the columns of a 2-D data set
     iteratively (one pair of columns per np.corrcoef call) as to be
     memory efficient.

     array: 2-D array-like with one feature per column; it is passed through
     np.asarray, so nested lists are accepted as well as ndarrays.

     returns a square, symmetric float matrix with one row/column per
     feature. the diagonal is 0.0 rather than 1.0, so a feature never
     appears correlated with itself. entries are NaN wherever np.corrcoef
     returns NaN (e.g. for a constant column).
     """
     data = np.asarray(array)
     n_features = data.shape[1]
     corr_matrix = np.zeros((n_features, n_features))
     for i in range(n_features):
         # only the upper triangle is computed; the matrix is symmetric
         for j in range(i + 1, n_features):
             coefficient = np.corrcoef(data[:, i], data[:, j])[0, 1]
             corr_matrix[i, j] = coefficient
             corr_matrix[j, i] = coefficient
     return corr_matrix
	import numpy as np


	class CorrMatrix():
	    """
	    correlation based feature selection for a 2-D data set.

	    the pairwise correlation matrix of the feature columns is built with
	    create_correlation_matrix. every feature whose strongest correlation
	    with another feature exceeds the threshold is paired with that partner,
	    and of the two the feature with the highest average correlation is
	    marked for removal.

	    select_features prints the column indices marked for removal and
	    returns the column indices that are kept.
	    """
	    def __init__(self, correlation_threshold=.9):
	        # correlations strictly above this value count as "highly correlated"
	        self.correlation_threshold = correlation_threshold

	    def select_features(self, feature_data_set):
	        """
	        return the sorted column indices of the features to keep.

	        feature_data_set: 2-D data set with one feature per column, in
	        whatever form create_correlation_matrix accepts.

	        NaN correlations (np.corrcoef yields them for constant columns) are
	        ignored. only positive correlations are compared against the
	        threshold, so strongly negative correlations are not flagged.
	        """
	        corr_mat = create_correlation_matrix(feature_data_set)
	        print("Performing correlation based feature selection")

	        n_features = corr_mat.shape[1]
	        # a feature must never be flagged for correlating with itself
	        diag = list(range(n_features))
	        corr_mat[diag, diag] = 0

	        def mean_correlation(col_idx):
	            # average of the non-NaN correlations in one column
	            column = corr_mat[:, col_idx]
	            return float(column[~np.isnan(column)].mean())

	        most_correlated = set()
	        for i in range(n_features):
	            column = corr_mat[:, i]
	            valid = ~np.isnan(column)
	            if not valid.any() or column[valid].max() <= self.correlation_threshold:
	                continue
	            # nanargmax skips NaN entries; a plain argmax would return the
	            # position of the first NaN instead of the strongest partner
	            j = int(np.nanargmax(column))
	            # ties on the mean are broken by index so that both members of a
	            # mutually correlated pair agree on which one to drop; otherwise
	            # each would drop itself and the pair would vanish entirely
	            if (mean_correlation(i), i) >= (mean_correlation(j), j):
	                most_correlated.add(i)
	            else:
	                most_correlated.add(j)

	        corr_list = sorted(most_correlated)
	        print("Highly Correlated Variables (consider removing): %s" % (corr_list,))
	        non_correlated = [idx for idx in range(n_features) if idx not in most_correlated]
	        return non_correlated


	#helper function
	def create_correlation_matrix(array):
	    """
	    creates the correlation matrix of the columns of a 2-D data set
	    iteratively (one pair of columns per np.corrcoef call) as to be
	    memory efficient.

	    array: 2-D array-like with one feature per column; it is passed through
	    np.asarray, so nested lists are accepted as well as ndarrays.

	    returns a square, symmetric float matrix with one row/column per
	    feature. the diagonal is 0.0 rather than 1.0, so a feature never
	    appears correlated with itself. entries are NaN wherever np.corrcoef
	    returns NaN (e.g. for a constant column).
	    """
	    data = np.asarray(array)
	    n_features = data.shape[1]
	    corr_matrix = np.zeros((n_features, n_features))
	    for i in range(n_features):
	        # only the upper triangle is computed; the matrix is symmetric
	        for j in range(i + 1, n_features):
	            coefficient = np.corrcoef(data[:, i], data[:, j])[0, 1]
	            corr_matrix[i, j] = coefficient
	            corr_matrix[j, i] = coefficient
	    return corr_matrix