ant358 · February 24, 2019 18:34 · ant358 · Feb 24, 2019
diff --git a/anova_machine.py b/anova_machine.py
 def anova_machine(Cat_col, target_col, df):
     """Return the one-way ANOVA p-value for ``target_col`` split by ``Cat_col``.

     Rows of ``df`` are grouped by their value in ``Cat_col`` and an F-test
     compares the group means of ``target_col``.  Rows where either column is
     missing are ignored.  The index of ``df`` does not need to be unique.

     Args:
         Cat_col: Label of the categorical (grouping) column in ``df``.
         target_col: Label of the numeric column whose group means are tested.
         df: DataFrame holding both columns.

     Returns:
         The upper-tail probability of the F statistic with ``(k - 1, N - k)``
         degrees of freedom, where ``k`` is the number of non-empty groups and
         ``N`` the number of observations.  NaN is returned when the test is
         undefined (fewer than two groups, or no within-group degrees of
         freedom).  No significance threshold is applied here; the caller
         compares the p-value against its own cut-off.
     """
     # Only rows that carry both a group label and a target value count as
     # observations; everything below is based on this filtered frame.
     observed = df[[Cat_col, target_col]].dropna()
     grouped = observed.groupby(Cat_col, observed=True)[target_col]

     group_sizes = grouped.count()
     n_groups = len(group_sizes)
     n_obs = int(group_sizes.sum())

     df_between = n_groups - 1
     # Within-group degrees of freedom are N - k (not N - (k - 1)).
     df_within = n_obs - n_groups
     if df_between < 1 or df_within < 1:
         return float("nan")

     group_sums = grouped.sum()
     grand_sum = group_sums.sum()

     # SS_between = sum_i(T_i**2 / n_i) - T**2 / N
     ss_between = (group_sums ** 2 / group_sizes).sum() - grand_sum ** 2 / n_obs

     # SS_within = squared deviation of every observation from its own group
     # mean.  Zero and negative observations are legitimate data and must be
     # included; missing values were already removed above.
     deviations = observed[target_col] - grouped.transform("mean")
     ss_within = (deviations ** 2).sum()

     ms_between = ss_between / df_between
     ms_within = ss_within / df_within
     f_stat = ms_between / ms_within

     return stats.f.sf(f_stat, df_between, df_within)

 def anova_machine_part2(df, target, p_thres=0.05):
     """Rank the columns of ``df`` by ANOVA significance against ``target``.

     Every column other than ``target`` is passed to ``anova_machine`` as the
     grouping column.  Columns whose p-value is at most ``p_thres`` are kept.

     Args:
         df: DataFrame containing the candidate columns and the target column.
         target: Label of the target column in ``df``.
         p_thres: Largest p-value still considered significant (default 0.05).

     Returns:
         A Series named ``0``, indexed by column label, holding the p-values
         of the significant columns sorted in ascending order.  The Series is
         empty when no column passes the threshold.

     Side effects:
         The result is also stored in the module-level name ``sig_obj``.
     """
     # The result has always been published through this global as well as
     # returned, so the assignment is kept for existing callers.
     global sig_obj

     significant = {}
     for col in df:
         if col == target:
             continue
         p = anova_machine(col, target, df)
         # A NaN p-value (undefined test) compares False and is skipped.
         if p <= p_thres:
             significant[col] = p

     # Building the Series directly also works when nothing was significant.
     sig_obj = pd.Series(significant, dtype=float, name=0).sort_values(ascending=True)

     return sig_obj
	def anova_machine(Cat_col, target_col, df):
		"""Return the one-way ANOVA p-value for ``target_col`` split by ``Cat_col``.

		Rows of ``df`` are grouped by their value in ``Cat_col`` and an F-test
		compares the group means of ``target_col``.  Rows where either column is
		missing are ignored.  The index of ``df`` does not need to be unique.

		Args:
			Cat_col: Label of the categorical (grouping) column in ``df``.
			target_col: Label of the numeric column whose group means are tested.
			df: DataFrame holding both columns.

		Returns:
			The upper-tail probability of the F statistic with ``(k - 1, N - k)``
			degrees of freedom, where ``k`` is the number of non-empty groups and
			``N`` the number of observations.  NaN is returned when the test is
			undefined (fewer than two groups, or no within-group degrees of
			freedom).  No significance threshold is applied here; the caller
			compares the p-value against its own cut-off.
		"""
		# Only rows that carry both a group label and a target value count as
		# observations; everything below is based on this filtered frame.
		observed = df[[Cat_col, target_col]].dropna()
		grouped = observed.groupby(Cat_col, observed=True)[target_col]

		group_sizes = grouped.count()
		n_groups = len(group_sizes)
		n_obs = int(group_sizes.sum())

		df_between = n_groups - 1
		# Within-group degrees of freedom are N - k (not N - (k - 1)).
		df_within = n_obs - n_groups
		if df_between < 1 or df_within < 1:
			return float("nan")

		group_sums = grouped.sum()
		grand_sum = group_sums.sum()

		# SS_between = sum_i(T_i**2 / n_i) - T**2 / N
		ss_between = (group_sums ** 2 / group_sizes).sum() - grand_sum ** 2 / n_obs

		# SS_within = squared deviation of every observation from its own group
		# mean.  Zero and negative observations are legitimate data and must be
		# included; missing values were already removed above.
		deviations = observed[target_col] - grouped.transform("mean")
		ss_within = (deviations ** 2).sum()

		ms_between = ss_between / df_between
		ms_within = ss_within / df_within
		f_stat = ms_between / ms_within

		return stats.f.sf(f_stat, df_between, df_within)

	def anova_machine_part2(df, target, p_thres=0.05):
		"""Rank the columns of ``df`` by ANOVA significance against ``target``.

		Every column other than ``target`` is passed to ``anova_machine`` as the
		grouping column.  Columns whose p-value is at most ``p_thres`` are kept.

		Args:
			df: DataFrame containing the candidate columns and the target column.
			target: Label of the target column in ``df``.
			p_thres: Largest p-value still considered significant (default 0.05).

		Returns:
			A Series named ``0``, indexed by column label, holding the p-values
			of the significant columns sorted in ascending order.  The Series is
			empty when no column passes the threshold.

		Side effects:
			The result is also stored in the module-level name ``sig_obj``.
		"""
		# The result has always been published through this global as well as
		# returned, so the assignment is kept for existing callers.
		global sig_obj

		significant = {}
		for col in df:
			if col == target:
				continue
			p = anova_machine(col, target, df)
			# A NaN p-value (undefined test) compares False and is skipped.
			if p <= p_thres:
				significant[col] = p

		# Building the Series directly also works when nothing was significant.
		sig_obj = pd.Series(significant, dtype=float, name=0).sort_values(ascending=True)

		return sig_obj