Last active
February 24, 2019 18:34
-
-
Save ant358/7f8335ae3cbe00a97fc840a86f0ab8a3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def anova_machine(Cat_col, target_col, df):
    """Run a one-way ANOVA of a target column grouped by a categorical column.

    The data frame is pivoted so that every distinct value of ``Cat_col``
    becomes its own column holding the matching ``target_col`` values (all
    other cells are NaN).  The between-group and within-group sums of
    squares are then computed and turned into an F statistic.

    Args:
        Cat_col: Name of the column in ``df`` that defines the groups.
        target_col: Name of the column in ``df`` holding the measured values.
        df: Data frame containing both columns.  ``DataFrame.pivot`` is
            called on it with its existing index.

    Returns:
        The p-value ``stats.f.sf(F, df_between, df_within)``.  NaN is
        returned when the test is undefined, i.e. there are fewer than two
        groups or no within-group degrees of freedom.  The caller decides
        which significance threshold to compare it against.
    """
    p_table = df.pivot(columns=Cat_col, values=target_col)
    # A category whose target values are all missing contributes no group.
    p_table = p_table.dropna(axis=1, how="all")

    group_sums = p_table.sum()
    group_means = p_table.mean()
    n_groups = len(p_table.columns)
    # Count real observations only; the pivot pads every column with NaN.
    n_obs = int(p_table.count().sum())

    df_between = n_groups - 1
    df_within = n_obs - n_groups
    if df_between < 1 or df_within < 1:
        return float("nan")

    # sum_i(sum_i * mean_i) equals sum_i(sum_i**2 / n_i).
    grand_sum = group_sums.sum()
    SS_between = (group_sums * group_means).sum() - grand_sum ** 2 / n_obs

    # Squared deviations from each group's own mean; NaN padding is skipped
    # by ``sum`` so zero and negative observations are still included.
    SS_within = ((p_table - group_means) ** 2).sum().sum()

    MS_between = SS_between / df_between
    MS_within = SS_within / df_within
    F = MS_between / MS_within
    pvalue = stats.f.sf(F, df_between, df_within)
    return pvalue
def anova_machine_part2(df, target, p_thres=0.05):
    """Rank the columns of ``df`` by ANOVA significance against ``target``.

    Every column other than ``target`` is passed to ``anova_machine`` as the
    grouping column.  Columns whose p-value is at or below ``p_thres`` are
    kept and returned sorted from most to least significant.

    Args:
        df: Data frame holding the candidate columns and the target column.
        target: Name of the target column in ``df``.
        p_thres: Largest p-value still treated as significant.  Defaults to
            0.05 (95% confidence).

    Returns:
        A pandas Series named ``0``, indexed by column name, holding the
        p-values in ascending order.  It is empty when no column passes
        the threshold.

    Side effects:
        The result is also stored in the module-level global ``sig_obj``.
    """
    # Kept for backward compatibility: the result has always been published
    # through this global as well as returned.
    global sig_obj
    significant = {}
    for col in df:
        if col == target:
            continue
        pvalue = anova_machine(col, target, df)
        if pvalue <= p_thres:
            significant[col] = pvalue
    # Building the Series directly also works when nothing is significant.
    sig_obj = pd.Series(significant, dtype=float, name=0).sort_values(ascending=True)
    return sig_obj
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
ANOVA function. Provide the target variable column y, the main data set and a categorical column.
A pivot table will be produced. Then an ANOVA performed to see if the columns are significantly different from each other.
Currently set for 95% confidence, will update later for higher significance setting.