Last active
June 3, 2016 06:55
-
-
Save jlln/fe8259c56636a59b62c1c102c5ff6141 to your computer and use it in GitHub Desktop.
Pandas/python function for determining the fractions of examples falling into different groups, taking into account other grouping criteria.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def groupCountFractionals(dataframe, target, outer):
    """Return the share of each ``target`` value within every ``outer`` group.

    For every combination of the ``outer`` columns the rows are counted
    (``OuterCount``), then counted again per value of ``target`` inside that
    group (``InnerCount``); ``Fraction`` is ``InnerCount / OuterCount``.
    Combinations of the distinct ``outer`` and ``target`` values that never
    occur in the data are appended with ``InnerCount`` and ``Fraction`` of 0,
    so absent categories stay visible in the result.

    Be mindful of group sizes: a fraction computed from a small
    ``OuterCount`` is coarse and easily dominated by a handful of rows.

    Args:
        dataframe: a pandas DataFrame. It is not modified.
        target: name of the column whose value distribution is wanted.
        outer: list of column names that define the conditioning groups.

    Returns:
        A DataFrame with the ``outer`` columns, ``target``, ``InnerCount``,
        ``OuterCount`` and ``Fraction``. Observed combinations come first,
        followed by the zero-count combinations. ``outer`` combinations that
        do not exist in the data at all are reported with ``OuterCount`` 0.
        Rows whose grouping keys are missing (NaN) are ignored.
    """
    outer = list(outer)
    group_columns = outer + [target]
    count_columns = ["InnerCount", "OuterCount", "Fraction"]

    # size() counts rows per group directly, so no helper "Count" column has
    # to be written into (and leaked through) the caller's dataframe.
    outer_counts = dataframe.groupby(outer).size().reset_index(name="OuterCount")
    inner_counts = (
        dataframe.groupby(group_columns).size().reset_index(name="InnerCount")
    )

    observed = pandas.merge(inner_counts, outer_counts, on=outer, how="right")
    observed["Fraction"] = observed["InnerCount"] / observed["OuterCount"]

    # Every combination of the distinct key values. NaN keys are left out
    # because groupby drops them as well.
    levels = [
        list(dataframe[column].dropna().unique()) for column in group_columns
    ]
    every_combination = pandas.DataFrame(
        list(itertools.product(*levels)), columns=group_columns
    )

    # A left merge keeps the rows of `every_combination` in order, and the
    # filter is applied to the merged frame itself. Masking a different frame
    # with the merge indicator would rely on both frames sharing a row order,
    # which an outer merge (it sorts its keys) does not guarantee.
    presence = pandas.merge(
        every_combination,
        inner_counts[group_columns],
        on=group_columns,
        how="left",
        indicator=True,
    )
    absent = presence.loc[presence["_merge"] == "left_only", group_columns]

    frames = [observed]
    if not absent.empty:
        zero_rows = pandas.merge(absent, outer_counts, on=outer, how="left")
        zero_rows["InnerCount"] = 0
        zero_rows["Fraction"] = 0.0
        frames.append(zero_rows)

    result = pandas.concat(frames, ignore_index=True)
    # Only the count columns are filled: an outer group that never occurs has
    # no OuterCount. The key columns must keep their own values.
    result[count_columns] = result[count_columns].fillna(0)
    return result
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment