Last active
May 25, 2017 16:35
-
-
Save MikeDacre/bce7c7a4fd0d9457db47a3c199c2a45a to your computer and use it in GitHub Desktop.
A python 3 function to make a pandas DataFrame unique on multiple columns while preserving other columns and filtering by multiple columns
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def run_filter(df, group_cols: list, merge_col: str, sort_col: str, cutoff: float = 0.6): | |
"""Make a DataFrame unique on group_cols based on boolean merge_col and float sort_col. | |
Picks most frequent result of boolean merge_col if the result represents a portion of the | |
total greater than cutoff. Then sorts the results ascending by sort_col and returns the | |
first row only. | |
If filter fails (i.e. top hit is less than cutoff of total) then the entire group is dropped. | |
Parameters | |
---------- | |
df: pandas.core.frame.DataFrame | |
group_cols: list_of_str | |
A list of the column names to group by | |
merge_col: str | |
The name of the boolean column to filter | |
sort_col: str | |
The name of the column to sort by when picking top hit | |
Returns | |
------- | |
df: pandas.core.frame.DataFrame | |
DataFrame with the same columns as the starting df, but with rows unique on group_cols. | |
""" | |
def filter_group(d): | |
"""Use with groupby().apply(), filter DataFrame by boolean column and make unique. | |
Will sort ascending by sort_col and return the first result. | |
""" | |
match_info = d[merge_col].value_counts().to_dict() | |
# Pick best match result if it represents more than 60% | |
if len(match_info) == 1: | |
match_result = list(match_info.keys())[0] | |
else: | |
if (max(match_info[True], match_info[False])/ | |
(match_info[True]+match_info[False])) < cutoff: | |
match_result = None | |
else: | |
if match_info[True] > match_info[False]: | |
match_result = True | |
else: | |
match_result = False | |
if match_result is None: | |
# Drop all somehow, e.g. | |
return pd.DataFrame() | |
new_d = d[d[merge_col] == match_result] | |
new_d.sort_values(sort_col, ascending=True, inplace=True) | |
# Return the one row with the lowest p as a dataframe | |
return new_d.iloc[0:1] | |
cols = df.columns | |
res = df.groupby(group_cols).apply(filter_group) | |
return res.reset_index(level=0, drop=True).reset_index(level=0, drop=True)[cols] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment