Last active
August 10, 2022 14:30
-
-
Save ivsanro1/c1d3939027d8e3d278b56290d3817c29 to your computer and use it in GitHub Desktop.
Optimally merge dataframes with cost function (e.g. fuzzy merge)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from thefuzz import fuzz | |
| from scipy.optimize import linear_sum_assignment | |
| def df_merge_fuzzy_optimal_with_cost( | |
| df_left: pd.DataFrame, | |
| df_right: pd.DataFrame, | |
| left_on: str, | |
| right_on: str, | |
| fn_cost: Optional[Callable] = lambda x, y: 100 - fuzz.ratio(x, y) | |
| ) -> pd.DataFrame: | |
| """ | |
| Optimal bipartite graph matching where the nodes of each graph are the | |
| rows of `df_left` and `df_right` and the edges are the values given by | |
| `fn_cost` of the `df_left[left_on]` values and the `df_right[right_on]` | |
| values. This problem is also known as the Assignment problem. | |
| If len(df_left) != len(df_right), then it is an umbalanced assignment | |
| problem. | |
| For now, this function solves the umbalanced assignment by letting | |
| the edges from the bigger match with more than one edge of the | |
| smaller graph (pigeonhole principle). | |
| Currently, a transformation to a balanced assignment problem is not implemented, | |
| but it can be easily implemented by the naive reduction or doubling technique | |
| (see https://en.wikipedia.org/wiki/Assignment_problem#Unbalanced_assignment) | |
| """ | |
| X = df_left[left_on].values | |
| Y = df_right[right_on].values | |
| M = np.zeros(shape=(len(X), len(Y))) | |
| for i in range(len(X)): | |
| for j in range(len(Y)): | |
| v = fn_cost(X[i], Y[j]) | |
| M[i][j] = v | |
| corresp = linear_sum_assignment(M) | |
| corresp_x_idx_y_idx = [[corresp[0][idx], corresp[1][idx]] for idx in range(len(corresp[0]))] | |
| corresp_x_idx_y_idx_cost = [(x_idx, y_idx, M[x_idx][y_idx]) for x_idx, y_idx in corresp_x_idx_y_idx] | |
| df_corresp = pd.DataFrame(corresp_x_idx_y_idx_cost).rename(columns={ | |
| 0: 'idx_left', | |
| 1: 'idx_right', | |
| 2: 'cost', | |
| }) | |
| df_merge = pd.merge( | |
| left=pd.merge( | |
| left=df_left.reset_index(drop=True).reset_index(), | |
| right=df_corresp, | |
| left_on='index', | |
| right_on='idx_left', | |
| how='outer' | |
| ), | |
| right=df_right.reset_index(drop=True).reset_index(), | |
| left_on='idx_right', | |
| right_on='index', | |
| how='outer' | |
| ) | |
| return df_merge |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment