ivsanro1 · August 10, 2022 14:30
diff --git a/df_merge_fuzzy.py b/df_merge_fuzzy.py
 from thefuzz import fuzz
 from scipy.optimize import linear_sum_assignment
    
 def df_merge_fuzzy_optimal_with_cost(
    df_left: pd.DataFrame,
    df_right: pd.DataFrame,
    left_on: str,
    right_on: str,
    fn_cost: Optional[Callable] = lambda x, y: 100 - fuzz.ratio(x, y)
 ) -> pd.DataFrame:
    """
    Optimal bipartite graph matching where the nodes of each graph are the
    rows of `df_left` and `df_right` and the edges are the values given by
    `fn_cost` of the `df_left[left_on]` values and the `df_right[right_on]`
    values. This problem is also known as the Assignment problem.

    If len(df_left) != len(df_right), then it is an umbalanced assignment
    problem.

    For now, this function solves the umbalanced assignment by letting
    the edges from the bigger match with more than one edge of the
    smaller graph (pigeonhole principle).

    Currently, a transformation to a balanced assignment problem is not implemented,
    but it can be easily implemented by the naive reduction or doubling technique
    (see https://en.wikipedia.org/wiki/Assignment_problem#Unbalanced_assignment)
    """

    X = df_left[left_on].values
    Y = df_right[right_on].values
    M = np.zeros(shape=(len(X), len(Y)))

    for i in range(len(X)):
        for j in range(len(Y)):
            v = fn_cost(X[i], Y[j])
            M[i][j] = v

    corresp = linear_sum_assignment(M)

    corresp_x_idx_y_idx = [[corresp[0][idx], corresp[1][idx]] for idx in range(len(corresp[0]))]
    corresp_x_idx_y_idx_cost = [(x_idx, y_idx, M[x_idx][y_idx]) for x_idx, y_idx in corresp_x_idx_y_idx]

    df_corresp = pd.DataFrame(corresp_x_idx_y_idx_cost).rename(columns={
        0: 'idx_left',
        1: 'idx_right',
        2: 'cost',
    })

    df_merge = pd.merge(
        left=pd.merge(
                left=df_left.reset_index(drop=True).reset_index(),
                right=df_corresp,
                left_on='index',
                right_on='idx_left',
                how='outer'
            ),
        right=df_right.reset_index(drop=True).reset_index(),
        left_on='idx_right',
        right_on='index',
        how='outer'
    )

    return df_merge
	from thefuzz import fuzz
	from scipy.optimize import linear_sum_assignment

	def df_merge_fuzzy_optimal_with_cost(
	df_left: pd.DataFrame,
	df_right: pd.DataFrame,
	left_on: str,
	right_on: str,
	fn_cost: Optional[Callable] = lambda x, y: 100 - fuzz.ratio(x, y)
	) -> pd.DataFrame:
	"""
	Optimal bipartite graph matching where the nodes of each graph are the
	rows of `df_left` and `df_right` and the edges are the values given by
	`fn_cost` of the `df_left[left_on]` values and the `df_right[right_on]`
	values. This problem is also known as the Assignment problem.

	If len(df_left) != len(df_right), then it is an umbalanced assignment
	problem.

	For now, this function solves the umbalanced assignment by letting
	the edges from the bigger match with more than one edge of the
	smaller graph (pigeonhole principle).

	Currently, a transformation to a balanced assignment problem is not implemented,
	but it can be easily implemented by the naive reduction or doubling technique
	(see https://en.wikipedia.org/wiki/Assignment_problem#Unbalanced_assignment)
	"""

	X = df_left[left_on].values
	Y = df_right[right_on].values
	M = np.zeros(shape=(len(X), len(Y)))

	for i in range(len(X)):
	for j in range(len(Y)):
	v = fn_cost(X[i], Y[j])
	M[i][j] = v

	corresp = linear_sum_assignment(M)

	corresp_x_idx_y_idx = [[corresp[0][idx], corresp[1][idx]] for idx in range(len(corresp[0]))]
	corresp_x_idx_y_idx_cost = [(x_idx, y_idx, M[x_idx][y_idx]) for x_idx, y_idx in corresp_x_idx_y_idx]

	df_corresp = pd.DataFrame(corresp_x_idx_y_idx_cost).rename(columns={
	0: 'idx_left',
	1: 'idx_right',
	2: 'cost',
	})

	df_merge = pd.merge(
	left=pd.merge(
	left=df_left.reset_index(drop=True).reset_index(),
	right=df_corresp,
	left_on='index',
	right_on='idx_left',
	how='outer'
	),
	right=df_right.reset_index(drop=True).reset_index(),
	left_on='idx_right',
	right_on='index',
	how='outer'
	)

	return df_merge
No results found