Created
March 19, 2025 14:56
-
-
Save tommylees112/3cd536dc72b4fc43f7d3a87e69b269c1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
import pandas as pd | |
from loguru import logger | |
from rapidfuzz import process | |
DOWNLOADS_DIR = Path.home() / "Downloads" | |
def get_closest_match( | |
left_df: pd.DataFrame, right_df: pd.DataFrame, left_column: str, right_column: str | |
) -> pd.DataFrame: | |
""" | |
Find the best match for each value in the specified column of `left_df` | |
from the `column` in `right_df` using rapidfuzz for better performance. | |
""" | |
right_values = right_df[right_column].tolist() | |
# Extract only the first two elements (match, score, ~index) from the tuple | |
results = pd.DataFrame( | |
left_df[left_column] | |
.apply(lambda x: process.extractOne(x, right_values)[:2]) | |
.tolist(), | |
columns=["best_match", "score"], | |
) | |
# Add original values | |
results["original_value"] = left_df[left_column].values | |
return results[["original_value", "best_match", "score"]] | |
def get_n_closest_matches( | |
left_df: pd.DataFrame, | |
right_df: pd.DataFrame, | |
left_column: str, | |
right_column: str, | |
n: int, | |
) -> pd.DataFrame: | |
""" | |
Find the top `n` closest matches using rapidfuzz for better performance. | |
n_closest_matches = get_n_closest_matches( | |
left_df, right_df, right_column="address_ext", left_column="address_ext", n=3 | |
) | |
logger.info(f"N Closest Matches:\n{n_closest_matches}") | |
""" | |
right_values = right_df[right_column].tolist() | |
# Process all matches at once using apply | |
results = pd.DataFrame( | |
{ | |
"original_value": left_df[left_column], | |
"matches_data": left_df[left_column].apply( | |
lambda x: process.extract(x, right_values, limit=n) | |
), | |
} | |
) | |
# Extract matches and scores | |
results["matches"] = results["matches_data"].apply( | |
lambda x: [match[0] for match in x] | |
) | |
results["scores"] = results["matches_data"].apply( | |
lambda x: [match[1] for match in x] | |
) | |
return results[["original_value", "matches", "scores"]] | |
if __name__ == "__main__": | |
right_df = pd.read_csv(DOWNLOADS_DIR / "certificates_address_data.csv") | |
left_df = pd.read_csv(DOWNLOADS_DIR / "pricePaid_address_data.csv") | |
# add a test string to both dataframes to test the fuzzy matching | |
test_address = "123 TEST STREET, LONDON" | |
right_df.loc[len(right_df)] = {"address_ext": test_address} | |
left_df.loc[len(left_df)] = {"address_ext": test_address} | |
logger.info(f"Left DataFrame:\n{left_df.head()}") | |
logger.info(f"Right DataFrame:\n{right_df.head()}") | |
# Example usage with the `address_ext` column | |
closest_matches = get_closest_match( | |
left_df, right_df, right_column="address_ext", left_column="address_ext" | |
) | |
logger.info(f"Closest Matches:\n{closest_matches}") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment