Skip to content

Instantly share code, notes, and snippets.

@tommylees112
Created March 19, 2025 14:56
Show Gist options
  • Save tommylees112/3cd536dc72b4fc43f7d3a87e69b269c1 to your computer and use it in GitHub Desktop.
Save tommylees112/3cd536dc72b4fc43f7d3a87e69b269c1 to your computer and use it in GitHub Desktop.
from pathlib import Path
import pandas as pd
from loguru import logger
from rapidfuzz import process
DOWNLOADS_DIR = Path.home() / "Downloads"
def get_closest_match(
left_df: pd.DataFrame, right_df: pd.DataFrame, left_column: str, right_column: str
) -> pd.DataFrame:
"""
Find the best match for each value in the specified column of `left_df`
from the `column` in `right_df` using rapidfuzz for better performance.
"""
right_values = right_df[right_column].tolist()
# Extract only the first two elements (match, score, ~index) from the tuple
results = pd.DataFrame(
left_df[left_column]
.apply(lambda x: process.extractOne(x, right_values)[:2])
.tolist(),
columns=["best_match", "score"],
)
# Add original values
results["original_value"] = left_df[left_column].values
return results[["original_value", "best_match", "score"]]
def get_n_closest_matches(
left_df: pd.DataFrame,
right_df: pd.DataFrame,
left_column: str,
right_column: str,
n: int,
) -> pd.DataFrame:
"""
Find the top `n` closest matches using rapidfuzz for better performance.
n_closest_matches = get_n_closest_matches(
left_df, right_df, right_column="address_ext", left_column="address_ext", n=3
)
logger.info(f"N Closest Matches:\n{n_closest_matches}")
"""
right_values = right_df[right_column].tolist()
# Process all matches at once using apply
results = pd.DataFrame(
{
"original_value": left_df[left_column],
"matches_data": left_df[left_column].apply(
lambda x: process.extract(x, right_values, limit=n)
),
}
)
# Extract matches and scores
results["matches"] = results["matches_data"].apply(
lambda x: [match[0] for match in x]
)
results["scores"] = results["matches_data"].apply(
lambda x: [match[1] for match in x]
)
return results[["original_value", "matches", "scores"]]
if __name__ == "__main__":
right_df = pd.read_csv(DOWNLOADS_DIR / "certificates_address_data.csv")
left_df = pd.read_csv(DOWNLOADS_DIR / "pricePaid_address_data.csv")
# add a test string to both dataframes to test the fuzzy matching
test_address = "123 TEST STREET, LONDON"
right_df.loc[len(right_df)] = {"address_ext": test_address}
left_df.loc[len(left_df)] = {"address_ext": test_address}
logger.info(f"Left DataFrame:\n{left_df.head()}")
logger.info(f"Right DataFrame:\n{right_df.head()}")
# Example usage with the `address_ext` column
closest_matches = get_closest_match(
left_df, right_df, right_column="address_ext", left_column="address_ext"
)
logger.info(f"Closest Matches:\n{closest_matches}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment