Skip to content

Instantly share code, notes, and snippets.

@audhiaprilliant
Last active February 12, 2021 12:51
Show Gist options
  • Select an option

  • Save audhiaprilliant/d7e1d4196ae38eb85f12a763e83aa07b to your computer and use it in GitHub Desktop.

Select an option

Save audhiaprilliant/d7e1d4196ae38eb85f12a763e83aa07b to your computer and use it in GitHub Desktop.
Fuzzy String Matching
# Import module for data manipulation
import pandas as pd
# Import module for linear algebra
import numpy as np
# Import module for Fuzzy string matching
from fuzzywuzzy import fuzz, process
# Fuzzy-match messy category labels against a clean reference list
def stringMatching(
    df: pd.DataFrame,
    column: str,
    clean: pd.Series,
    mapping_df: pd.DataFrame,
    col: str
):
    """Fuzzy-match each value of ``df[column]`` to its closest entry in ``clean``.

    Two columns are written to ``df`` in place: ``col`` receives the best
    matching entry of ``clean`` and ``'Ratio'`` receives the score reported
    by ``process.extractOne`` for that match. Missing values in
    ``df[column]`` are not matched and stay missing in both output columns.

    Args:
        df: Frame holding the messy values; it is modified in place.
        column: Name of the column in ``df`` whose values are matched.
        clean: Reference values to match against; duplicates are dropped
            before matching.
        mapping_df: Unused. Kept only so existing callers keep working.
        col: Name of the output column that receives the matched value.

    Returns:
        The same ``df`` object, with the ``col`` and ``'Ratio'`` columns set.
    """
    candidates = clean.drop_duplicates().reset_index(drop=True)

    # Keep the original (un-stringified) values as lookup keys. Keying the
    # lookup by ``str(value)`` while mapping the raw column would silently
    # produce NaN for every non-string value (e.g. integer codes).
    messy_values = df[column].dropna().drop_duplicates()

    matched = {}
    scores = {}
    for value in messy_values:
        best = process.extractOne(str(value).lower(), candidates)
        if best is None:
            # extractOne yields None when there is nothing to match against
            # (e.g. ``clean`` is empty); leave the value unmapped so it
            # shows up as NaN instead of raising TypeError.
            continue
        matched[value] = best[0]
        scores[value] = best[1]

    # Build both result columns before assigning, so the source column is
    # still intact for the second lookup even when ``col == column``.
    matched_col = df[column].map(matched)
    score_col = df[column].map(scores)
    df[col] = matched_col
    df['Ratio'] = score_col
    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment