Skip to content

Instantly share code, notes, and snippets.

@aeturrell
Created January 27, 2022 13:31
Show Gist options
  • Select an option

  • Save aeturrell/369b22a1ae015b6b29172fe4c70f7f2c to your computer and use it in GitHub Desktop.

Select an option

Save aeturrell/369b22a1ae015b6b29172fe4c70f7f2c to your computer and use it in GitHub Desktop.
Matching two dataframes of firm names using sparse_dot_topn and sklearn
import os
import glob
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sparse_dot_topn import awesome_cossim_topn
import string
from nltk.corpus import stopwords
def clean_employer_names(df, col_name):
"""
Lower case
Replace punctuation with whitespace
Max one contiguous whitespace
Keep only unique names
return df with cleaned column
"""
xf = df[col_name].str.lower()
# Replace remaining punctuation with whitespace
xf = xf.str.translate(
str.maketrans(string.punctuation, " " * len(string.punctuation))
)
xf = xf.str.replace(r"\s\s+", " ", regex=True)
# # Stop words:
stop_words = ["limited", "the", "ltd", "plc"]
pat = r"\b(?:{})\b".format("|".join(stop_words))
xf = xf.str.replace(pat, "", regex=True)
xf = xf.str.replace(r"\s+", " ", regex=True)
xf = xf.str.rstrip()
xf = xf.str.lstrip()
return xf
def match_firm_names(
prime,
secon=None,
p_match_col="prime_name",
s_match_col="secon_name",
):
"""
This script provides an arbitrary matching capability. It is asymmetric in the
sense that one of the input datasets, prime, will be used to define the
vector space.
It takes in pandas dataframes, prime and secon, matches them, and returns
an output data frame.
"""
def clean_names(df, col):
df.loc[:, col] = df[col].astype(str)
# Note that you will need to write this function yourself
df.loc[:, col + "_cln"] = clean_employer_names(df, col)
df = df.drop_duplicates(subset=col + "_cln", keep="first")
# print("Cleaning on " + col + f" complete; {len(df)} records")
return df
if type(prime) != pd.DataFrame:
raise ValueError("prime should be a dataframe.")
prime = clean_names(prime, p_match_col)
secon_flag = True
if secon is None:
secon = prime
s_match_col = p_match_col
secon_flag = False
prefix = "secon_"
secon = secon.rename(columns=dict(zip([x for x in secon.columns], [prefix + x for x in secon.columns])))
s_match_col = prefix + p_match_col
else:
if type(secon) != pd.DataFrame:
raise ValueError("secon should be a dataframe.")
secon = clean_names(secon, s_match_col)
# prime, secon = exact_matches(prime, secon, p_match_col,
# s_match_col, no_dupes)
secon.loc[:, s_match_col + "_cln"] = secon[s_match_col + "_cln"].astype("string")
secon = secon.dropna(subset=[s_match_col + "_cln"])
# Prep for matching non-exacts
prime.loc[:, p_match_col + "_cln"] = prime[p_match_col + "_cln"].astype("string")
prime = prime.dropna(subset=[p_match_col + "_cln"])
# ----------------------
# Matching
# ----------------------
# Method - use char level n-grams
vectorizer = TfidfVectorizer(
analyzer="char_wb", ngram_range=(1, 4), max_features=30000, encoding="utf-8"
)
# Create the tf-idf terms based on characters using the prime
tfidf_prime = vectorizer.fit_transform(prime[p_match_col + "_cln"])
# Express the second dataframe in vector space of first
tfidf_secon = vectorizer.transform(secon[s_match_col + "_cln"])
num_matches = 2
threshold = 0
n_jobs = 4 # Number of processors to use
mat_of_scores = awesome_cossim_topn(tfidf_prime, tfidf_secon.T,
num_matches, threshold,
use_threads=True, n_jobs=n_jobs)
if(not secon_flag):
# If secon and prime are same datasets, fill diagonal with zero as this is
# the trivial match
mat_of_scores.setdiag(0)
max_indexes = np.squeeze(np.asarray(np.argmax(mat_of_scores, 1)))
top_scores = np.max(mat_of_scores, 1) # vector of size secon entries
matches = pd.concat(
[
prime.reset_index(drop=True),
secon.iloc[max_indexes].reset_index(drop=True),
pd.DataFrame(top_scores.toarray()),
],
axis=1,
join="inner",
)
matches = matches.reset_index(drop=True).rename(columns={0: "match_score"})
return matches
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment