Created
January 27, 2022 13:31
-
-
Save aeturrell/369b22a1ae015b6b29172fe4c70f7f2c to your computer and use it in GitHub Desktop.
Matching two dataframes of firm names using sparse_dot_topn and sklearn
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import glob | |
| import pandas as pd | |
| from tqdm import tqdm | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sparse_dot_topn import awesome_cossim_topn | |
| import string | |
| from nltk.corpus import stopwords | |
def clean_employer_names(df, col_name):
    """Return a cleaned copy of the name column ``col_name`` of ``df``.

    Cleaning steps, in order:
    - lower-case
    - replace every punctuation character with whitespace
    - remove common company stop words ("limited", "the", "ltd", "plc")
      as whole words only
    - collapse runs of whitespace to a single space and strip both ends

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe holding the raw names. ``df`` itself is not modified.
    col_name : str
        Column of ``df`` containing the names to clean.

    Returns
    -------
    pd.Series
        The cleaned names, same index as ``df``.
    """
    cleaned = df[col_name].str.lower()
    # Punctuation -> whitespace in one C-level pass via str.translate.
    cleaned = cleaned.str.translate(
        str.maketrans(string.punctuation, " " * len(string.punctuation))
    )
    # Remove stop words as whole words only: the \b guards stop e.g.
    # "theta" from losing its "the" prefix.
    stop_words = ["limited", "the", "ltd", "plc"]
    pat = r"\b(?:{})\b".format("|".join(stop_words))
    cleaned = cleaned.str.replace(pat, "", regex=True)
    # Single whitespace-collapse pass (the original ran two; one suffices
    # because the stop-word pattern is insensitive to run length), then
    # strip both ends in one call.
    cleaned = cleaned.str.replace(r"\s+", " ", regex=True)
    cleaned = cleaned.str.strip()
    return cleaned
def match_firm_names(
    prime,
    secon=None,
    p_match_col="prime_name",
    s_match_col="secon_name",
):
    """Fuzzy-match firm names between two dataframes via TF-IDF cosine similarity.

    Asymmetric: ``prime`` defines the TF-IDF vector space and ``secon`` is
    projected into it. If ``secon`` is None, ``prime`` is matched against
    itself and the trivial self-match (the diagonal) is zeroed out.

    Parameters
    ----------
    prime : pd.DataFrame
        Dataframe containing the name column ``p_match_col``.
    secon : pd.DataFrame, optional
        Dataframe containing the name column ``s_match_col``. Defaults to
        None, meaning self-matching on ``prime``.
    p_match_col : str
        Name column in ``prime``.
    s_match_col : str
        Name column in ``secon`` (ignored when ``secon`` is None).

    Returns
    -------
    pd.DataFrame
        One row per ``prime`` record: the prime columns, the best-matching
        secon record's columns, and a ``match_score`` cosine similarity.

    Raises
    ------
    ValueError
        If ``prime`` (or a supplied ``secon``) is not a pandas DataFrame.
    """

    def clean_names(df, col):
        # Work on a copy so the caller's dataframe is never mutated
        # (the original wrote through .loc into the caller's frame).
        df = df.copy()
        df.loc[:, col] = df[col].astype(str)
        df.loc[:, col + "_cln"] = clean_employer_names(df, col)
        # One row per distinct cleaned name.
        df = df.drop_duplicates(subset=col + "_cln", keep="first")
        return df

    # isinstance (not type ==) so DataFrame subclasses are accepted.
    if not isinstance(prime, pd.DataFrame):
        raise ValueError("prime should be a dataframe.")
    prime = clean_names(prime, p_match_col)
    secon_flag = True
    if secon is None:
        # Self-match: duplicate prime and prefix its columns so the two
        # sides are distinguishable in the concatenated output.
        secon = prime
        secon_flag = False
        prefix = "secon_"
        secon = secon.rename(columns={c: prefix + c for c in secon.columns})
        s_match_col = prefix + p_match_col
    else:
        if not isinstance(secon, pd.DataFrame):
            raise ValueError("secon should be a dataframe.")
        secon = clean_names(secon, s_match_col)

    # Drop records on either side with no usable cleaned name.
    secon.loc[:, s_match_col + "_cln"] = secon[s_match_col + "_cln"].astype("string")
    secon = secon.dropna(subset=[s_match_col + "_cln"])
    prime.loc[:, p_match_col + "_cln"] = prime[p_match_col + "_cln"].astype("string")
    prime = prime.dropna(subset=[p_match_col + "_cln"])

    # ----------------------
    # Matching: character-level n-gram TF-IDF + top-n cosine similarity
    # ----------------------
    vectorizer = TfidfVectorizer(
        analyzer="char_wb", ngram_range=(1, 4), max_features=30000, encoding="utf-8"
    )
    # Fit the vocabulary on prime; express secon in the same space.
    tfidf_prime = vectorizer.fit_transform(prime[p_match_col + "_cln"])
    tfidf_secon = vectorizer.transform(secon[s_match_col + "_cln"])

    num_matches = 2  # top-2 so a zeroed self-match still leaves a candidate
    threshold = 0  # keep all scores; no similarity floor
    n_jobs = 4  # number of threads to use
    mat_of_scores = awesome_cossim_topn(
        tfidf_prime,
        tfidf_secon.T,
        num_matches,
        threshold,
        use_threads=True,
        n_jobs=n_jobs,
    )
    if not secon_flag:
        # prime vs itself: zero the diagonal so a record cannot match itself.
        mat_of_scores.setdiag(0)

    # mat_of_scores has shape (n_prime, n_secon), so reducing over axis 1
    # yields one best-match index / score per *prime* record (the original
    # comment said "secon entries", which was wrong).
    max_indexes = np.squeeze(np.asarray(np.argmax(mat_of_scores, 1)))
    top_scores = np.max(mat_of_scores, 1)
    matches = pd.concat(
        [
            prime.reset_index(drop=True),
            secon.iloc[max_indexes].reset_index(drop=True),
            pd.DataFrame(top_scores.toarray()),
        ],
        axis=1,
        join="inner",
    )
    matches = matches.reset_index(drop=True).rename(columns={0: "match_score"})
    return matches
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment