Created
January 27, 2022 13:31
-
-
Save aeturrell/369b22a1ae015b6b29172fe4c70f7f2c to your computer and use it in GitHub Desktop.
Matching two dataframes of firm names using sparse_dot_topn and sklearn
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import glob | |
| import pandas as pd | |
| from tqdm import tqdm | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sparse_dot_topn import awesome_cossim_topn | |
| import string | |
| from nltk.corpus import stopwords | |
def clean_employer_names(df, col_name):
    """Return a cleaned copy of the name column ``col_name`` of ``df``.

    Cleaning steps, in order:
    - lower-case
    - replace every punctuation character with whitespace
    - remove common company stop words ("limited", "the", "ltd", "plc")
      as whole words only
    - collapse runs of whitespace to a single space and strip both ends

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe holding the raw names. ``df`` itself is not modified.
    col_name : str
        Column of ``df`` containing the names to clean.

    Returns
    -------
    pd.Series
        The cleaned names, same index as ``df``.
    """
    cleaned = df[col_name].str.lower()
    # Punctuation -> whitespace in one C-level pass via str.translate.
    cleaned = cleaned.str.translate(
        str.maketrans(string.punctuation, " " * len(string.punctuation))
    )
    # Remove stop words as whole words only: the \b guards stop e.g.
    # "theta" from losing its "the" prefix.
    stop_words = ["limited", "the", "ltd", "plc"]
    pat = r"\b(?:{})\b".format("|".join(stop_words))
    cleaned = cleaned.str.replace(pat, "", regex=True)
    # Single whitespace-collapse pass (the original ran two; one suffices
    # because the stop-word pattern is insensitive to run length), then
    # strip both ends in one call.
    cleaned = cleaned.str.replace(r"\s+", " ", regex=True)
    cleaned = cleaned.str.strip()
    return cleaned
def match_firm_names(
    prime,
    secon=None,
    p_match_col="prime_name",
    s_match_col="secon_name",
):
    """Fuzzy-match firm names between two dataframes via TF-IDF cosine similarity.

    Asymmetric: ``prime`` defines the TF-IDF vector space and ``secon`` is
    projected into it. If ``secon`` is None, ``prime`` is matched against
    itself and the trivial self-match (the diagonal) is zeroed out.

    Parameters
    ----------
    prime : pd.DataFrame
        Dataframe containing the name column ``p_match_col``.
    secon : pd.DataFrame, optional
        Dataframe containing the name column ``s_match_col``. Defaults to
        None, meaning self-matching on ``prime``.
    p_match_col : str
        Name column in ``prime``.
    s_match_col : str
        Name column in ``secon`` (ignored when ``secon`` is None).

    Returns
    -------
    pd.DataFrame
        One row per ``prime`` record: the prime columns, the best-matching
        secon record's columns, and a ``match_score`` cosine similarity.

    Raises
    ------
    ValueError
        If ``prime`` (or a supplied ``secon``) is not a pandas DataFrame.
    """

    def clean_names(df, col):
        # Work on a copy so the caller's dataframe is never mutated
        # (the original wrote through .loc into the caller's frame).
        df = df.copy()
        df.loc[:, col] = df[col].astype(str)
        df.loc[:, col + "_cln"] = clean_employer_names(df, col)
        # One row per distinct cleaned name.
        df = df.drop_duplicates(subset=col + "_cln", keep="first")
        return df

    # isinstance (not type ==) so DataFrame subclasses are accepted.
    if not isinstance(prime, pd.DataFrame):
        raise ValueError("prime should be a dataframe.")
    prime = clean_names(prime, p_match_col)
    secon_flag = True
    if secon is None:
        # Self-match: duplicate prime and prefix its columns so the two
        # sides are distinguishable in the concatenated output.
        secon = prime
        secon_flag = False
        prefix = "secon_"
        secon = secon.rename(columns={c: prefix + c for c in secon.columns})
        s_match_col = prefix + p_match_col
    else:
        if not isinstance(secon, pd.DataFrame):
            raise ValueError("secon should be a dataframe.")
        secon = clean_names(secon, s_match_col)

    # Drop records on either side with no usable cleaned name.
    secon.loc[:, s_match_col + "_cln"] = secon[s_match_col + "_cln"].astype("string")
    secon = secon.dropna(subset=[s_match_col + "_cln"])
    prime.loc[:, p_match_col + "_cln"] = prime[p_match_col + "_cln"].astype("string")
    prime = prime.dropna(subset=[p_match_col + "_cln"])

    # ----------------------
    # Matching: character-level n-gram TF-IDF + top-n cosine similarity
    # ----------------------
    vectorizer = TfidfVectorizer(
        analyzer="char_wb", ngram_range=(1, 4), max_features=30000, encoding="utf-8"
    )
    # Fit the vocabulary on prime; express secon in the same space.
    tfidf_prime = vectorizer.fit_transform(prime[p_match_col + "_cln"])
    tfidf_secon = vectorizer.transform(secon[s_match_col + "_cln"])

    num_matches = 2  # top-2 so a zeroed self-match still leaves a candidate
    threshold = 0  # keep all scores; no similarity floor
    n_jobs = 4  # number of threads to use
    mat_of_scores = awesome_cossim_topn(
        tfidf_prime,
        tfidf_secon.T,
        num_matches,
        threshold,
        use_threads=True,
        n_jobs=n_jobs,
    )
    if not secon_flag:
        # prime vs itself: zero the diagonal so a record cannot match itself.
        mat_of_scores.setdiag(0)

    # mat_of_scores has shape (n_prime, n_secon), so reducing over axis 1
    # yields one best-match index / score per *prime* record (the original
    # comment said "secon entries", which was wrong).
    max_indexes = np.squeeze(np.asarray(np.argmax(mat_of_scores, 1)))
    top_scores = np.max(mat_of_scores, 1)
    matches = pd.concat(
        [
            prime.reset_index(drop=True),
            secon.iloc[max_indexes].reset_index(drop=True),
            pd.DataFrame(top_scores.toarray()),
        ],
        axis=1,
        join="inner",
    )
    matches = matches.reset_index(drop=True).rename(columns={0: "match_score"})
    return matches
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment