rtempleton · April 11, 2020 22:00
diff --git a/SnowflakeFuzzyMatching1.sql b/SnowflakeFuzzyMatching1.sql
 use role sysadmin;

 -- set up dev environment
 create database if not exists my_test_db;
 create schema if not exists fuzzy_match;
 create warehouse if not exists dev_wh warehouse_size = 'small' auto_suspend = 300 initially_suspended=true;
 use schema my_test_db.fuzzy_match;
 use warehouse dev_wh;


 -- define the standardize function
 create or replace function standardize(a String) 
 returns string 
 strict immutable 
 COMMENT = 'Removes non-alphanumeric characters and casts the result to UPPER case'
 as $$ select REGEXP_REPLACE(UPPER(a),'[^A-Z0-9 ]', '') $$;


 -- create a samples data set derived from the TPCDS sample datasets - standardize these fields while creating the table
 create or replace transient table fuzzy_match.samples as(
 select 
 	standardize(a.c_customer_sk) as cust_key
 	, standardize(a.c_current_addr_sk) as add_key
 	, standardize(a.c_first_name) as fname
 	, standardize(a.c_last_name) as lname
 	, standardize(concat(b.ca_street_number, ' ', b.ca_street_name, ' ', b.ca_street_type)) as address
 	, standardize(b.ca_city) as city
 	, standardize(b.ca_state) as state
 	, standardize(b.ca_zip) as zip
 from "SNOWFLAKE_SAMPLE_DATA"."TPCDS_SF10TCL"."CUSTOMER" a
 inner join "SNOWFLAKE_SAMPLE_DATA"."TPCDS_SF10TCL"."CUSTOMER_ADDRESS" b
 on a.c_current_addr_sk = b.ca_address_sk
 where a.c_first_name is not null and
 a.c_last_name is not null and
 address is not null and
 ca_state is not null and
 ca_zip is not null)
 limit 500000; --dial this down to limit the amount of computing that gets used later


 -- Define a dictionary table
 create or replace transient table dictionary (
 token String not null,
 term String not null,
 context String not null,
 constraint UK unique (token, context)
 );

 /*
 	Lets look for some candidate names to add to the dictionary
 	get back all the fnames that are less than 5 characters
 */
 select distinct fname from samples where len(fname) < 5;

 --MAX is one of the returned values, let's see what other names in the SAMPLES align to MAX
 select distinct fname from samples where fname like ('MAX%');

 --add some name terms to the dictionary
 insert into dictionary values ('MAXINE', 'MAX', 'NAME'), ('MAXIMO', 'MAX', 'NAME'), ('MAXIE', 'MAX', 'NAME'), ('MAXWELL', 'MAX', 'NAME');
 insert into dictionary values ('JEFFREY', 'JEFF', 'NAME'), ('JEFFERY', 'JEFF', 'NAME'), ('JEFFRY', 'JEFF', 'NAME'), ('JEFFIE', 'JEFF', 'NAME'), ('JEFFERSON', 'JEFF', 'NAME');
 insert into dictionary values ('JACKIE', 'JACK', 'NAME'), ('JACKSON', 'JACK', 'NAME'), ('JACKI', 'JACK', 'NAME'), ('JACKLYN', 'JACK', 'NAME'), ('JACKELYN', 'JACK', 'NAME');
 insert into dictionary values ('LEONA', 'LEO', 'NAME'), ('LEONARDO', 'LEO', 'NAME'), ('LEONILA', 'LEO', 'NAME'), ('LEOLA', 'LEO', 'NAME'), ('LEON', 'LEO', 'NAME'), ('LEONARD', 'LEO', 'NAME'), ('LEONIDA', 'LEO', 'NAME');


 --add some address terms to the dictionary
 insert into dictionary values ('AVENUE', 'AVE', 'ADDRESS'), ('STREET', 'ST', 'ADDRESS'), ('BOULEVARD', 'BLVD', 'ADDRESS'), ('COURT', 'CT', 'ADDRESS'), ('LANE', 'LN', 'ADDRESS'), ('CIRCLE', 'CIR', 'ADDRESS'), ('PARKWAY', 'PKWY', 'ADDRESS'), ('WAY', 'WY', 'ADDRESS'), ('DRIVE', 'DR', 'ADDRESS');
 insert into dictionary values ('FIRST', '1ST', 'ADDRESS'), ('SECOND', '2ND', 'ADDRESS'), ('THIRD', '3RD', 'ADDRESS'), ('FOURTH', '4TH', 'ADDRESS'), ('FIFTH', '5TH', 'ADDRESS'), ('SIXTH', '6TH', 'ADDRESS'), ('SEVENTH', '7TH', 'ADDRESS'), ('EIGHTH', '8TH', 'ADDRESS'), ('NINTH', '9TH', 'ADDRESS');

 --alter the samples table to hold standardized fname and address values
 alter table samples add column fname_std string, address_std string;


 --apply the standardized fname to the new fname_std column
 update samples s set fname_std = d.name from (
    select
        a.cust_key,
        coalesce(b.term, a.fname) name
    from samples a
    left join dictionary b
    on a.fname = b.token and context = 'NAME') d
 where s.cust_key = d.cust_key;


 --see the records that were updated
 select * from samples where fname != fname_std;

 /*
 	The query below tokenizes the address field from the samples table and then performs lookups into the dictionary table.
 	It then puts the tokens back together and saves the new standardized address values in a temp table. Note we use the unique
 	CUST_KEY as there may exist duplicate ADD_KEYS (such in cases of households)

 */
 create or replace temp table standard_address as (
 select 
    cust_key,
    trim(REGEXP_REPLACE(x,'[^A-Z0-9 ]', ' ')) address from (
 select 
  cust_key,
  to_json(array_agg(v) within group (order by idx asc)) x from(

 with tokenized as (
    select 
        t.index,
        t.value,
        s.cust_key
    from samples s, lateral split_to_table(s.address, ' ') t
        where value > ''
  )
 select
    t.cust_key,
    t.index idx,
    coalesce(d.term, t.value) v
 from tokenized t
 left join dictionary d
 on t.value = d.token and d.context = 'ADDRESS'
 order by 1,2
  )
 group by cust_key));
 select * from standard_address limit 20;


 --Join the updated address values back into our samples table
 update samples s set s.address_std = a.address from 
    standard_address a 
    where s.cust_key = a.cust_key;

    
 --define out fuzzy_scoring UDF
 create or replace function fuzzy_score(a String, b String)
 returns number
 strict
 immutable
 COMMENT = 'Takes two strings and returns a similarity score between 1 and 0'
 as 'select 1.0-(editdistance(a, b)/greatest(length(a),length(b)))'; 


 -- use this query to check the number of pairs that will be generated based on your blocking key
 select
    sum(pairs) pairs from (
 select
    state,
    count(*) recs,
    (count(*)*(count(*)-1)/2)::number(12,0) pairs
 from samples
 group by 1);  

 /*
 	Generate pairs for downstream scoring evaluation
 	Note this uses ZIP as the blocking key

 	Consider sorting the source table by the zip and cust_key for better performance
 	create or replace table samples as (select * from samples order by zip, cust_key);
 */
 create or replace transient table candidate_pairs as (
    select
    t1.cust_key cust_key1 
    , t2.cust_key cust_key2
    , t1.fname fname1 
    , t2.fname fname2
    , t1.lname lname1 
    , t2.lname lname2
    , t1.address address1 
    , t2.address address2
    , t1.fname_std fname_std1 
    , t2.fname_std fname_std2
    , t1.address_std address_std1 
    , t2.address_std address_std2
 from samples t1
 inner join samples t2 on t1.zip = t2.zip and t1.cust_key < t2.cust_key);


 --flex the warehouse up, run the scoring then flex back down again
 alter warehouse dev_wh set warehouse_size = 'XLARGE';

 create or replace transient table scores as( 
 select
    cust_key1 || '-' || cust_key2 pair_key
    , cust_key1 
    , cust_key2 
    , fuzzy_score(fname1, fname2) fname_fuzzy
    , fuzzy_score(soundex(fname1), soundex(fname2)) fname_soundex
    , fuzzy_score(fname_std1, fname_std2) fname_std_fuzzy
    , fuzzy_score(lname1, lname2) lname_fuzzy 
    , fuzzy_score(soundex(lname1), soundex(lname2)) lname_soundex
    , fuzzy_score(address1, address2) address_fuzzy
    , fuzzy_score(address_std1, address_std2) address_std_fuzzy
 from candidate_pairs);

 alter warehouse dev_wh set warehouse_size = 'SMALL'; 


 -- Look at some records that are potential duplicates - play with the thresholds and other score values to see compared records in over/under comparison
 with candidates as (
    select 
        pair_key,
        cust_key1,
        cust_key2
        from scores
        where fname_soundex >= .75 and fname_fuzzy >= .75 and lname_fuzzy >= .75 and address_std_fuzzy >= .50
 )
 select
    pair_key,
    samples.*
 from candidates
 inner join samples on cust_key = cust_key1
 union
 select
    pair_key,
    samples.*
 from candidates
 inner join samples on cust_key = cust_key2
 order by pair_key, cust_key
 limit 100;
	use role sysadmin;

	-- set up dev environment
	create database if not exists my_test_db;
	create schema if not exists fuzzy_match;
	create warehouse if not exists dev_wh warehouse_size = 'small' auto_suspend = 300 initially_suspended=true;
	use schema my_test_db.fuzzy_match;
	use warehouse dev_wh;


	-- define the standardize function
	create or replace function standardize(a String)
	returns string
	strict immutable
	COMMENT = 'Removes non-alphanumeric characters and casts the result to UPPER case'
	as $$ select REGEXP_REPLACE(UPPER(a),'[^A-Z0-9 ]', '') $$;


	-- create a samples data set derived from the TPCDS sample datasets - standardize these fields while creating the table
	create or replace transient table fuzzy_match.samples as(
	select
	standardize(a.c_customer_sk) as cust_key
	, standardize(a.c_current_addr_sk) as add_key
	, standardize(a.c_first_name) as fname
	, standardize(a.c_last_name) as lname
	, standardize(concat(b.ca_street_number, ' ', b.ca_street_name, ' ', b.ca_street_type)) as address
	, standardize(b.ca_city) as city
	, standardize(b.ca_state) as state
	, standardize(b.ca_zip) as zip
	from "SNOWFLAKE_SAMPLE_DATA"."TPCDS_SF10TCL"."CUSTOMER" a
	inner join "SNOWFLAKE_SAMPLE_DATA"."TPCDS_SF10TCL"."CUSTOMER_ADDRESS" b
	on a.c_current_addr_sk = b.ca_address_sk
	where a.c_first_name is not null and
	a.c_last_name is not null and
	address is not null and
	ca_state is not null and
	ca_zip is not null)
	limit 500000; --dial this down to limit the amount of computing that gets used later


	-- Define a dictionary table
	create or replace transient table dictionary (
	token String not null,
	term String not null,
	context String not null,
	constraint UK unique (token, context)
	);

	/*
	Lets look for some candidate names to add to the dictionary
	get back all the fnames that are less than 5 characters
	*/
	select distinct fname from samples where len(fname) < 5;

	--MAX is one of the returned values, let's see what other names in the SAMPLES align to MAX
	select distinct fname from samples where fname like ('MAX%');

	--add some name terms to the dictionary
	insert into dictionary values ('MAXINE', 'MAX', 'NAME'), ('MAXIMO', 'MAX', 'NAME'), ('MAXIE', 'MAX', 'NAME'), ('MAXWELL', 'MAX', 'NAME');
	insert into dictionary values ('JEFFREY', 'JEFF', 'NAME'), ('JEFFERY', 'JEFF', 'NAME'), ('JEFFRY', 'JEFF', 'NAME'), ('JEFFIE', 'JEFF', 'NAME'), ('JEFFERSON', 'JEFF', 'NAME');
	insert into dictionary values ('JACKIE', 'JACK', 'NAME'), ('JACKSON', 'JACK', 'NAME'), ('JACKI', 'JACK', 'NAME'), ('JACKLYN', 'JACK', 'NAME'), ('JACKELYN', 'JACK', 'NAME');
	insert into dictionary values ('LEONA', 'LEO', 'NAME'), ('LEONARDO', 'LEO', 'NAME'), ('LEONILA', 'LEO', 'NAME'), ('LEOLA', 'LEO', 'NAME'), ('LEON', 'LEO', 'NAME'), ('LEONARD', 'LEO', 'NAME'), ('LEONIDA', 'LEO', 'NAME');


	--add some address terms to the dictionary
	insert into dictionary values ('AVENUE', 'AVE', 'ADDRESS'), ('STREET', 'ST', 'ADDRESS'), ('BOULEVARD', 'BLVD', 'ADDRESS'), ('COURT', 'CT', 'ADDRESS'), ('LANE', 'LN', 'ADDRESS'), ('CIRCLE', 'CIR', 'ADDRESS'), ('PARKWAY', 'PKWY', 'ADDRESS'), ('WAY', 'WY', 'ADDRESS'), ('DRIVE', 'DR', 'ADDRESS');
	insert into dictionary values ('FIRST', '1ST', 'ADDRESS'), ('SECOND', '2ND', 'ADDRESS'), ('THIRD', '3RD', 'ADDRESS'), ('FOURTH', '4TH', 'ADDRESS'), ('FIFTH', '5TH', 'ADDRESS'), ('SIXTH', '6TH', 'ADDRESS'), ('SEVENTH', '7TH', 'ADDRESS'), ('EIGHTH', '8TH', 'ADDRESS'), ('NINTH', '9TH', 'ADDRESS');

	--alter the samples table to hold standardized fname and address values
	alter table samples add column fname_std string, address_std string;


	--apply the standardized fname to the new fname_std column
	update samples s set fname_std = d.name from (
	select
	a.cust_key,
	coalesce(b.term, a.fname) name
	from samples a
	left join dictionary b
	on a.fname = b.token and context = 'NAME') d
	where s.cust_key = d.cust_key;


	--see the records that were updated
	select * from samples where fname != fname_std;

	/*
	The query below tokenizes the address field from the samples table and then performs lookups into the dictionary table.
	It then puts the tokens back together and saves the new standardized address values in a temp table. Note we use the unique
	CUST_KEY as there may exist duplicate ADD_KEYS (such in cases of households)

	*/
	create or replace temp table standard_address as (
	select
	cust_key,
	trim(REGEXP_REPLACE(x,'[^A-Z0-9 ]', ' ')) address from (
	select
	cust_key,
	to_json(array_agg(v) within group (order by idx asc)) x from(

	with tokenized as (
	select
	t.index,
	t.value,
	s.cust_key
	from samples s, lateral split_to_table(s.address, ' ') t
	where value > ''
	)
	select
	t.cust_key,
	t.index idx,
	coalesce(d.term, t.value) v
	from tokenized t
	left join dictionary d
	on t.value = d.token and d.context = 'ADDRESS'
	order by 1,2
	)
	group by cust_key));
	select * from standard_address limit 20;


	--Join the updated address values back into our samples table
	update samples s set s.address_std = a.address from
	standard_address a
	where s.cust_key = a.cust_key;


	--define out fuzzy_scoring UDF
	create or replace function fuzzy_score(a String, b String)
	returns number
	strict
	immutable
	COMMENT = 'Takes two strings and returns a similarity score between 1 and 0'
	as 'select 1.0-(editdistance(a, b)/greatest(length(a),length(b)))';


	-- use this query to check the number of pairs that will be generated based on your blocking key
	select
	sum(pairs) pairs from (
	select
	state,
	count(*) recs,
	(count()(count(*)-1)/2)::number(12,0) pairs
	from samples
	group by 1);

	/*
	Generate pairs for downstream scoring evaluation
	Note this uses ZIP as the blocking key

	Consider sorting the source table by the zip and cust_key for better performance
	create or replace table samples as (select * from samples order by zip, cust_key);
	*/
	create or replace transient table candidate_pairs as (
	select
	t1.cust_key cust_key1
	, t2.cust_key cust_key2
	, t1.fname fname1
	, t2.fname fname2
	, t1.lname lname1
	, t2.lname lname2
	, t1.address address1
	, t2.address address2
	, t1.fname_std fname_std1
	, t2.fname_std fname_std2
	, t1.address_std address_std1
	, t2.address_std address_std2
	from samples t1
	inner join samples t2 on t1.zip = t2.zip and t1.cust_key < t2.cust_key);


	--flex the warehouse up, run the scoring then flex back down again
	alter warehouse dev_wh set warehouse_size = 'XLARGE';

	create or replace transient table scores as(
	select
	cust_key1 \|\| '-' \|\| cust_key2 pair_key
	, cust_key1
	, cust_key2
	, fuzzy_score(fname1, fname2) fname_fuzzy
	, fuzzy_score(soundex(fname1), soundex(fname2)) fname_soundex
	, fuzzy_score(fname_std1, fname_std2) fname_std_fuzzy
	, fuzzy_score(lname1, lname2) lname_fuzzy
	, fuzzy_score(soundex(lname1), soundex(lname2)) lname_soundex
	, fuzzy_score(address1, address2) address_fuzzy
	, fuzzy_score(address_std1, address_std2) address_std_fuzzy
	from candidate_pairs);

	alter warehouse dev_wh set warehouse_size = 'SMALL';


	-- Look at some records that are potential duplicates - play with the thresholds and other score values to see compared records in over/under comparison
	with candidates as (
	select
	pair_key,
	cust_key1,
	cust_key2
	from scores
	where fname_soundex >= .75 and fname_fuzzy >= .75 and lname_fuzzy >= .75 and address_std_fuzzy >= .50
	)
	select
	pair_key,
	samples.*
	from candidates
	inner join samples on cust_key = cust_key1
	union
	select
	pair_key,
	samples.*
	from candidates
	inner join samples on cust_key = cust_key2
	order by pair_key, cust_key
	limit 100;