Created
April 11, 2020 22:00
-
-
Save rtempleton/ef72925f3bb04cb2f7df34f183f3efd1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use role sysadmin; | |
-- set up dev environment | |
create database if not exists my_test_db; | |
create schema if not exists fuzzy_match; | |
create warehouse if not exists dev_wh warehouse_size = 'small' auto_suspend = 300 initially_suspended=true; | |
use schema my_test_db.fuzzy_match; | |
use warehouse dev_wh; | |
-- define the standardize function | |
create or replace function standardize(a String) | |
returns string | |
strict immutable | |
COMMENT = 'Removes non-alphanumeric characters and casts the result to UPPER case' | |
as $$ select REGEXP_REPLACE(UPPER(a),'[^A-Z0-9 ]', '') $$; | |
-- create a samples data set derived from the TPCDS sample datasets - standardize these fields while creating the table | |
create or replace transient table fuzzy_match.samples as( | |
select | |
standardize(a.c_customer_sk) as cust_key | |
, standardize(a.c_current_addr_sk) as add_key | |
, standardize(a.c_first_name) as fname | |
, standardize(a.c_last_name) as lname | |
, standardize(concat(b.ca_street_number, ' ', b.ca_street_name, ' ', b.ca_street_type)) as address | |
, standardize(b.ca_city) as city | |
, standardize(b.ca_state) as state | |
, standardize(b.ca_zip) as zip | |
from "SNOWFLAKE_SAMPLE_DATA"."TPCDS_SF10TCL"."CUSTOMER" a | |
inner join "SNOWFLAKE_SAMPLE_DATA"."TPCDS_SF10TCL"."CUSTOMER_ADDRESS" b | |
on a.c_current_addr_sk = b.ca_address_sk | |
where a.c_first_name is not null and | |
a.c_last_name is not null and | |
address is not null and | |
ca_state is not null and | |
ca_zip is not null) | |
limit 500000; --dial this down to limit the amount of computing that gets used later | |
-- Define a dictionary table | |
create or replace transient table dictionary ( | |
token String not null, | |
term String not null, | |
context String not null, | |
constraint UK unique (token, context) | |
); | |
/* | |
Lets look for some candidate names to add to the dictionary | |
get back all the fnames that are less than 5 characters | |
*/ | |
select distinct fname from samples where len(fname) < 5; | |
--MAX is one of the returned values, let's see what other names in the SAMPLES align to MAX | |
select distinct fname from samples where fname like ('MAX%'); | |
--add some name terms to the dictionary | |
insert into dictionary values ('MAXINE', 'MAX', 'NAME'), ('MAXIMO', 'MAX', 'NAME'), ('MAXIE', 'MAX', 'NAME'), ('MAXWELL', 'MAX', 'NAME'); | |
insert into dictionary values ('JEFFREY', 'JEFF', 'NAME'), ('JEFFERY', 'JEFF', 'NAME'), ('JEFFRY', 'JEFF', 'NAME'), ('JEFFIE', 'JEFF', 'NAME'), ('JEFFERSON', 'JEFF', 'NAME'); | |
insert into dictionary values ('JACKIE', 'JACK', 'NAME'), ('JACKSON', 'JACK', 'NAME'), ('JACKI', 'JACK', 'NAME'), ('JACKLYN', 'JACK', 'NAME'), ('JACKELYN', 'JACK', 'NAME'); | |
insert into dictionary values ('LEONA', 'LEO', 'NAME'), ('LEONARDO', 'LEO', 'NAME'), ('LEONILA', 'LEO', 'NAME'), ('LEOLA', 'LEO', 'NAME'), ('LEON', 'LEO', 'NAME'), ('LEONARD', 'LEO', 'NAME'), ('LEONIDA', 'LEO', 'NAME'); | |
--add some address terms to the dictionary | |
insert into dictionary values ('AVENUE', 'AVE', 'ADDRESS'), ('STREET', 'ST', 'ADDRESS'), ('BOULEVARD', 'BLVD', 'ADDRESS'), ('COURT', 'CT', 'ADDRESS'), ('LANE', 'LN', 'ADDRESS'), ('CIRCLE', 'CIR', 'ADDRESS'), ('PARKWAY', 'PKWY', 'ADDRESS'), ('WAY', 'WY', 'ADDRESS'), ('DRIVE', 'DR', 'ADDRESS'); | |
insert into dictionary values ('FIRST', '1ST', 'ADDRESS'), ('SECOND', '2ND', 'ADDRESS'), ('THIRD', '3RD', 'ADDRESS'), ('FOURTH', '4TH', 'ADDRESS'), ('FIFTH', '5TH', 'ADDRESS'), ('SIXTH', '6TH', 'ADDRESS'), ('SEVENTH', '7TH', 'ADDRESS'), ('EIGHTH', '8TH', 'ADDRESS'), ('NINTH', '9TH', 'ADDRESS'); | |
--alter the samples table to hold standardized fname and address values | |
alter table samples add column fname_std string, address_std string; | |
--apply the standardized fname to the new fname_std column | |
update samples s set fname_std = d.name from ( | |
select | |
a.cust_key, | |
coalesce(b.term, a.fname) name | |
from samples a | |
left join dictionary b | |
on a.fname = b.token and context = 'NAME') d | |
where s.cust_key = d.cust_key; | |
--see the records that were updated | |
select * from samples where fname != fname_std; | |
/* | |
The query below tokenizes the address field from the samples table and then performs lookups into the dictionary table. | |
It then puts the tokens back together and saves the new standardized address values in a temp table. Note we use the unique | |
CUST_KEY as there may exist duplicate ADD_KEYS (such in cases of households) | |
*/ | |
create or replace temp table standard_address as ( | |
select | |
cust_key, | |
trim(REGEXP_REPLACE(x,'[^A-Z0-9 ]', ' ')) address from ( | |
select | |
cust_key, | |
to_json(array_agg(v) within group (order by idx asc)) x from( | |
with tokenized as ( | |
select | |
t.index, | |
t.value, | |
s.cust_key | |
from samples s, lateral split_to_table(s.address, ' ') t | |
where value > '' | |
) | |
select | |
t.cust_key, | |
t.index idx, | |
coalesce(d.term, t.value) v | |
from tokenized t | |
left join dictionary d | |
on t.value = d.token and d.context = 'ADDRESS' | |
order by 1,2 | |
) | |
group by cust_key)); | |
select * from standard_address limit 20; | |
--Join the updated address values back into our samples table | |
update samples s set s.address_std = a.address from | |
standard_address a | |
where s.cust_key = a.cust_key; | |
--define out fuzzy_scoring UDF | |
create or replace function fuzzy_score(a String, b String) | |
returns number | |
strict | |
immutable | |
COMMENT = 'Takes two strings and returns a similarity score between 1 and 0' | |
as 'select 1.0-(editdistance(a, b)/greatest(length(a),length(b)))'; | |
-- use this query to check the number of pairs that will be generated based on your blocking key | |
select | |
sum(pairs) pairs from ( | |
select | |
state, | |
count(*) recs, | |
(count(*)*(count(*)-1)/2)::number(12,0) pairs | |
from samples | |
group by 1); | |
/* | |
Generate pairs for downstream scoring evaluation | |
Note this uses ZIP as the blocking key | |
Consider sorting the source table by the zip and cust_key for better performance | |
create or replace table samples as (select * from samples order by zip, cust_key); | |
*/ | |
create or replace transient table candidate_pairs as ( | |
select | |
t1.cust_key cust_key1 | |
, t2.cust_key cust_key2 | |
, t1.fname fname1 | |
, t2.fname fname2 | |
, t1.lname lname1 | |
, t2.lname lname2 | |
, t1.address address1 | |
, t2.address address2 | |
, t1.fname_std fname_std1 | |
, t2.fname_std fname_std2 | |
, t1.address_std address_std1 | |
, t2.address_std address_std2 | |
from samples t1 | |
inner join samples t2 on t1.zip = t2.zip and t1.cust_key < t2.cust_key); | |
--flex the warehouse up, run the scoring then flex back down again | |
alter warehouse dev_wh set warehouse_size = 'XLARGE'; | |
create or replace transient table scores as( | |
select | |
cust_key1 || '-' || cust_key2 pair_key | |
, cust_key1 | |
, cust_key2 | |
, fuzzy_score(fname1, fname2) fname_fuzzy | |
, fuzzy_score(soundex(fname1), soundex(fname2)) fname_soundex | |
, fuzzy_score(fname_std1, fname_std2) fname_std_fuzzy | |
, fuzzy_score(lname1, lname2) lname_fuzzy | |
, fuzzy_score(soundex(lname1), soundex(lname2)) lname_soundex | |
, fuzzy_score(address1, address2) address_fuzzy | |
, fuzzy_score(address_std1, address_std2) address_std_fuzzy | |
from candidate_pairs); | |
alter warehouse dev_wh set warehouse_size = 'SMALL'; | |
-- Look at some records that are potential duplicates - play with the thresholds and other score values to see compared records in over/under comparison | |
with candidates as ( | |
select | |
pair_key, | |
cust_key1, | |
cust_key2 | |
from scores | |
where fname_soundex >= .75 and fname_fuzzy >= .75 and lname_fuzzy >= .75 and address_std_fuzzy >= .50 | |
) | |
select | |
pair_key, | |
samples.* | |
from candidates | |
inner join samples on cust_key = cust_key1 | |
union | |
select | |
pair_key, | |
samples.* | |
from candidates | |
inner join samples on cust_key = cust_key2 | |
order by pair_key, cust_key | |
limit 100; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment