Created
August 4, 2015 11:25
-
-
Save elliottcordo/a883894f1defd5ab578c to your computer and use it in GitHub Desktop.
simple non-fuzzy customer matching in postgres
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
create index ix_customer_first_last on mstr.d_customer(first_name, last_name); | |
create index ix_email on mstr.d_customer(email); | |
drop table cust_matches; | |
--1. first name matches | |
create temporary table cust_matches as | |
select c.customer_key as customer_key_1, c.first_name as first_name_1,c.last_name as last_name_1, c.email as email_1, | |
c1.customer_key as customer_key_2, c1.first_name as first_name_2, c1.last_name as last_name_2, c1.email as email_2, | |
.5::float as match_score | |
from mstr.d_customer c | |
join mstr.d_customer c1 on c.first_name=c1.first_name and c.last_name = c1.last_name | |
where c.customer_key <> c1.customer_key | |
and length(c.last_name) > 3 and length(c.first_name) > 3; | |
--2. now email | |
insert into cust_matches | |
select c.customer_key as customer_key_1, c.first_name as first_name_1,c.last_name as last_name_1, c.email as email_1, | |
c1.customer_key as customer_key_2, c1.first_name as first_name_2, c1.last_name as last_name_2, c1.email as email_2, | |
.8::float as match_score | |
from mstr.d_customer c | |
join mstr.d_customer c1 on c.email = c1.email | |
where c.customer_key <> c1.customer_key | |
and c.email like '%@%' | |
and c.email not in ( | |
select email from mstr.d_customer | |
group by email, source | |
having count(1) > 10); --or some reasonable number | |
--3. now location (with score of .25) | |
--4. now company (score of .25) - need to figure out how to do this (D&B append?) | |
--5. once this is all done we will need to dedup this table as you will have matches duplicated from the right and left side | |
--6. sum resulting scores | |
--7. recursively "walk" the table to find ultimate parent (selecting lowest number customer_key) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment