Last active
January 2, 2023 21:13
-
-
Save NReilingh/f5ca23713278ef050f8f41b2f6d08413 to your computer and use it in GitHub Desktop.
Sorenson-dice coefficient in a set-based style. Note -- this is an excerpt and probably does not compile, but may be adapted to your use case. This example is implemented in T-SQL (MS SQL Server)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SET ANSI_NULLS, QUOTED_IDENTIFIER ON; | |
GO | |
/*********************************************************************************************************************** | |
Author: Nick Reilingh, Fisher Center at Bard College | |
Description: Preload for sorensen_dice coefficient word pairs on names. | |
Notes: | |
- This procedure is called from LP_FZM_PRELOAD, which supplies it with a dataset_no parameter | |
- It is supplied with dataset parameter from this procedure. | |
- This procedure populates a temp table with letter pairs for a dataset to be matched against later. | |
***********************************************************************************************************************/ | |
CREATE OR ALTER PROCEDURE [fzm].[sorensen_dice_names_PRELOAD] | |
@dataset_no int = NULL | |
AS | |
WITH numbers AS ( | |
SELECT | |
number | |
FROM master.dbo.spt_values | |
WHERE type = 'P' AND number < 100 | |
), loading_names AS ( | |
SELECT | |
n.customer_no, | |
name = ISNULL(NULLIF(n.fname, '') + ' ', '') + ISNULL(n.lname, ''), | |
name_no = ROW_NUMBER() OVER (PARTITION BY n.customer_no ORDER BY (SELECT 1)) | |
FROM dbo.LS_FZM_NAME n | |
), wordboundaries AS ( | |
SELECT | |
n.customer_no, | |
n.name_no, | |
n.name, | |
a.word | |
FROM loading_names n | |
CROSS APPLY ( | |
SELECT | |
word = RTRIM(SUBSTRING(n.name, number + 1, PATINDEX('%[ ][^ ]%', RIGHT(n.name + ' $', DATALENGTH(n.name) + 1 - number)))) | |
FROM numbers | |
WHERE number <= DATALENGTH(n.name) | |
AND PATINDEX('[ ][^ ]%', RIGHT(' ' + n.name, DATALENGTH(n.name) + 1 - number)) = 1 | |
) a | |
), wordpairs AS ( | |
SELECT | |
w.customer_no, | |
w.name_no, | |
w.name, | |
a.letterpair | |
FROM wordboundaries w | |
CROSS APPLY ( | |
SELECT | |
letterpair = SUBSTRING(w.word, number + 1, 2) | |
FROM numbers | |
WHERE number < DATALENGTH(w.word) - 1 | |
) a | |
), letterpairs_enumerated AS ( | |
SELECT | |
p.customer_no, | |
p.name_no, | |
p.name, | |
p.letterpair, | |
letteridx = ROW_NUMBER() OVER (PARTITION BY customer_no, name_no, letterpair ORDER BY (SELECT 1)), | |
paircount = COUNT(1) OVER (PARTITION BY customer_no, name_no) | |
FROM wordpairs p | |
) | |
INSERT tempdb.dbo.[fzm_sorensen_dice_names_tess_letterpairs] | |
SELECT * FROM letterpairs_enumerated; | |
GO | |
/*********************************************************************************************************************** | |
Author: Nick Reilingh, Fisher Center at Bard College | |
Description: Calculates sorensen_dice coefficient. Requires running Preload procedure in the same session. | |
***********************************************************************************************************************/ | |
CREATE OR ALTER FUNCTION [fzm].[sorensen_dice_names]( | |
@dataset_no int, | |
@primary_id varchar(16) | |
) | |
RETURNS TABLE AS RETURN | |
WITH numbers AS ( | |
SELECT | |
number | |
FROM master.dbo.spt_values | |
WHERE type = 'P' AND number < 100 | |
), primary_id_names AS ( | |
SELECT | |
name = ISNULL(NULLIF(n.fname, '') + ' ', '') + ISNULL(lname, ''), | |
name_no = ROW_NUMBER() OVER (ORDER BY (SELECT 1)) | |
FROM dbo.LS_FZM_NAME_INPUT n | |
WHERE n.dataset_no = @dataset_no | |
AND n.primary_id = @primary_id | |
), wordboundaries AS ( | |
SELECT | |
n.name_no, | |
a.word | |
FROM primary_id_names n | |
CROSS APPLY ( | |
SELECT | |
word = RTRIM(SUBSTRING(n.name, number + 1, PATINDEX('%[ ][^ ]%', RIGHT(n.name + ' $', DATALENGTH(n.name) + 1 - number)))) | |
FROM numbers | |
WHERE number <= DATALENGTH(n.name) | |
AND PATINDEX('[ ][^ ]%', RIGHT(' ' + n.name, DATALENGTH(n.name) + 1 - number)) = 1 | |
) a | |
), wordpairs AS ( | |
SELECT | |
w.name_no, | |
a.letterpair | |
FROM wordboundaries w | |
CROSS APPLY ( | |
SELECT | |
letterpair = SUBSTRING(w.word, number + 1, 2) | |
FROM numbers | |
WHERE number < DATALENGTH(w.word) - 1 | |
) a | |
), letterpairs_enumerated AS ( | |
SELECT | |
p.name_no, | |
p.letterpair, | |
letteridx = ROW_NUMBER() OVER (PARTITION BY name_no, letterpair ORDER BY (SELECT 1)), | |
paircount = COUNT(1) OVER (PARTITION BY name_no) | |
FROM wordpairs p | |
), scored AS ( | |
SELECT DISTINCT | |
t.customer_no, | |
score = 200.0 * (COUNT(1) OVER (PARTITION BY t.customer_no, t.name_no, i.name_no)) / (t.paircount + i.paircount), | |
t.name | |
FROM tempdb.dbo.[fzm_sorensen_dice_names_tess_letterpairs] t | |
JOIN letterpairs_enumerated i | |
ON t.letterpair = i.letterpair | |
AND t.letteridx = i.letteridx | |
), ordered AS ( | |
SELECT | |
customer_no, | |
score = CAST(ROUND(score, 0) AS int), | |
criteria = name, | |
[order] = ROW_NUMBER() OVER (PARTITION BY customer_no ORDER BY score DESC) | |
FROM scored | |
WHERE score >= 45 | |
) | |
SELECT | |
customer_no, | |
score, | |
criteria | |
FROM ordered WHERE [order] = 1; | |
GO |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment