Bergvca · April 4, 2018 11:51
diff --git a/README.MD b/README.MD
diff --git a/Name matching in SQL Server.sql b/Name matching in SQL Server.sql
 -- First create matches using a UDF, here I am using a combination of Jaro Winkler and (a normalized version of) Levensthein
 --
 -- Input: cleaned_table: a table with "cleaned" names
 -- Output: tmp_groups: a table with uid - group_id tuples. Each group_id contains all uid's that belong to names that match. 


 DROP TABLE #matches
 SELECT  a.clean_Name, 
        a.uid,
        b.clean_Name clean_name_2, 
        b.uind uid_2,
        dbo.calcJaroWinkler(a.Clean_Name, b.Clean_Name) JWscore, 
        dbo.CalcLevenshtein(a.Clean_Name, b.Clean_Name) LevenstheinScore,
        dbo.calcJaroWinkler(a.First_Word, b.First_Word) JWscore_First_Word, 
        dbo.CalcLevenshtein(a.First_Word, b.First_Word) LevenstheinScore_First_Word,
        a.Nr_of_Words  Nr_of_Words_1,
        b.Nr_of_Words Nr_of_Words_2
 INTO #matches
 FROM #cleaned_table A
 JOIN #cleaned_table B
 ON (
      a.Clean_Name = b.Clean_Name
 	    OR
 	 (
 		  dbo.calcLevenshtein(a.Clean_Name, b.Clean_Name) > 90
 		  OR [dbo].calcJaroWinkler(a.Clean_Name, b.Clean_Name) > 0.9
 		)
 	)
 	AND a.uid != b.uid;
  
 -- For each UID, create a group to which it belongs. This "Group_ID" is the minimum UID of all the matches this name has.
 DROP TABLE TMP_GROUPS;
 SELECT CASE WHEN uid < min(uid_2) 
 			THEN uid 
 			ELSE min(uid_2)
 		END AS Group_ID,
 		uid
 INTO TMP_GROUPS
 FROM #matches
 GROUP BY uid; 

 -- The previous step will, in some cases, generate "orphaned items". E.g. in the case when A is matched with B, and B is matched with C, 
 -- but C is not matched with A. In this case C will have to be matched with the group A. 
 DECLARE @Number_of_Lone_Groups INT = (SELECT COUNT(1)
 					FROM #matches
 					LEFT JOIN TMP_GROUPS Left
 					ON #matches.uid = Left.uid
 					LEFT JOIN TMP_GROUPS Right
 					ON #matches.[uid_2] = Right.UID
 					WHERE Left.Group_ID != Right.Group_ID) 
 	
 --While these orphans exist, find their parents, and update them with the correct Group ID:
 WHILE @Number_of_Lone_Groups != 0 BEGIN
 	UPDATE Left
 		SET Left.Group_ID =	
 			 CASE WHEN Left.Group_ID > Right.Group_Id
 				THEN Right.Group_Id
 				ELSE Left.Group_ID
 			END
 		FROM #matches
 		LEFT JOIN TMP_GROUPS Left
 		ON #matches.[Organization_id] = Left.organization_id
 		LEFT JOIN TMP_GROUPS Right
 		ON #matches.[organization_id_2] = Right.organization_id
 		WHERE Left.Group_ID != Right.Group_ID


 	UPDATE Right
 		SET	Right.Group_ID =
 			CASE WHEN Left.Group_ID < Right.Group_Id
 				 THEN Left.Group_Id
 				 ELSE Right.Group_ID
 			END
 		FROM #matches
 		LEFT JOIN TMP_GROUPS Left
 		ON #matches.uid = Left.uid
 		LEFT JOIN TMP_GROUPS Right
 		ON #matches.uid_2 = Right.uid
 		WHERE Left.Group_ID != Right.Group_ID

 	SET @Number_of_Lone_Groups = (SELECT COUNT(1)
 									FROM #matches
 									LEFT JOIN TMP_GROUPS Left
 									ON #matches.[Organization_id] = Left.organization_id
 									LEFT JOIN TMP_GROUPS Right
 									ON #matches.[organization_id_2] = Right.organization_id
 									WHERE Left.Group_ID != Right.Group_ID)
 END
	-- First create matches using a UDF, here I am using a combination of Jaro Winkler and (a normalized version of) Levensthein
	--
	-- Input: cleaned_table: a table with "cleaned" names
	-- Output: tmp_groups: a table with uid - group_id tuples. Each group_id contains all uid's that belong to names that match.


	DROP TABLE #matches
	SELECT a.clean_Name,
	a.uid,
	b.clean_Name clean_name_2,
	b.uind uid_2,
	dbo.calcJaroWinkler(a.Clean_Name, b.Clean_Name) JWscore,
	dbo.CalcLevenshtein(a.Clean_Name, b.Clean_Name) LevenstheinScore,
	dbo.calcJaroWinkler(a.First_Word, b.First_Word) JWscore_First_Word,
	dbo.CalcLevenshtein(a.First_Word, b.First_Word) LevenstheinScore_First_Word,
	a.Nr_of_Words Nr_of_Words_1,
	b.Nr_of_Words Nr_of_Words_2
	INTO #matches
	FROM #cleaned_table A
	JOIN #cleaned_table B
	ON (
	a.Clean_Name = b.Clean_Name
	OR
	(
	dbo.calcLevenshtein(a.Clean_Name, b.Clean_Name) > 90
	OR [dbo].calcJaroWinkler(a.Clean_Name, b.Clean_Name) > 0.9
	)
	)
	AND a.uid != b.uid;

	-- For each UID, create a group to which it belongs. This "Group_ID" is the minimum UID of all the matches this name has.
	DROP TABLE TMP_GROUPS;
	SELECT CASE WHEN uid < min(uid_2)
	THEN uid
	ELSE min(uid_2)
	END AS Group_ID,
	uid
	INTO TMP_GROUPS
	FROM #matches
	GROUP BY uid;

	-- The previous step will, in some cases, generate "orphaned items". E.g. in the case when A is matched with B, and B is matched with C,
	-- but C is not matched with A. In this case C will have to be matched with the group A.
	DECLARE @Number_of_Lone_Groups INT = (SELECT COUNT(1)
	FROM #matches
	LEFT JOIN TMP_GROUPS Left
	ON #matches.uid = Left.uid
	LEFT JOIN TMP_GROUPS Right
	ON #matches.[uid_2] = Right.UID
	WHERE Left.Group_ID != Right.Group_ID)

	--While these orphans exist, find their parents, and update them with the correct Group ID:
	WHILE @Number_of_Lone_Groups != 0 BEGIN
	UPDATE Left
	SET Left.Group_ID =
	CASE WHEN Left.Group_ID > Right.Group_Id
	THEN Right.Group_Id
	ELSE Left.Group_ID
	END
	FROM #matches
	LEFT JOIN TMP_GROUPS Left
	ON #matches.[Organization_id] = Left.organization_id
	LEFT JOIN TMP_GROUPS Right
	ON #matches.[organization_id_2] = Right.organization_id
	WHERE Left.Group_ID != Right.Group_ID


	UPDATE Right
	SET Right.Group_ID =
	CASE WHEN Left.Group_ID < Right.Group_Id
	THEN Left.Group_Id
	ELSE Right.Group_ID
	END
	FROM #matches
	LEFT JOIN TMP_GROUPS Left
	ON #matches.uid = Left.uid
	LEFT JOIN TMP_GROUPS Right
	ON #matches.uid_2 = Right.uid
	WHERE Left.Group_ID != Right.Group_ID

	SET @Number_of_Lone_Groups = (SELECT COUNT(1)
	FROM #matches
	LEFT JOIN TMP_GROUPS Left
	ON #matches.[Organization_id] = Left.organization_id
	LEFT JOIN TMP_GROUPS Right
	ON #matches.[organization_id_2] = Right.organization_id
	WHERE Left.Group_ID != Right.Group_ID)
	END