Last active
March 22, 2018 17:00
-
-
Save rmalayter/c6bf74cdb369cc5aa92d277b245720de to your computer and use it in GitHub Desktop.
Automated outliter detection (Microsoft SQL Server)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Microsoft T-SQL, tested on SQL Server 2014 */ | |
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; | |
USE tempdb; | |
/*********** CREATE TEST DATA ***********/ | |
/* temporary table for testing with sample calculated metric, could be the result of a WITH common table expression or subquery instead*/ | |
CREATE TABLE #mymetricdata ( | |
key_field INT IDENTITY PRIMARY KEY CLUSTERED | |
,metric FLOAT | |
); | |
/*fill the test table with random data */ | |
WHILE ( | |
SELECT count(1) | |
FROM #mymetricdata | |
) < 100 | |
BEGIN | |
INSERT INTO #mymetricdata (metric) | |
/* get better random numbers than RAND() by using CRYPT_GEN_RANDOM(7) and dividing by 2^56 */ | |
VALUES (CAST(CRYPT_GEN_RANDOM(7) AS BIGINT) / 72057594037927936e0 * 200 - 100); | |
END | |
/*now make some potential outliers, but not all updated here will be outliers due to use of RAND */ | |
UPDATE #mymetricdata | |
SET metric = metric * 10e0 | |
WHERE key_field IN ( | |
SELECT TOP 5 PERCENT key_field | |
FROM #mymetricdata | |
ORDER BY NEWID() | |
); | |
/*********** CALCULATE OUTLIERS ***********/ | |
/* simple median calculation over the metric field */ | |
WITH MED | |
AS ( | |
SELECT TOP 1 median = PERCENTILE_CONT(0.5) WITHIN | |
GROUP ( | |
ORDER BY metric | |
) OVER () | |
FROM #mymetricdata | |
) | |
/* median absolute deviation (MAD) is the median of the differences between each point and the whole-set median calculated above */ | |
,MAD | |
AS ( | |
SELECT TOP 1 mad = PERCENTILE_CONT(0.5) WITHIN | |
GROUP ( | |
ORDER BY ABS(metric - MED.median) | |
) OVER () | |
,MED.median | |
FROM #mymetricdata | |
CROSS JOIN MED --only 1 row | |
) | |
/* the modified Z-score function is pretty simple, and scores > 3.5 are considered outliers | |
from https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm#Z-Scores */ | |
,scores | |
AS ( | |
SELECT key_field | |
,metric | |
/* if MAD is zero, we can't divide by it | |
Z-score is therefore infinity unless that row's | |
absolute deviation from the median is exactly zero. | |
Not likely in real-world data, but possible if | |
more than half the values in the source data are | |
exactly the same */ | |
,ModZscore = CASE | |
WHEN MAD.mad <> CAST(0 AS FLOAT) | |
THEN 0.6745e0 * abs(metric - MAD.median) / MAD.mad | |
ELSE CASE | |
WHEN abs(metric - MAD.median) = CAST(0 AS FLOAT) | |
THEN CAST(0 AS FLOAT) | |
ELSE 1e30 /* SQL doesn't support IEEE-754 infintiy so use huge number as a substitute */ | |
END | |
END | |
,MAD.median | |
,MAD.mad | |
FROM #mymetricdata | |
CROSS JOIN MAD --only 1 row | |
) | |
SELECT key_field | |
,metric | |
,ModZscore | |
,outlier = CASE | |
WHEN ModZscore > 3.5 | |
THEN 1 | |
ELSE 0 | |
END | |
,median | |
,mad | |
FROM scores | |
ORDER BY ModZscore DESC; | |
/*********** CLEANUP TEST DATA ***********/ | |
DROP TABLE #mymetricdata; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment