Skip to content

Instantly share code, notes, and snippets.

@rmalayter
Last active March 22, 2018 17:00
Show Gist options
  • Save rmalayter/c6bf74cdb369cc5aa92d277b245720de to your computer and use it in GitHub Desktop.
Save rmalayter/c6bf74cdb369cc5aa92d277b245720de to your computer and use it in GitHub Desktop.
Automated outliter detection (Microsoft SQL Server)
/* Microsoft T-SQL, tested on SQL Server 2014 */
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
USE tempdb;
/*********** CREATE TEST DATA ***********/
/* temporary table for testing with sample calculated metric, could be the result of a WITH common table expression or subquery instead*/
CREATE TABLE #mymetricdata (
key_field INT IDENTITY PRIMARY KEY CLUSTERED
,metric FLOAT
);
/*fill the test table with random data */
WHILE (
SELECT count(1)
FROM #mymetricdata
) < 100
BEGIN
INSERT INTO #mymetricdata (metric)
/* get better random numbers than RAND() by using CRYPT_GEN_RANDOM(7) and dividing by 2^56 */
VALUES (CAST(CRYPT_GEN_RANDOM(7) AS BIGINT) / 72057594037927936e0 * 200 - 100);
END
/*now make some potential outliers, but not all updated here will be outliers due to use of RAND */
UPDATE #mymetricdata
SET metric = metric * 10e0
WHERE key_field IN (
SELECT TOP 5 PERCENT key_field
FROM #mymetricdata
ORDER BY NEWID()
);
/*********** CALCULATE OUTLIERS ***********/
/* simple median calculation over the metric field */
WITH MED
AS (
SELECT TOP 1 median = PERCENTILE_CONT(0.5) WITHIN
GROUP (
ORDER BY metric
) OVER ()
FROM #mymetricdata
)
/* median absolute deviation (MAD) is the median of the differences between each point and the whole-set median calculated above */
,MAD
AS (
SELECT TOP 1 mad = PERCENTILE_CONT(0.5) WITHIN
GROUP (
ORDER BY ABS(metric - MED.median)
) OVER ()
,MED.median
FROM #mymetricdata
CROSS JOIN MED --only 1 row
)
/* the modified Z-score function is pretty simple, and scores > 3.5 are considered outliers
from https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm#Z-Scores */
,scores
AS (
SELECT key_field
,metric
/* if MAD is zero, we can't divide by it
Z-score is therefore infinity unless that row's
absolute deviation from the median is exactly zero.
Not likely in real-world data, but possible if
more than half the values in the source data are
exactly the same */
,ModZscore = CASE
WHEN MAD.mad <> CAST(0 AS FLOAT)
THEN 0.6745e0 * abs(metric - MAD.median) / MAD.mad
ELSE CASE
WHEN abs(metric - MAD.median) = CAST(0 AS FLOAT)
THEN CAST(0 AS FLOAT)
ELSE 1e30 /* SQL doesn't support IEEE-754 infintiy so use huge number as a substitute */
END
END
,MAD.median
,MAD.mad
FROM #mymetricdata
CROSS JOIN MAD --only 1 row
)
SELECT key_field
,metric
,ModZscore
,outlier = CASE
WHEN ModZscore > 3.5
THEN 1
ELSE 0
END
,median
,mad
FROM scores
ORDER BY ModZscore DESC;
/*********** CLEANUP TEST DATA ***********/
DROP TABLE #mymetricdata;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment