Created
April 23, 2022 13:35
-
-
Save ncalm/95e16ac2192b217e931274ac52a8b0a5 to your computer and use it in GitHub Desktop.
This collection of Excel lambda functions provides functionality to quickly apply the standard deviation test on a series of transformations to a continuous variable
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
OUTLIERS | |
These lambda functions allow us to quickly apply the standard deviation test to a series of transformed variables | |
Included in this file: | |
OUTLIER.THRESHOLDS - for calculating outlier thresholds using a standard deviation test | |
inputs: | |
- data, a single column array of a continuous variable | |
- std_devs, the number of standard deviations from the mean of data you want to return as thresholds | |
returns: | |
a single row, two-column array containing the lower and upper thresholds below or above which we might consider the value in data to be an outlier | |
OUTLIER.TEST - for comparing a variable with outlier thresholds and returning flagged outliers | |
inputs: | |
- data - a single column array of a continuous variable, usually transformed via sqrt, ln, log or similar | |
- std_devs - the number of standard deviations from the mean of data you want to return as thresholds, to be passed into OUTLIER.THRESHOLDS | |
- [prefix] - a column header prefix to prepend to each column of the output array | |
returns: | |
A three column array with the ROWS(data) rows and one header row. | |
- column 1 contains the transformed data passed into data | |
- column 2 contains TRUE if the row is outside of the calculated thresholds, FALSE otherwise | |
- column 3 contains either Low or High for rows where column 2 is TRUE | |
OUTLIER.TESTS - for applying OUTLIER.TEST on multiple transformations at once | |
inputs: | |
- data - the single column array of continuous data before applying any transformations | |
- std_devs - the number of standard deviations from the mean of data you want to return as thresholds, to be passed into OUTLIER.TEST and subsequently to OUTLIER.THRESHOLDS | |
- [transforms] - a list of transforms to use in OUTLIER.TEST. The default and allowed values are {"sqrt","ln","log10"} | |
returns: | |
An array with ROWS(data)+1 rows and 1+COLUMNS(transforms)*3 columns, where the first column is the original data and each subsequent group of three columns represents the result of OUTLIER.TEST when called with each of the transformations provided to [transforms] | |
For more details: | |
https://www.flexyourdata.com/blog/excel-lambda-outlier-test-transform-test-and-flag-a-variable-for-outliers-in-excel-with-one-function/ | |
*/ | |
OUTLIER.THRESHOLDS =LAMBDA(data,std_devs, | |
LET( | |
_data,FILTER(data,NOT(ISERROR(data))), | |
_std_devs,std_devs, | |
_mean,AVERAGE(_data), | |
_std_dev,STDEV.S(_data), | |
_lower,_mean-_std_devs*_std_dev, | |
_upper,_mean+_std_devs*_std_dev, | |
CHOOSE({1,2},_lower,_upper) | |
) | |
); | |
OUTLIER.TEST=LAMBDA(data,std_devs,[prefix], | |
LET( | |
_prefix,IF(ISOMITTED(prefix),"test",prefix), | |
_thresholds,OUTLIER.THRESHOLDS(data,std_devs), | |
_is_outlier,IFERROR(((data<INDEX(_thresholds,1,1))+(data>INDEX(_thresholds,1,2)))>0,FALSE), | |
_outlier_type,IFS( | |
data<INDEX(_thresholds,1,1),"Low", | |
data>INDEX(_thresholds,1,2),"High", | |
TRUE,"" | |
), | |
_header,_prefix & {"_data","_is_outlier","_outlier_type"}, | |
_array, | |
MAKEARRAY( | |
ROWS(data)+1, | |
3, | |
LAMBDA(r,c, | |
IF( | |
r=1,INDEX(_header,1,c), | |
CHOOSE( | |
c, | |
INDEX(data,r-1,1), | |
INDEX(_is_outlier,r-1,1), | |
INDEX(_outlier_type,r-1,1) | |
) | |
) | |
) | |
), | |
_array | |
) | |
); | |
OUTLIER.TESTS =LAMBDA(data,std_devs,[transforms], | |
LET( | |
_data,SORT(data), | |
_std_devs,std_devs, | |
_available,{"sqrt","ln","log10"}, | |
_transforms,LET( | |
t,IF(ISOMITTED(transforms),_available,transforms), | |
FILTER(t,(t="sqrt")+(t="ln")+(t="log10")) | |
), | |
_do,IFERROR(XMATCH(_available,_transforms)>0,FALSE), | |
_transformed,CHOOSE({1,2,3},SQRT(_data),LN(_data),LOG(_data,10)), | |
_do_transformed,FILTER(_transformed,_do), | |
_test,LAMBDA(x,y,z,LAMBDA(OUTLIER.TEST(x,y,z))), | |
_tests,MAKEARRAY( | |
1, | |
COLUMNS(_do_transformed), | |
LAMBDA(r,c, | |
_test(INDEX(_do_transformed,,c),_std_devs,INDEX(_transforms,1,c)) | |
) | |
), | |
_cols,1+COLUMNS(_tests)*3, | |
_hdata,MAKEARRAY(ROWS(_data)+1,1,LAMBDA(r,c,IF(r=1,"original_data",INDEX(_data,r-1,1)))), | |
_t1,INDEX(_tests,1,1)(), | |
_t2,INDEX(_tests,1,2)(), | |
_t3,INDEX(_tests,1,3)(), | |
_array, | |
CHOOSE( | |
SEQUENCE(1,_cols), | |
_hdata, | |
INDEX(_t1,,1), | |
INDEX(_t1,,2), | |
INDEX(_t1,,3), | |
INDEX(_t2,,1), | |
INDEX(_t2,,2), | |
INDEX(_t2,,3), | |
INDEX(_t3,,1), | |
INDEX(_t3,,2), | |
INDEX(_t3,,3) | |
), | |
_array | |
) | |
); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment