Last active
August 29, 2015 14:23
-
-
Save hauselin/35311bc9be789ee44d8f to your computer and use it in GitHub Desktop.
Two R functions to detect and remove outliers using standard-score or MAD method
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# For a tutorial, see https://rpubs.com/hauselin/outliersDetect
#clean vector using Z-score cut-off method; return a vector with outliers = replace (default is NA) | |
outliersZ <- function(data, zCutOff = 1.96, replace = NA, values = FALSE, digits = 2) { | |
#compute standard deviation (sample version n = n [not n-1]) | |
stdev <- sqrt(sum((data - mean(data, na.rm = T))^2, na.rm = T) / sum(!is.na(data))) | |
#compute absolute Z values for each value | |
absZ <- abs(data - mean(data, na.rm = T)) / stdev | |
#subset data that has absZ greater than the zCutOff and replace them with replace | |
#can also replace with other values (such as max/mean of data) | |
data[absZ > zCutOff] <- replace | |
if (values == TRUE) { | |
return(round(absZ, digits)) #to return absZ values, provide 'absZ' to argument output | |
} else { | |
return(round(data, digits)) #otherwise, return values with outliers = replace | |
} | |
} | |
#clean vector using MAD cut-off method; return a vector with outliers = replace | |
outliersMAD <- function(data, MADCutOff = 2.5, replace = NA, values = FALSE, bConstant = 1.4826, digits = 2) { | |
#compute number of absolute MADs away for each value | |
#formula: abs( ( x - median(x) ) )/ mad(x) | |
absMADAway <- abs((data - median(data, na.rm = T))/mad(data, constant = bConstant, na.rm = T)) | |
#subset data that has absMADAway greater than the MADCutOff and replace them with replace | |
#can also replace values other than replace | |
data[absMADAway > MADCutOff] <- replace | |
if (values == TRUE) { | |
return(round(absMADAway, digits)) #to return absMADAway, provide 'MADAway' to argument output | |
} else { | |
return(round(data, digits)) #otherwise, return values with outliers = replace | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment