Created
December 31, 2015 20:06
-
-
Save leggitta/2980f835fc62dedc99c5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from scipy.stats import t, zscore | |
def grubbs(X, test='two-tailed', alpha=0.05): | |
''' | |
Performs Grubbs' test for outliers recursively until the null hypothesis is | |
true. | |
Parameters | |
---------- | |
X : ndarray | |
A numpy array to be tested for outliers. | |
test : str | |
Describes the types of outliers to look for. Can be 'min' (look for | |
small outliers), 'max' (look for large outliers), or 'two-tailed' (look | |
for both). | |
alpha : float | |
The significance level. | |
Returns | |
------- | |
X : ndarray | |
The original array with outliers removed. | |
outliers : ndarray | |
An array of outliers. | |
''' | |
Z = zscore(X, ddof=1) # Z-score | |
N = len(X) # number of samples | |
# calculate extreme index and the critical t value based on the test | |
if test == 'two-tailed': | |
extreme_ix = lambda Z: np.abs(Z).argmax() | |
t_crit = lambda N: t.isf(alpha / (2.*N), N-2) | |
elif test == 'max': | |
extreme_ix = lambda Z: Z.argmax() | |
t_crit = lambda N: t.isf(alpha / N, N-2) | |
elif test == 'min': | |
extreme_ix = lambda Z: Z.argmin() | |
t_crit = lambda N: t.isf(alpha / N, N-2) | |
else: | |
raise ValueError("Test must be 'min', 'max', or 'two-tailed'") | |
# compute the threshold | |
thresh = lambda N: (N - 1.) / np.sqrt(N) * \ | |
np.sqrt(t_crit(N)**2 / (N - 2 + t_crit(N)**2)) | |
# create array to store outliers | |
outliers = np.array([]) | |
# loop throught the array and remove any outliers | |
while abs(Z[extreme_ix(Z)]) > thresh(N): | |
# update the outliers | |
outliers = np.r_[outliers, X[extreme_ix(Z)]] | |
# remove outlier from array | |
X = np.delete(X, extreme_ix(Z)) | |
# repeat Z score | |
Z = zscore(X, ddof=1) | |
N = len(X) | |
return X, outliers | |
if __name__ == "__main__": | |
# setup some test arrays | |
X = np.arange(-5, 6) | |
X1 = np.r_[X, 100] | |
X2 = np.r_[X, -100] | |
# test the two-tailed case | |
Y, out = grubbs(X1) | |
assert out == 100 | |
Y, out = grubbs(X2) | |
assert out == -100 | |
# test the max case | |
Y, out = grubbs(X1, test='max') | |
assert out == 100 | |
Y, out = grubbs(X2, test='max') | |
assert len(out) == 0 | |
# test the min case | |
Y, out = grubbs(X1, test='min') | |
assert len(out) == 0 | |
Y, out = grubbs(X2, test='min') | |
assert out == -100 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Where can I get the output without the outliers?