Created
November 15, 2021 11:46
-
-
Save jamal919/f80ae4010dc866f41144482fa06eade8 to your computer and use it in GitHub Desktop.
Outlier detection based on modified tau
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Outlier detection based on modified tau

Tabulates the modified Thompson tau rejection coefficient for sample sizes
3..20 and then applies the test to a small worked example.

author: khan
"""
import numpy as np
from scipy.stats import t


def modified_thompson_tau(n, alpha=0.05):
    """Return the critical t value and the modified Thompson tau coefficient.

    Parameters
    ----------
    n : int or array-like of int
        Sample size(s). Each must be at least 3, because the t-distribution
        is evaluated with ``n - 2`` degrees of freedom.
    alpha : float, optional
        Two-sided significance level (0.05 corresponds to 95 percent
        confidence).

    Returns
    -------
    tuple
        ``(t_alpha, tau)`` where ``t_alpha = t.ppf(1 - alpha/2, n - 2)`` and
        ``tau = t_alpha*(n - 1) / (sqrt(n)*sqrt(n - 2 + t_alpha**2))``.
        Both have the same shape as ``n``.

    Raises
    ------
    ValueError
        If any sample size is smaller than 3.
    """
    if np.any(np.asarray(n) < 3):
        raise ValueError('modified Thompson tau needs a sample size of at least 3')
    t_crit = t.ppf(1 - alpha / 2, n - 2)  # n - 2 degrees of freedom
    tau_coef = t_crit * (n - 1) / (np.sqrt(n) * np.sqrt(n - 2 + t_crit**2))
    return t_crit, tau_coef


# tau table
nsample = np.arange(3, 21)  # depending on sample size, t-distribution will vary
alpha = 0.05  # 95 percent confidence interval
# Critical t value and threshold for the modified thompson tau, per sample size
t_alpha, threshold = modified_thompson_tau(nsample, alpha)
print('nsample, t_alpha, threshold')
print(np.array([nsample, t_alpha, threshold]).T)

# Test
# Based on the example shown in https://www.statisticshowto.com/modified-thompson-tau-test/
samples = np.array([489, 490, 490, 491, 494, 499, 499, 500, 501, 505])
print(samples)
nsample = len(samples)
mu = np.mean(samples)
std = np.std(samples, ddof=1)  # sample standard deviation (n - 1 denominator)
print(f'sample mean: {mu}, std: {std}')

# Only the extremes can be the most deviant point, so test min and max only.
min_max = [np.min(samples), np.max(samples)]
imin_max = [np.argmin(samples), np.argmax(samples)]
print('min and max', min_max)
print('min and max location', imin_max)

# Absolute deviation of each extreme from the mean; the larger one is the suspect.
delta_min_max = np.abs(np.array(min_max)-mu)
print(delta_min_max)
selected_point = np.max(delta_min_max)
iselected_point = np.argmax(delta_min_max)  # 0 -> minimum, 1 -> maximum
print(f'index of selected point {iselected_point}, delta vaule {selected_point:.2f}, corresponding sample {samples[imin_max[iselected_point]]}')

# Rejection threshold for this sample size: tau scaled by the sample std.
t_alpha, tau = modified_thompson_tau(nsample, alpha)
print('tau', tau)
threshold = tau*std
print('threshold', threshold)

if selected_point < threshold:
    print(f'deviation of the selected point {selected_point:.3f} is smaller than threshold {threshold:.3f}. Not an outlier.')
else:
    print(f'deviation of the selected point {selected_point:.3f} is greater than threshold {threshold:.3f}. Outlier!')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment