maria-aguilera · November 29, 2022 14:59
diff --git a/outlier_detect_1.py b/outlier_detect_1.py
 from sklearn.datasets import load_wine
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 # Load the wine dataset once (the original called load_wine() twice) and wrap
 # its feature matrix in a DataFrame labelled with the feature names.
 wine = load_wine()
 data = pd.DataFrame(wine["data"], columns=wine["feature_names"])
 data.head()  # notebook-style preview; displays nothing when run as a script
diff --git a/outlier_detect_2.py b/outlier_detect_2.py
 # One box plot per feature, laid out on a 7x2 grid of subplots.
 data.plot(
     kind="box",
     subplots=True,
     layout=(7, 2),
     figsize=(15, 20),
 )
diff --git a/outlier_detect_3.py b/outlier_detect_3.py
 def iqr_outlier(x, factor):
     """Flag the values of a Series that fall outside the IQR fences.

     A value is an outlier when it is below ``Q1 - factor * IQR`` or above
     ``Q3 + factor * IQR``, where ``IQR = Q3 - Q1``.

     Args:
         x: pandas Series of numeric values.
         factor: multiplier applied to the IQR to set the fence width
             (e.g. 1.5).

     Returns:
         Integer Series indexed like ``x``: 1 marks an outlier, 0 an inlier.
     """
     q1 = x.quantile(0.25)
     q3 = x.quantile(0.75)
     iqr = q3 - q1
     lower_fence = q1 - factor * iqr
     upper_fence = q3 + factor * iqr
     # Build the flags on x's own index: a default RangeIndex cannot be
     # aligned with the boolean mask when x carries any other index.
     flags = pd.Series(0, index=x.index)
     flags[(x < lower_fence) | (x > upper_fence)] = 1
     return flags
 # Scatter plot of every feature; points flagged by the IQR method are drawn
 # in red and annotated with their value.
 n_grid_cols = 2
 # Ceiling division: enough rows to give every feature its own panel.
 n_grid_rows = (len(data.columns) + n_grid_cols - 1) // n_grid_cols
 fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 30), squeeze=False)
 panels = ax.ravel()
 for panel, feature in zip(panels, data.columns):
     values = data[feature]
     positions = np.arange(len(values))
     outliers = iqr_outlier(values, 1.5)
     flagged = (outliers == 1).to_numpy()
     has_outliers = flagged.any()

     if has_outliers:
         sns.scatterplot(x=positions, y=values, ax=panel, hue=outliers, palette=['green', 'red'])
     else:
         sns.scatterplot(x=positions, y=values, ax=panel, legend=False, color='green')
     # Label each outlier with its value at the plotted position.
     for x_pos, y_val in zip(positions[flagged], values[flagged]):
         panel.text(x=x_pos, y=y_val, s=y_val, fontsize=8)
     panel.set_ylabel("")
     panel.set_title(feature)
     panel.xaxis.set_visible(False)
     if has_outliers:
         panel.legend(ncol=2)
 # Hide every grid cell that did not receive a feature (the original switched
 # off a single hand-computed cell, which is out of range for an even count).
 for spare_panel in panels[len(data.columns):]:
     spare_panel.axis('off')
 plt.show()
diff --git a/outlier_detect_4.py b/outlier_detect_4.py
 def zscore_outlier(x, lb, ub):
     """Flag the values of a Series whose z-score lies outside ``[lb, ub]``.

     The z-score is ``(x - x.mean()) / x.std()``.

     Args:
         x: pandas Series of numeric values.
         lb: lower z-score bound; values with a z-score below it are flagged.
         ub: upper z-score bound; values with a z-score above it are flagged.

     Returns:
         Integer Series indexed like ``x``: 1 marks an outlier, 0 an inlier.
     """
     zscore = (x - x.mean()) / x.std()
     # Build the flags on x's own index: a default RangeIndex cannot be
     # aligned with the boolean mask when x carries any other index.
     flags = pd.Series(0, index=x.index)
     flags[(zscore < lb) | (zscore > ub)] = 1
     return flags
 # Scatter plot of every feature; points whose z-score falls outside [-3, 3]
 # are drawn in red and annotated with their value.
 n_grid_cols = 2
 # Ceiling division: enough rows to give every feature its own panel.
 n_grid_rows = (len(data.columns) + n_grid_cols - 1) // n_grid_cols
 fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 30), squeeze=False)
 panels = ax.ravel()
 for panel, feature in zip(panels, data.columns):
     values = data[feature]
     positions = np.arange(len(values))
     outliers = zscore_outlier(values, -3, 3)
     flagged = (outliers == 1).to_numpy()
     has_outliers = flagged.any()

     if has_outliers:
         sns.scatterplot(x=positions, y=values, ax=panel, hue=outliers, palette=['green', 'red'])
     else:
         sns.scatterplot(x=positions, y=values, ax=panel, legend=False, color='green')
     # Label each outlier with its value at the plotted position.
     for x_pos, y_val in zip(positions[flagged], values[flagged]):
         panel.text(x=x_pos, y=y_val, s=y_val, fontsize=8)
     panel.set_ylabel("")
     panel.set_title(feature)
     panel.xaxis.set_visible(False)
     if has_outliers:
         panel.legend(ncol=2)
 # Hide every grid cell that did not receive a feature (the original switched
 # off a single hand-computed cell, which is out of range for an even count).
 for spare_panel in panels[len(data.columns):]:
     spare_panel.axis('off')
 plt.show()
diff --git a/outlier_detect_5.py b/outlier_detect_5.py
 def euclidean_distance_outlier(x, cutoff):
     """Flag rows whose distance from the column means is unusually large.

     Each row's Euclidean distance to the vector of column means is computed,
     the distances are converted to absolute z-scores, and rows whose
     z-score exceeds ``cutoff`` are flagged.

     Args:
         x: pandas DataFrame of numeric columns, one row per observation.
         cutoff: absolute z-score of the distance above which a row is an
             outlier.

     Returns:
         Integer Series indexed like ``x``: 1 marks an outlier, 0 an inlier.
     """
     centroid = x.mean()  # per-column mean of the data
     dist = np.sqrt(np.sum((x - centroid) ** 2, axis=1))  # Euclidean distance per row
     dist_zscore = np.abs((dist - dist.mean()) / dist.std())  # z-score of the distances
     # Build the flags on x's own index: a default RangeIndex cannot be
     # aligned with the boolean mask when x carries any other index.
     flags = pd.Series(0, index=x.index)
     flags[dist_zscore > cutoff] = 1
     return flags
 # Flag points of the (malic_acid, magnesium) plane whose distance from the
 # centroid has a z-score above 3, then colour them by that flag.
 # The original referenced an undefined frame `d` and an undefined function
 # `euclidean_distance`; both now use the names actually defined.
 euc_d = data[["malic_acid", "magnesium"]].copy()
 euc_d["outlier"] = euclidean_distance_outlier(euc_d, 3)
 sns.scatterplot(x="malic_acid", y="magnesium", data=euc_d, hue="outlier", palette=["green", "red"])
	from sklearn.datasets import load_wine
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	# Load the wine dataset once (the original called load_wine() twice) and wrap
	# its feature matrix in a DataFrame labelled with the feature names.
	wine = load_wine()
	data = pd.DataFrame(wine["data"], columns=wine["feature_names"])
	data.head()  # notebook-style preview; displays nothing when run as a script
	def iqr_outlier(x, factor):
	    """Flag the values of a Series that fall outside the IQR fences.
	    A value is an outlier when it is below ``Q1 - factor * IQR`` or above
	    ``Q3 + factor * IQR``, where ``IQR = Q3 - Q1``.
	    Args:
	        x: pandas Series of numeric values.
	        factor: multiplier applied to the IQR to set the fence width
	            (e.g. 1.5).
	    Returns:
	        Integer Series indexed like ``x``: 1 marks an outlier, 0 an inlier.
	    """
	    q1 = x.quantile(0.25)
	    q3 = x.quantile(0.75)
	    iqr = q3 - q1
	    lower_fence = q1 - factor * iqr
	    upper_fence = q3 + factor * iqr
	    # Build the flags on x's own index: a default RangeIndex cannot be
	    # aligned with the boolean mask when x carries any other index.
	    flags = pd.Series(0, index=x.index)
	    flags[(x < lower_fence) | (x > upper_fence)] = 1
	    return flags
	# Scatter plot of every feature; points flagged by the IQR method are drawn
	# in red and annotated with their value.
	n_grid_cols = 2
	# Ceiling division: enough rows to give every feature its own panel.
	n_grid_rows = (len(data.columns) + n_grid_cols - 1) // n_grid_cols
	fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 30), squeeze=False)
	panels = ax.ravel()
	for panel, feature in zip(panels, data.columns):
	    values = data[feature]
	    positions = np.arange(len(values))
	    outliers = iqr_outlier(values, 1.5)
	    flagged = (outliers == 1).to_numpy()
	    has_outliers = flagged.any()
	    if has_outliers:
	        sns.scatterplot(x=positions, y=values, ax=panel, hue=outliers, palette=['green', 'red'])
	    else:
	        sns.scatterplot(x=positions, y=values, ax=panel, legend=False, color='green')
	    # Label each outlier with its value at the plotted position.
	    for x_pos, y_val in zip(positions[flagged], values[flagged]):
	        panel.text(x=x_pos, y=y_val, s=y_val, fontsize=8)
	    panel.set_ylabel("")
	    panel.set_title(feature)
	    panel.xaxis.set_visible(False)
	    if has_outliers:
	        panel.legend(ncol=2)
	# Hide every grid cell that did not receive a feature.
	for spare_panel in panels[len(data.columns):]:
	    spare_panel.axis('off')
	plt.show()
	def zscore_outlier(x, lb, ub):
	    """Flag the values of a Series whose z-score lies outside ``[lb, ub]``.
	    The z-score is ``(x - x.mean()) / x.std()``.
	    Args:
	        x: pandas Series of numeric values.
	        lb: lower z-score bound; values with a z-score below it are flagged.
	        ub: upper z-score bound; values with a z-score above it are flagged.
	    Returns:
	        Integer Series indexed like ``x``: 1 marks an outlier, 0 an inlier.
	    """
	    zscore = (x - x.mean()) / x.std()
	    # Build the flags on x's own index: a default RangeIndex cannot be
	    # aligned with the boolean mask when x carries any other index.
	    flags = pd.Series(0, index=x.index)
	    flags[(zscore < lb) | (zscore > ub)] = 1
	    return flags
	# Scatter plot of every feature; points whose z-score falls outside [-3, 3]
	# are drawn in red and annotated with their value.
	n_grid_cols = 2
	# Ceiling division: enough rows to give every feature its own panel.
	n_grid_rows = (len(data.columns) + n_grid_cols - 1) // n_grid_cols
	fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 30), squeeze=False)
	panels = ax.ravel()
	for panel, feature in zip(panels, data.columns):
	    values = data[feature]
	    positions = np.arange(len(values))
	    outliers = zscore_outlier(values, -3, 3)
	    flagged = (outliers == 1).to_numpy()
	    has_outliers = flagged.any()
	    if has_outliers:
	        sns.scatterplot(x=positions, y=values, ax=panel, hue=outliers, palette=['green', 'red'])
	    else:
	        sns.scatterplot(x=positions, y=values, ax=panel, legend=False, color='green')
	    # Label each outlier with its value at the plotted position.
	    for x_pos, y_val in zip(positions[flagged], values[flagged]):
	        panel.text(x=x_pos, y=y_val, s=y_val, fontsize=8)
	    panel.set_ylabel("")
	    panel.set_title(feature)
	    panel.xaxis.set_visible(False)
	    if has_outliers:
	        panel.legend(ncol=2)
	# Hide every grid cell that did not receive a feature.
	for spare_panel in panels[len(data.columns):]:
	    spare_panel.axis('off')
	plt.show()
	def euclidean_distance_outlier(x, cutoff):
	    """Flag rows whose distance from the column means is unusually large.
	    Each row's Euclidean distance to the vector of column means is computed,
	    the distances are converted to absolute z-scores, and rows whose
	    z-score exceeds ``cutoff`` are flagged.
	    Args:
	        x: pandas DataFrame of numeric columns, one row per observation.
	        cutoff: absolute z-score of the distance above which a row is an
	            outlier.
	    Returns:
	        Integer Series indexed like ``x``: 1 marks an outlier, 0 an inlier.
	    """
	    centroid = x.mean()  # per-column mean of the data
	    dist = np.sqrt(np.sum((x - centroid) ** 2, axis=1))  # Euclidean distance per row
	    dist_zscore = np.abs((dist - dist.mean()) / dist.std())  # z-score of the distances
	    # Build the flags on x's own index: a default RangeIndex cannot be
	    # aligned with the boolean mask when x carries any other index.
	    flags = pd.Series(0, index=x.index)
	    flags[dist_zscore > cutoff] = 1
	    return flags
	# Flag points of the (malic_acid, magnesium) plane whose distance from the
	# centroid has a z-score above 3, then colour them by that flag.
	# The original referenced an undefined frame `d` and an undefined function
	# `euclidean_distance`; both now use the names actually defined.
	euc_d = data[["malic_acid", "magnesium"]].copy()
	euc_d["outlier"] = euclidean_distance_outlier(euc_d, 3)
	sns.scatterplot(x="malic_acid", y="magnesium", data=euc_d, hue="outlier", palette=["green", "red"])