Computational Intelligence — Module 1
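k-nearest neighbours on two tasks: classification of the Iris dataset and regression on synthetic blob data, searching for the best k by brute force and plotting the resulting decision boundary.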
import numpy as np
import pandas as pd
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.datasets import make_blobs
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
import matplotlib.pyplot as plt

# Reproducible results
random_seed = 2023
np.random.seed(random_seed)

# Range of k values to try when searching for the best KNN model
max_k_to_try = 100
steps_between_k = 1

def classification():
    # Form data frame
    df_iris, feature_names = load_iris_dataframe()
    df_iris = normalize_data(df_iris, feature_names)
    df_iris = shuffle_data(df_iris)
    # Split data
    x_train, x_test, y_train, y_test = train_test_split(df_iris[feature_names], df_iris['label'],
                                                        random_state=random_seed, train_size=0.75)
    # Scale for better accuracy
    x_train, x_test = scale_data(x_train, x_test)
    # Train KNN classifier
    k_best, score_best, _ = find_best_k(x_train, x_test, y_train, y_test, True)
    print('Classifier: The best k = {}, score = {}'.format(k_best, score_best))

def regression():
    # Form data frame
    df_random, feature_names = generate_random_dataframe()
    df_random = normalize_data(df_random, feature_names)
    # Split data
    x_train, x_test, y_train, y_test = train_test_split(df_random[feature_names], df_random['label'],
                                                        random_state=random_seed, train_size=0.75)
    # Scale for better accuracy
    x_train, x_test = scale_data(x_train, x_test)
    # Train KNN regression
    k_best, score_best, knnr = find_best_k(x_train, x_test, y_train, y_test, False)
    print('Regression: The best k = {}, score = {}'.format(k_best, score_best))
    # Visualize decision boundary
    # Find min-max of coordinates
    x_min, x_max = x_train[:, 0].min() - 0.1, x_train[:, 0].max() + 0.1
    y_min, y_max = x_train[:, 1].min() - 0.1, x_train[:, 1].max() + 0.1
    # Make a grid using those coordinates
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
    # Obtain predicted values of each point
    x_input = np.c_[xx.ravel(), yy.ravel()]
    y_pred = knnr.predict(x_input)
    # Round the continuous regression output back to class-like levels and
    # apply the same shape for y (required for the contour function)
    y_pred = np.round(y_pred).reshape(xx.shape)
    # Plot the decision boundary
    cmap_bold = ListedColormap(['blue', '#FFFF00', 'black', 'green'])
    plt.figure()
    plt.contourf(xx, yy, y_pred, cmap=cmap_bold, alpha=0.7)
    plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train, s=40, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.show()

def load_iris_dataframe():
    # Load iris dataset
    iris = datasets.load_iris()
    x_d, y_d, labels, feature_names = iris.data, iris.target, iris.target_names, iris.feature_names
    # Form data frame
    df_iris = pd.DataFrame(x_d, columns=feature_names)
    df_iris['label'] = y_d
    # Map numeric labels to the human-readable species names
    features_dict = {k: v for k, v in enumerate(labels)}
    df_iris['label_names'] = df_iris.label.apply(lambda x: features_dict[x])
    return df_iris, feature_names

def generate_random_dataframe():
    feature_names = ['x0', 'x1']
    x_d, y_d = make_blobs(n_samples=1000, n_features=len(feature_names), centers=8, cluster_std=1.3,
                          random_state=random_seed)
    # Fold the 8 blob centers into two interleaved classes
    y_d = y_d % 2
    plt.figure()
    plt.title('Sample binary classification problem with non-linearly separable classes')
    cmap_bold = ListedColormap(['blue', '#FFFF00', 'black', 'green'])
    plt.scatter(x_d[:, 0], x_d[:, 1], c=y_d,
                marker='o', s=30, cmap=cmap_bold)
    df_random = pd.DataFrame(x_d, columns=feature_names)
    df_random['label'] = y_d
    return df_random, feature_names

def normalize_data(dataframe, feature_names):
    min_max_scaler = MinMaxScaler()
    dataframe[feature_names] = min_max_scaler.fit_transform(dataframe[feature_names])
    return dataframe

def shuffle_data(dataframe):
    dataframe = shuffle(dataframe, random_state=random_seed)
    return dataframe

def scale_data(x_train, x_test):
    # Fit the scaler on the training set only, then reuse it for the test set
    # so that test statistics do not leak into training
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    return x_train, x_test

def find_best_k(x_train, x_test, y_train, y_test, use_classifier):
    k_best = 1
    score_best = 0
    knn = None
    for k in range(1, max_k_to_try, steps_between_k):
        knn = KNeighborsClassifier(n_neighbors=k) if use_classifier else KNeighborsRegressor(n_neighbors=k)
        knn = knn.fit(x_train, y_train)
        score = knn.score(x_test, y_test)
        if score > score_best:
            k_best = k
            score_best = score
        # Stop early once a perfect score is reached
        if score == 1:
            break
    return k_best, score_best, knn

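# A possible alternative (a sketch, not wired into the tasks below): the same k
# search can be run with sklearn's GridSearchCV, which scores by cross-validation
# on the training set instead of against the held-out test set. The function
# name and the cv=5 fold count are assumptions, not part of the original gist.
def find_best_k_cv(x_train, y_train, use_classifier):
    from sklearn.model_selection import GridSearchCV
    estimator = KNeighborsClassifier() if use_classifier else KNeighborsRegressor()
    param_grid = {'n_neighbors': list(range(1, max_k_to_try, steps_between_k))}
    search = GridSearchCV(estimator, param_grid, cv=5)
    search.fit(x_train, y_train)
    # best_estimator_ is refit on the full training set by default
    return search.best_params_['n_neighbors'], search.best_score_, search.best_estimator_
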
# Task 1
classification()
# Task 2
regression()
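# Running the script: each task prints the best k with its test score, and the
# regression task additionally shows the generated blobs and the rounded
# decision surface of the fitted model.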