Helper file for the second programming assignment
import random
import warnings
from collections import namedtuple
from math import ceil, floor

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import cm, gridspec
from matplotlib.colors import ListedColormap
from scipy.stats import multivariate_normal
from sklearn.datasets import make_blobs
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.mixture import GaussianMixture
# NOTE: pandas and GaussianMixture are not used in this file itself.
RANDOM_SEED = 1234
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

Point = namedtuple('Point', ['x', 'y'])

grid_region_min = -8
grid_region_max = 8
sup_title = "Problem Setup"
x_label = "x1"
y_label = "x2"
z_label = "Probability"
fig_title_2D = "Distributions in feature space (2D)"
fig_title_3D = "Distributions in feature space (3D)"
def assert_between(p):
    """Clip a Point so that both coordinates lie inside the plotting grid."""
    if not (grid_region_min < p.x < grid_region_max):
        warnings.warn("Values must be inside the grid, i.e. between ({}, {})! They have been clipped.".format(grid_region_min, grid_region_max))
        p = Point(grid_region_min if p.x < 0 else grid_region_max, p.y)
    if not (grid_region_min < p.y < grid_region_max):
        warnings.warn("Values must be inside the grid, i.e. between ({}, {})! They have been clipped.".format(grid_region_min, grid_region_max))
        p = Point(p.x, grid_region_min if p.y < 0 else grid_region_max)
    return p
def generate_data_space():
    """Build a 1000 x 1000 evaluation grid covering the plotting region."""
    x = np.linspace(grid_region_min, grid_region_max, 1000)
    y = np.linspace(grid_region_min, grid_region_max, 1000)
    xx, yy = np.meshgrid(x, y)
    pos = np.dstack((xx, yy))
    return x, y, xx, yy, pos
def problem_1_data(scaling_factor, pos):
    # Problem 1: one Gaussian per class, each scaled by its entry in scaling_factor.
    mu1 = np.array([-2, -2])
    mu3 = np.array([2, 2])
    sigma = np.array([3, 3])  # shared diagonal covariance (variance 3 in each dimension)
    z1 = multivariate_normal(mu1, sigma).pdf(pos) * scaling_factor[0]
    z2 = multivariate_normal(mu3, sigma).pdf(pos) * scaling_factor[1]
    peak1, peak2 = np.max(z1), np.max(z2)
    # The empty list keeps the returned means aligned with problem_2_data's (mu1, mu2, mu3).
    return z1, z2, (mu1, [], mu3), sigma, peak1, peak2
def problem_2_data(scaling_factor, pos):
    # Problem 2: class 0 is a two-component Gaussian mixture, class 1 a single Gaussian.
    # Class 0 mixture
    mu1 = np.array([0, -3.5])
    mu2 = np.array([-3.5, 0])
    sigma = np.array([3, 3])
    n_components = 2
    weights = np.ones(n_components, dtype=np.float32) / n_components  # equal mixture weights
    z01 = multivariate_normal(mu1, sigma).pdf(pos) * (scaling_factor[0] * weights[0])
    z02 = multivariate_normal(mu2, sigma).pdf(pos) * (scaling_factor[0] * weights[1])
    z1 = z01 + z02
    # Class 1 Gaussian
    mu3 = np.array([1.5, 1.5])
    sigma = np.array([3, 3])
    z2 = multivariate_normal(mu3, sigma).pdf(pos) * scaling_factor[1]
    peak1, peak2 = np.max(z1), np.max(z2)
    return z1, z2, (mu1, mu2, mu3), sigma, peak1, peak2
def plot_feature_space(p1=None, p2=None, figsize=(10, 4), scaling_factor=(1, 1), D3=False,
                       optimal_boundary=False, cost_function=None, problem_v=1):
    decision_boundary = False
    if p1 and p2:
        decision_boundary = True
        if not ((isinstance(p1, tuple) and len(p1) == 2) and (isinstance(p2, tuple) and len(p2) == 2)):
            raise ValueError("p1 and p2 must each be a Point or a tuple of length 2")
        p1 = assert_between(Point(p1[0], p1[1]))
        p2 = assert_between(Point(p2[0], p2[1]))
    x, y, xx, yy, pos = generate_data_space()
    if problem_v == 1:
        data_function = problem_1_data
    else:
        data_function = problem_2_data
    z1, z2, mu, sigma, peak1, peak2 = data_function(scaling_factor, pos)
    fig = plt.figure(figsize=figsize)
    plt.suptitle(sup_title)
    gs = gridspec.GridSpec(1, 2)
    # Plotting in 3D
    if D3:
        ax1 = fig.add_subplot(gs[0], projection='3d')
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            ax1.plot_surface(xx, yy, z1, cmap=cm.coolwarm, linewidth=0, antialiased=False,
                             alpha=0.5, vmin=np.nanmin(z1), vmax=np.nanmax(z1))
            ax1.plot_surface(xx, yy, z2, cmap=cm.Spectral, linewidth=0, antialiased=False,
                             alpha=0.5, vmin=np.nanmin(z2), vmax=np.nanmax(z2))
        mu1 = mu[0]
        mu2 = mu[-1]
        ax1.text(mu1[0], mu1[1], peak1, "class 0", (1, 1, 0))
        ax1.text(mu2[0], mu2[1], peak2, "class 1", (1, 1, 0))
        ax1.set_title(fig_title_3D)
        ax1.set_xlabel(x_label)
        ax1.set_ylabel(y_label)
        ax1.set_zlabel(z_label)
        # Plotting in 2D
        ax2 = fig.add_subplot(gs[1])
    else:
        ax2 = fig.add_subplot(gs[0])
    ax2.contour(xx, yy, z1, cmap=cm.coolwarm)
    ax2.contour(xx, yy, z2, cmap=cm.Spectral)
    if decision_boundary:
        # Determine which side of the line through p1 and p2 each grid point
        # falls on (sign of the 2D cross product) and shade the two regions.
        r1, r2 = xx.flatten(), yy.flatten()
        r1, r2 = r1.reshape((len(r1), 1)), r2.reshape((len(r2), 1))
        grid = np.hstack((r1, r2))
        yhat = np.where(((p2.x - p1.x) * (grid[:, 1] - p1.y) - (p2.y - p1.y) * (grid[:, 0] - p1.x)) > 0, 1, 0)
        zz = yhat.reshape(xx.shape)
        ax2.contourf(xx, yy, zz, cmap=ListedColormap(['tab:blue', 'tab:orange']), alpha=0.2)
        ax2.axline((p1.x, p1.y), (p2.x, p2.y), c="k")
        ax2.scatter(p1.x, p1.y, marker='X', s=500, c='k')
        ax2.scatter(p2.x, p2.y, marker='X', s=500, c='k')
    mu1 = mu[0]
    mu2 = mu[-1]
    ax2.text(mu1[0], mu1[1], "0", color="b")
    ax2.text(mu2[0], mu2[1], "1", color="k")
    ax2.set_title(fig_title_2D)
    ax2.set_xlabel(x_label)
    ax2.set_ylabel(y_label)
    # Monte Carlo estimate of the metrics induced by the chosen boundary
    if decision_boundary:
        n = 5000
        r1 = scaling_factor[0] / sum(scaling_factor)
        r2 = scaling_factor[1] / sum(scaling_factor)
        if problem_v == 1:
            mu1, mu2 = mu[0], mu[-1]
            X, y = make_blobs(n_samples=[floor(n * r1), ceil(n * r2)], centers=[mu1, mu2],
                              cluster_std=sigma, random_state=RANDOM_SEED)
        else:
            mu1, mu2, mu3 = mu[0], mu[1], mu[-1]
            X, y = make_blobs(n_samples=[floor(n * r1 * 0.5), ceil(n * r1 * 0.5), n - floor(n * r1 * 0.5) - ceil(n * r1 * 0.5)],
                              centers=np.array([mu1, mu2, mu3]), cluster_std=[sigma, sigma, sigma],
                              random_state=RANDOM_SEED)
            y[y == 1] = 0  # the second Gaussian also belongs to class 0
            y[y == 2] = 1  # the third Gaussian is class 1
        X, y = monte_carlo_estimates_decision_boundary(X, y, p1, p2, cost_function)
    if optimal_boundary:
        # Overlay the boundary where the scaled class-1 density exceeds class 0
        boundary = np.where(z2 >= z1, 1, 0)
        ax2.contour(xx, yy, boundary, colors='r')
        ax2.contourf(xx, yy, boundary, cmap=ListedColormap(['tab:green', 'tab:red']), alpha=0.1)
    plt.show()
def print_metrices(y, prediction):
    # Print a one-line summary of standard classification metrics.
    metrics = {
        "accuracy": round(accuracy_score(y, prediction), 4),
        "precision": round(precision_score(y, prediction), 4),
        "recall": round(recall_score(y, prediction), 4),
        "F1": round(f1_score(y, prediction, average="micro"), 4),
        "macro F1": round(f1_score(y, prediction, average="macro"), 4),
    }
    for k, v in metrics.items():
        print(f"{k}: {v}", end=' | ')
    print()
def monte_carlo_estimates_decision_boundary(X, y, p1, p2, cost_function=None, n=5000):
    # Classify each sampled point by which side of the line through p1 and p2 it
    # falls on, then report metrics and, if a cost function is given, the cost of
    # the resulting confusion matrix.
    prediction = np.where(((p2.x - p1.x) * (X[:, 1] - p1.y) - (p2.y - p1.y) * (X[:, 0] - p1.x)) > 0, 1, 0)
    TN, FP, FN, TP = confusion_matrix(y, prediction).ravel()
    print_metrices(y, prediction)
    if cost_function:
        cost = cost_function(TN, FP, FN, TP)
        print(f'Cost: {cost:,.3f}')
    return X, y
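# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original helper): how a notebook
# importing this file might call plot_feature_space. The boundary points,
# scaling factors and the example_cost function below are illustrative
# assumptions, not values prescribed by the assignment.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    def example_cost(TN, FP, FN, TP):
        # Assumed cost structure: a false negative is ten times as costly as a
        # false positive; correct decisions cost nothing.
        return 10 * FN + 1 * FP

    # Problem 1: equal class weights, candidate linear boundary through
    # (-4, 4) and (4, -4), with the density-ratio boundary overlaid in red.
    plot_feature_space(p1=(-4, 4), p2=(4, -4), scaling_factor=(1, 1), D3=True,
                       optimal_boundary=True, cost_function=example_cost, problem_v=1)

    # Problem 2: class 0 weighted twice as heavily as class 1.
    plot_feature_space(p1=(-4, 4), p2=(4, -4), scaling_factor=(2, 1), D3=False,
                       optimal_boundary=True, cost_function=example_cost, problem_v=2)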