Skip to content

Instantly share code, notes, and snippets.

@conorposullivan
Last active April 23, 2023 16:00
Show Gist options
  • Save conorposullivan/7b40c9a84605ed9447ba520a5695a4bd to your computer and use it in GitHub Desktop.
Save conorposullivan/7b40c9a84605ed9447ba520a5695a4bd to your computer and use it in GitHub Desktop.
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
# Load the iris dataset and choose one scatter color per target class.
iris = load_iris()
colors = ["blue", "red", "green"]

# One row per sample: the feature columns followed by the numeric class id.
df = DataFrame(
    data=np.c_[iris["data"], iris["target"]],
    columns=iris["feature_names"] + ["target"],
)

# Replace the numeric class ids (0, 1, 2) with the class names.
target_mapping = dict(enumerate(iris["target_names"]))
df["target"] = df["target"].map(target_mapping)

X = df[iris["feature_names"]].to_numpy()
labels = df[["target"]].to_numpy()

# Per-class box plot of every feature.
df.boxplot(by="target", layout=(2, 2), figsize=(10, 10))
plt.savefig("boxplot.png")

# PCA via eigendecomposition of the covariance matrix of the standardized
# features (columns are centered by the scaler, so X.T @ X / (n - 1) is the
# sample covariance).
X = StandardScaler().fit_transform(X)
cov = (X.T @ X) / (X.shape[0] - 1)
eig_values, eig_vectors = np.linalg.eig(cov)

# np.linalg.eig returns eigenvalues in no particular order, so sort them in
# descending order and reorder the eigenvector columns to match.
idx = np.argsort(eig_values)[::-1]
sorted_eig_values = eig_values[idx]
sorted_eig_vectors = eig_vectors[:, idx]

# Cumulative fraction of the total variance explained by the first k components.
cumsum = np.cumsum(sorted_eig_values) / np.sum(sorted_eig_values)
xint = range(1, len(cumsum) + 1)

plt.clf()
plt.plot(xint, cumsum)
plt.xlabel("Number of components")
plt.ylabel("Cumulative explained variance")
plt.xticks(xint)
# xlim takes only (left, right); the original passed a stray third positional
# argument, which is not a step and is rejected by recent matplotlib releases.
plt.xlim(1, len(cumsum))
plt.savefig("eig_cumsum.png")

# Project the standardized data onto the first two principal components.
eig_scores = np.dot(X, sorted_eig_vectors[:, :2])
plt.clf()
def biplot(score, coeff, labels):
    """Draw a PCA biplot on the current matplotlib figure.

    Scatter-plots the first two columns of ``score``, one color and legend
    entry per iris class, then draws an arrow from the origin for every row
    of ``coeff`` and annotates it with the matching entry of ``labels``.

    Relies on the module-level ``df``, ``iris`` and ``colors``: row ``j`` of
    ``score`` is assigned the class stored in row ``j`` of ``df["target"]``,
    so ``score`` is expected to have one row per row of ``df``.

    Args:
        score: 2-D array of projected samples; columns 0 and 1 are plotted.
        coeff: 2-D array of loadings; columns 0 and 1 of each row give the
            arrow for one original feature.
        labels: Sequence of text labels, one per row of ``coeff``.
    """
    targets = df["target"]

    for i, name in enumerate(iris["target_names"]):
        # Build the class mask once instead of re-listing the whole target
        # column for every sample.
        mask = (targets == name).to_numpy()
        plt.scatter(score[mask, 0], score[mask, 1], c=colors[i], label=name)

    for i in range(coeff.shape[0]):
        dx, dy = coeff[i, 0], coeff[i, 1]
        plt.arrow(0, 0, dx, dy, color="r", head_width=0.05, head_length=0.1)
        # Place the label slightly beyond the arrow tip.
        plt.text(
            dx * 1.35,
            dy * 1.35,
            labels[i],
            color="g",
            ha="center",
            va="center",
        )

    plt.xlabel("PC1")
    plt.ylabel("PC2")
    # Explicit True: a bare plt.grid() toggles, so the outcome would depend on
    # whether the caller had already enabled the grid.
    plt.grid(True)
# Biplot from the eigendecomposition results.
biplot(eig_scores, sorted_eig_vectors, iris["feature_names"])
plt.legend()
plt.savefig("biplot_eig.png")

# Same analysis via SVD of the standardized data: the rows of vh are the
# principal directions, so vh.T plays the role of the eigenvector matrix.
u, s, vh = np.linalg.svd(X)
plt.clf()
svd_scores = np.dot(X, vh.T[:, :2])
# biplot() sets the axis labels and the grid itself. The extra plt.grid()
# that used to precede this call was toggled back off by the one inside
# biplot(), leaving this figure without a grid.
biplot(svd_scores, vh.T, iris["feature_names"])
plt.legend()
plt.savefig("biplot_svd.png")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment