Last active
April 9, 2021 17:57
-
-
Save Habush/23d8e0b3673eb6aff40b00ddd81b52fb to your computer and use it in GitHub Desktop.
Plot the embedding of SVD components and their projections in the same subspace. Inspired by https://europepmc.org/article/pmc/pmc5054124
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def plot_emb_projection(X, y=None, ker=tanimoto_v2, alpha=0.5, params=None, annotate=False):
    """
    Plot the rows and the columns of X in one shared kernel-PCA embedding.

    X is factorised by SVD as ``U S V^T``. The rows are represented by
    ``P = U S^alpha`` and the columns by ``G = V S^(1 - alpha)``. Kernel PCA is
    fitted on G and P is then projected onto the same components, so both
    point clouds can be drawn on one pair of axes.

    :param X: the data matrix or dataframe; anything that is not a DataFrame is
        wrapped in one so that row/column labels are available
    :param y: the target variable (for labelling). It is joined onto the row
        projections by index and must provide a ``posOutcome`` column; only
        rows with ``posOutcome`` equal to 0 or 1 are drawn
    :param ker: the kernel to use. A callable is evaluated through
        ``kernel_func`` and fed to KernelPCA as a precomputed kernel; anything
        else is passed to KernelPCA as its ``kernel`` argument
    :param alpha: the exponent to use for matrix factorization (how the
        singular values are split between the row and column factors)
    :param params: extra keyword arguments for KernelPCA, used only when
        ``ker`` is not callable
    :param annotate: if True, label every column point with its column name
    :return: ``(G_pca_df, P_pca_df)`` - the projections of the columns and of
        the rows (in that order). When ``y`` is given, ``P_pca_df`` also
        carries the joined columns of ``y``
    """
    # The docstring promises "matrix or dataframe", but the labelling below
    # needs .index / .columns.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    # Do SVD decomposition
    u, s, v_t = scipy.linalg.svd(X, full_matrices=False)
    # Raise the singular values themselves to the power and scale the columns.
    # Taking np.power of the dense diagonal matrix is wrong for a zero
    # exponent (alpha of 0 or 1): every off-diagonal 0 becomes 0**0 == 1.
    P = u * np.power(s, alpha)
    G = v_t.T * np.power(s, 1 - alpha)

    if callable(ker):
        # Kernel matrix of the column factors against themselves
        K = kernel_func(G, G, ker=ker)
        # Kernel of the row factors against the column factors, so the rows
        # can be projected onto the components fitted on G
        K_p = kernel_func(P, G, ker=ker)
        kpca = KernelPCA(kernel="precomputed")
        G_pca = kpca.fit_transform(K)
        P_pca = kpca.transform(K_p)
    else:
        # params defaults to None, which cannot be unpacked with **
        kpca = KernelPCA(kernel=ker, **(params or {}))
        G_pca = kpca.fit_transform(G)
        P_pca = kpca.transform(P)

    # Plot the first two components of G_pca and P_pca on the same axes
    fig, ax = plt.subplots(1, 1, figsize=(16, 12))
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.grid()

    # Make dataframes for plotting
    G_pca_df = pd.DataFrame(G_pca, index=X.columns)
    P_pca_df = pd.DataFrame(P_pca, index=X.index)

    if y is not None:
        P_pca_df = P_pca_df.join(y)
        outcome = P_pca_df["posOutcome"]
        pos_0_df, pos_1_df = P_pca_df[outcome == 0], P_pca_df[outcome == 1]
        # marker=6 is matplotlib's integer code for the "caret up" marker
        ax.scatter(pos_0_df[0], pos_0_df[1], c='r', marker=6, label="0")
        ax.scatter(pos_1_df[0], pos_1_df[1], c='g', marker="+", label="1")
    else:
        ax.scatter(P_pca[:, 0], P_pca[:, 1], c='g', marker="+", label="Patients")

    ax.scatter(G_pca_df[0], G_pca_df[1], c='b', marker="x", label="GO/Pathway")

    if annotate:
        # Distinct local names: the original loop rebound the ``y`` parameter.
        for name, pc1, pc2 in zip(G_pca_df.index, G_pca_df[0], G_pca_df[1]):
            # Without xytext, "offset points" would reuse the data coordinates
            # as the offset; use a small fixed offset instead.
            ax.annotate(name, xy=(pc1, pc2), xytext=(4, 4), textcoords="offset points")

    ax.axvline(x=0)
    ax.axhline(y=0)
    ax.legend()
    return G_pca_df, P_pca_df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment