Created
October 10, 2018 09:40
-
-
Save andersx/5c4dc80c07001c468f057c8b6411b3e1 to your computer and use it in GitHub Desktop.
Kernel PCA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from __future__ import print_function | |
| import os | |
| import numpy as np | |
| import scipy | |
| from scipy.linalg import svd | |
| from scipy.linalg import eigh | |
| import qml | |
| from qml.kernels import laplacian_kernel | |
| from qml.kernels import gaussian_kernel | |
| from qml.kernels import get_local_kernels_gaussian | |
| from qml.math import cho_solve | |
| import matplotlib | |
| import matplotlib.pyplot as plt | |
| params = {'mathtext.default': 'regular' } | |
| plt.rcParams.update(params) | |
| def stoch(atomtypes): | |
| name = "$" | |
| for atom in ["C", "N", "O", "H"]: | |
| if atom in atomtypes: | |
| n = atomtypes.count(atom) | |
| if n > 1: | |
| name += "%s_{%i}" % (atom, n) | |
| else: | |
| name += "%s" % (atom) | |
| name += "$" | |
| return name | |
| def get_energies(filename): | |
| """ Returns a dictionary with heats of formation for each xyz-file. | |
| """ | |
| f = open(filename, "r") | |
| lines = f.readlines() | |
| f.close() | |
| energies = dict() | |
| for line in lines: | |
| tokens = line.split() | |
| xyz_name = tokens[0] | |
| hof = float(tokens[1]) | |
| energies[xyz_name] = hof | |
| return energies | |
| def test_pca(): | |
| test_dir = os.path.dirname(os.path.realpath(__file__)) | |
| # Parse file containing PBE0/def2-TZVP heats of formation and xyz filenames | |
| data = get_energies(test_dir + "/data/hof_qm7.txt") | |
| # Generate a list of qml.data.Compound() objects | |
| mols = [] | |
| for xyz_file in sorted(data.keys())[:1000]: | |
| # Initialize the qml.data.Compound() objects | |
| mol = qml.data.Compound(xyz=test_dir + "/qm7/" + xyz_file) | |
| # Associate a property (heat of formation) with the object | |
| mol.properties = data[xyz_file] | |
| # This is a Molecular Coulomb matrix sorted by row norm | |
| mol.generate_bob() | |
| # mol.generate_coulomb_matrix(size=23, sorting="row-norm") | |
| # mol.generate_atomic_coulomb_matrix(size=23, sorting="row-norm") | |
| # mol.generate_fchl_representation() | |
| mols.append(mol) | |
| # Shuffle molecules | |
| np.random.seed(666) | |
| np.random.shuffle(mols) | |
| # Make training and test sets | |
| n_test = 99 | |
| n_train = 1000 | |
| training = mols[:n_train] | |
| test = mols[-n_test:] | |
| # List of representations | |
| # X = np.concatenate([mol.representation for mol in training]) | |
| X = np.array([mol.representation for mol in training]) | |
| print(X.shape) | |
| Xs = np.array([mol.representation for mol in test]) | |
| # List of properties | |
| Y = np.array([mol.properties for mol in training]) | |
| Ys = np.array([mol.properties for mol in test]) | |
| # Y = np.array([mol.properties/mol.natoms for mol in training]) | |
| # Ys = np.array([mol.properties/mol.natoms for mol in test]) | |
| N = np.array([mol.natoms for mol in training]) | |
| A = np.array([stoch(mol.atomtypes) for mol in training]) | |
| # Set hyper-parameters | |
| sigma = 26214.40 | |
| llambda = 1e-10 | |
| # Set hyper-parameters | |
| # sigma = 10**(4.2) | |
| # llambda = 10**(-10.0) | |
| # Generate training Kernel | |
| K = laplacian_kernel(X, X, sigma) | |
| # K = get_local_kernels_gaussian(X, X, N, N, [sigma])[0] | |
| # K = qml.fchl.get_local_symmetric_kernels(X, alchemy="off")[0] | |
| # K = qml.fchl.get_global_symmetric_kernels(X)[0] | |
| # K = qml.fchl.get_local_symmetric_kernels(X)[0] | |
| # np.save("K_fchl.npy", K) | |
| # K = np.load("K_fchl.npy") | |
| # Ks = qml.fchl.get_local_kernels(Xs, X)[0] | |
| # K[np.diag_indices_from(K)] += 10e-4 | |
| (w, v) = scipy.linalg.eig(K) | |
| print(v[:,:2]) | |
| print(w[:2]) | |
| X1 = np.array([np.dot(kervec, v[:,0]) for kervec in K]) | |
| X2 = np.array([np.dot(kervec, v[:,1]) for kervec in K]) | |
| X3 = np.array([np.dot(kervec, v[:,2]) for kervec in K]) | |
| # X1s = np.array([np.dot(kervec, v[:,0]) for kervec in Ks]) | |
| # X2s = np.array([np.dot(kervec, v[:,1]) for kervec in Ks]) | |
| X1 -= X1.mean() | |
| X2 -= X2.mean() | |
| X3 -= X3.mean() | |
| X1 /= X1.std() | |
| X2 /= X2.std() | |
| X3 /= X3.std() | |
| # X1s -= X1s.mean() | |
| # X2s -= X2s.mean() | |
| # X1s /= X1s.std() | |
| # X2s /= X2s.std() | |
| sort_args = np.argsort(X1) | |
| for lol in zip(X1[sort_args], X2[sort_args], N[sort_args], A[sort_args]): | |
| print(lol) | |
| z = Y | |
| # z = X3 | |
| cmap = matplotlib.cm.get_cmap('viridis') | |
| normalize = matplotlib.colors.Normalize(vmin=min(z), vmax=max(z)) | |
| colors = [cmap(normalize(value)) for value in z] | |
| fig, ax = plt.subplots(figsize=(8,4.5)) | |
| # plt.subplot(1,2,1) | |
| labels = [] | |
| for i, txt in enumerate(A): | |
| if txt not in labels: | |
| ax.annotate(txt, (X1[i], X2[i]), fontsize=6) | |
| labels.append(txt) | |
| plt.scatter(X1, X2, color=colors) | |
| # plt.subplot(1,2,2) | |
| # plt.scatter(X1s, X2s, color=colors) | |
| # plt.title("FCHL kernel PCA") | |
| # plt.title("Local Coulomb matrix kernel PCA") | |
| # plt.title("Coulomb matrix kernel PCA") | |
| plt.title("Bag-of-Bonds (BoB) kernel PCA") | |
| plt.xlabel("First kernel principal component") | |
| plt.ylabel("Second kernel principal component") | |
| # Optionally add a colorbar | |
| cax, _ = matplotlib.colorbar.make_axes(ax) | |
| cbar = matplotlib.colorbar.ColorbarBase(cax, cmap=cmap, norm=normalize) | |
| cbar.set_label('Atomization energy [kcal/mol] ', rotation=270, labelpad=15) | |
| # cbar.set_label('# Number of atoms', rotation=270) | |
| # plt.text(3.5, 0.5, "# Number of atoms", ha="center", va="center", rotation=270) | |
| # plt.text(5.2, 0.5, "Atomization energy [kcal/mol]", ha="center", va="center", rotation=270) | |
| # cbar = plt.colorbar(ax=ax) | |
| # clb = plt.colorbar() | |
| # clb.set_label('label', labelpad=-40, y=1.05, rotation=0) | |
| # plt.tight_layout() | |
| plt.savefig("scatter_test.png", dpi=200) | |
| plt.savefig("scatter_test.pdf") | |
| plt.savefig("scatter_test.svg") | |
| print("end") | |
| if __name__ == "__main__": | |
| test_pca() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment