Skip to content

Instantly share code, notes, and snippets.

@andersx
Created October 10, 2018 09:40
Show Gist options
  • Select an option

  • Save andersx/5c4dc80c07001c468f057c8b6411b3e1 to your computer and use it in GitHub Desktop.

Select an option

Save andersx/5c4dc80c07001c468f057c8b6411b3e1 to your computer and use it in GitHub Desktop.
Kernel PCA
from __future__ import print_function
import os
import numpy as np
import scipy
from scipy.linalg import svd
from scipy.linalg import eigh
import qml
from qml.kernels import laplacian_kernel
from qml.kernels import gaussian_kernel
from qml.kernels import get_local_kernels_gaussian
from qml.math import cho_solve
import matplotlib
import matplotlib.pyplot as plt
params = {'mathtext.default': 'regular' }
plt.rcParams.update(params)
def stoch(atomtypes):
name = "$"
for atom in ["C", "N", "O", "H"]:
if atom in atomtypes:
n = atomtypes.count(atom)
if n > 1:
name += "%s_{%i}" % (atom, n)
else:
name += "%s" % (atom)
name += "$"
return name
def get_energies(filename):
""" Returns a dictionary with heats of formation for each xyz-file.
"""
f = open(filename, "r")
lines = f.readlines()
f.close()
energies = dict()
for line in lines:
tokens = line.split()
xyz_name = tokens[0]
hof = float(tokens[1])
energies[xyz_name] = hof
return energies
def test_pca():
test_dir = os.path.dirname(os.path.realpath(__file__))
# Parse file containing PBE0/def2-TZVP heats of formation and xyz filenames
data = get_energies(test_dir + "/data/hof_qm7.txt")
# Generate a list of qml.data.Compound() objects
mols = []
for xyz_file in sorted(data.keys())[:1000]:
# Initialize the qml.data.Compound() objects
mol = qml.data.Compound(xyz=test_dir + "/qm7/" + xyz_file)
# Associate a property (heat of formation) with the object
mol.properties = data[xyz_file]
# This is a Molecular Coulomb matrix sorted by row norm
mol.generate_bob()
# mol.generate_coulomb_matrix(size=23, sorting="row-norm")
# mol.generate_atomic_coulomb_matrix(size=23, sorting="row-norm")
# mol.generate_fchl_representation()
mols.append(mol)
# Shuffle molecules
np.random.seed(666)
np.random.shuffle(mols)
# Make training and test sets
n_test = 99
n_train = 1000
training = mols[:n_train]
test = mols[-n_test:]
# List of representations
# X = np.concatenate([mol.representation for mol in training])
X = np.array([mol.representation for mol in training])
print(X.shape)
Xs = np.array([mol.representation for mol in test])
# List of properties
Y = np.array([mol.properties for mol in training])
Ys = np.array([mol.properties for mol in test])
# Y = np.array([mol.properties/mol.natoms for mol in training])
# Ys = np.array([mol.properties/mol.natoms for mol in test])
N = np.array([mol.natoms for mol in training])
A = np.array([stoch(mol.atomtypes) for mol in training])
# Set hyper-parameters
sigma = 26214.40
llambda = 1e-10
# Set hyper-parameters
# sigma = 10**(4.2)
# llambda = 10**(-10.0)
# Generate training Kernel
K = laplacian_kernel(X, X, sigma)
# K = get_local_kernels_gaussian(X, X, N, N, [sigma])[0]
# K = qml.fchl.get_local_symmetric_kernels(X, alchemy="off")[0]
# K = qml.fchl.get_global_symmetric_kernels(X)[0]
# K = qml.fchl.get_local_symmetric_kernels(X)[0]
# np.save("K_fchl.npy", K)
# K = np.load("K_fchl.npy")
# Ks = qml.fchl.get_local_kernels(Xs, X)[0]
# K[np.diag_indices_from(K)] += 10e-4
(w, v) = scipy.linalg.eig(K)
print(v[:,:2])
print(w[:2])
X1 = np.array([np.dot(kervec, v[:,0]) for kervec in K])
X2 = np.array([np.dot(kervec, v[:,1]) for kervec in K])
X3 = np.array([np.dot(kervec, v[:,2]) for kervec in K])
# X1s = np.array([np.dot(kervec, v[:,0]) for kervec in Ks])
# X2s = np.array([np.dot(kervec, v[:,1]) for kervec in Ks])
X1 -= X1.mean()
X2 -= X2.mean()
X3 -= X3.mean()
X1 /= X1.std()
X2 /= X2.std()
X3 /= X3.std()
# X1s -= X1s.mean()
# X2s -= X2s.mean()
# X1s /= X1s.std()
# X2s /= X2s.std()
sort_args = np.argsort(X1)
for lol in zip(X1[sort_args], X2[sort_args], N[sort_args], A[sort_args]):
print(lol)
z = Y
# z = X3
cmap = matplotlib.cm.get_cmap('viridis')
normalize = matplotlib.colors.Normalize(vmin=min(z), vmax=max(z))
colors = [cmap(normalize(value)) for value in z]
fig, ax = plt.subplots(figsize=(8,4.5))
# plt.subplot(1,2,1)
labels = []
for i, txt in enumerate(A):
if txt not in labels:
ax.annotate(txt, (X1[i], X2[i]), fontsize=6)
labels.append(txt)
plt.scatter(X1, X2, color=colors)
# plt.subplot(1,2,2)
# plt.scatter(X1s, X2s, color=colors)
# plt.title("FCHL kernel PCA")
# plt.title("Local Coulomb matrix kernel PCA")
# plt.title("Coulomb matrix kernel PCA")
plt.title("Bag-of-Bonds (BoB) kernel PCA")
plt.xlabel("First kernel principal component")
plt.ylabel("Second kernel principal component")
# Optionally add a colorbar
cax, _ = matplotlib.colorbar.make_axes(ax)
cbar = matplotlib.colorbar.ColorbarBase(cax, cmap=cmap, norm=normalize)
cbar.set_label('Atomization energy [kcal/mol] ', rotation=270, labelpad=15)
# cbar.set_label('# Number of atoms', rotation=270)
# plt.text(3.5, 0.5, "# Number of atoms", ha="center", va="center", rotation=270)
# plt.text(5.2, 0.5, "Atomization energy [kcal/mol]", ha="center", va="center", rotation=270)
# cbar = plt.colorbar(ax=ax)
# clb = plt.colorbar()
# clb.set_label('label', labelpad=-40, y=1.05, rotation=0)
# plt.tight_layout()
plt.savefig("scatter_test.png", dpi=200)
plt.savefig("scatter_test.pdf")
plt.savefig("scatter_test.svg")
print("end")
if __name__ == "__main__":
test_pca()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment