Created
July 14, 2025 10:03
-
-
Save dehaenw/05f5a895756ddaa1c3f1e3d18bd26fa1 to your computer and use it in GitHub Desktop.
Comparing Morgan count fingerprints vs. Morgan simulated-count fingerprints in toy tasks
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compare three Morgan fingerprint encodings (binary, true counts, and
# count-simulated bits) as inputs to a Ridge regression that predicts
# molecular weight, under a random split and a property-sorted split.
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import Descriptors
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from matplotlib import pyplot as plt

# Turn off RDKit logging for all 'rdApp.*' channels.
RDLogger.DisableLog('rdApp.*')

# Two generators with identical radius/size; the second enables count simulation.
MFPGEN = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
MFPGEN_CS = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048, countSimulation=True)


def _fit_and_plot(ax, X_train, X_test, y_train, y_test):
    """Fit a default Ridge model and draw predicted-vs-actual scatter on ``ax``.

    Train and test points are plotted as two separate series, and the axes
    title reports the test-set R^2 rounded to three decimals.
    """
    rgr = Ridge()
    rgr.fit(X_train, y_train)
    y_pred = rgr.predict(X_test)
    # Marker size is tiny because of the number of points; the legend markers
    # are scaled back up via markerscale so they remain visible.
    ax.scatter(y_train, rgr.predict(X_train), s=0.01, label="Train")
    ax.scatter(y_test, y_pred, s=0.01, label="Test")
    ax.set_xlabel("MW (actual)")
    ax.set_ylabel("MW (predicted)")
    ax.set_title(f"Ridge regression model with test r2_score {round(r2_score(y_test,y_pred),3)}")
    ax.legend(markerscale=100)


#100K random chembl smiles. mirrored here https://www.dropbox.com/scl/fi/zvd0lnyhdd8vfvf392dpw/chembl100K.csv?rlkey=q3vneuzix79qnd8g4wemur3q7&st=plf81rfu&dl=0
# Entries that evaluate falsy (e.g. molecules that failed to parse) are dropped.
mols = [m for m in Chem.SmilesMolSupplier("chembl100K.csv") if m]

# One feature matrix per encoding, all built from the same molecules.
X_binary = [MFPGEN.GetFingerprintAsNumPy(m) for m in mols]
X_counts = [MFPGEN.GetCountFingerprintAsNumPy(m) for m in mols]
X_simcounts = [MFPGEN_CS.GetFingerprintAsNumPy(m) for m in mols]

# Regression target: molecular weight.
y = [Descriptors.MolWt(m) for m in mols]

feature_sets = [X_binary, X_counts, X_simcounts]
test_size = 0.1

# Row 0: random split, row 1: property split; one column per encoding.
fig, axs = plt.subplots(2, 3, figsize=(15, 8))

#Random split
for i, X in enumerate(feature_sets):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )
    _fit_and_plot(axs[0][i], X_train, X_test, y_train, y_test)

#Property split
# Train on the lightest molecules and test on the heaviest `test_size`
# fraction. The ordering depends only on y, so it is computed once; sorted()
# is stable, so ties keep their original relative order.
order = sorted(range(len(y)), key=y.__getitem__)
split_at = int((1 - test_size) * len(y))
train_idx, test_idx = order[:split_at], order[split_at:]
y_train = [y[j] for j in train_idx]
y_test = [y[j] for j in test_idx]
for i, X in enumerate(feature_sets):
    X_train = [X[j] for j in train_idx]
    X_test = [X[j] for j in test_idx]
    _fit_and_plot(axs[1][i], X_train, X_test, y_train, y_test)

plt.tight_layout()
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment