Skip to content

Instantly share code, notes, and snippets.

@dehaenw
Created July 14, 2025 10:03
Show Gist options
  • Save dehaenw/05f5a895756ddaa1c3f1e3d18bd26fa1 to your computer and use it in GitHub Desktop.
Save dehaenw/05f5a895756ddaa1c3f1e3d18bd26fa1 to your computer and use it in GitHub Desktop.
Comparing Morgan count fingerprints vs. Morgan simulated-count fingerprints in toy tasks
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import Descriptors
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from matplotlib import pyplot as plt
# Turn off every RDKit log channel matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

# Two Morgan generators with identical radius and size; the second one also
# enables count simulation.
MFPGEN = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
MFPGEN_CS = rdFingerprintGenerator.GetMorganGenerator(
    radius=2, fpSize=2048, countSimulation=True
)

#100K random chembl smiles. mirrored here https://www.dropbox.com/scl/fi/zvd0lnyhdd8vfvf392dpw/chembl100K.csv?rlkey=q3vneuzix79qnd8g4wemur3q7&st=plf81rfu&dl=0
# filter(None, ...) drops falsy entries yielded by the supplier.
mols = list(filter(None, Chem.SmilesMolSupplier("chembl100K.csv")))

# One feature list per fingerprint flavour, all aligned with `mols`.
X_binary = list(map(MFPGEN.GetFingerprintAsNumPy, mols))
X_counts = list(map(MFPGEN.GetCountFingerprintAsNumPy, mols))
X_simcounts = list(map(MFPGEN_CS.GetFingerprintAsNumPy, mols))

# Regression target: the molecular weight of each molecule.
y = list(map(Descriptors.MolWt, mols))

# 2 x 3 grid of axes: one row per split strategy, one column per fingerprint flavour.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
# Random split: for each fingerprint flavour (binary, counts, simulated counts)
# hold out a random 10% of the molecules, fit a Ridge regressor on the rest and
# plot predicted vs. actual molecular weight in the first row of the figure.
for i, X in enumerate([X_binary, X_counts, X_simcounts]):
    # Fixed random_state so every fingerprint flavour sees the same partition.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=42
    )

    rgr = Ridge()
    rgr.fit(X_train, y_train)
    y_pred = rgr.predict(X_test)

    ax = axs[0][i]
    # Tiny markers keep the dense scatter readable; the legend scales them back up.
    ax.scatter(y_train, rgr.predict(X_train), s=0.01, label="Train")
    ax.scatter(y_test, y_pred, s=0.01, label="Test")
    ax.set_xlabel("MW (actual)")
    ax.set_ylabel("MW (predicted)")
    ax.set_title(f"Ridge regression model with test r2_score {round(r2_score(y_test,y_pred),3)}")
    ax.legend(markerscale=100)
# Property split: order the molecules by the target (molecular weight), train
# on the lowest 90% and test on the highest 10%, so the test set lies outside
# the range of target values seen during training. Results go in the second
# row of the figure.
test_size = 0.1

# The ordering and the cut point depend only on `y`, so compute them once
# instead of re-sorting (y, X) pairs for every fingerprint flavour. `sorted`
# is stable, so ties keep their original relative order.
order = sorted(range(len(y)), key=lambda j: y[j])
split_at = int((1 - test_size) * len(y))
y_sorted = [y[j] for j in order]
y_train, y_test = y_sorted[:split_at], y_sorted[split_at:]

for i, X in enumerate([X_binary, X_counts, X_simcounts]):
    # Reorder the features with the same permutation used for the targets.
    X_sorted = [X[j] for j in order]
    X_train, X_test = X_sorted[:split_at], X_sorted[split_at:]

    rgr = Ridge()
    rgr.fit(X_train, y_train)
    y_pred = rgr.predict(X_test)

    ax = axs[1][i]
    # Tiny markers keep the dense scatter readable; the legend scales them back up.
    ax.scatter(y_train, rgr.predict(X_train), s=0.01, label="Train")
    ax.scatter(y_test, y_pred, s=0.01, label="Test")
    ax.set_xlabel("MW (actual)")
    ax.set_ylabel("MW (predicted)")
    ax.set_title(f"Ridge regression model with test r2_score {round(r2_score(y_test,y_pred),3)}")
    ax.legend(markerscale=100)
# Adjust the subplot padding on the figure created above, then display it.
fig.tight_layout()
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment