ljmartin · May 2, 2025 04:13
diff --git a/expect_max_guess.py b/expect_max_guess.py
 import pandas as pd
 from rdkit import Chem

 from rdkit.Chem import AllChem
 from rdkit.Chem import rdMolDescriptors
 from rdkit.Chem import AllChem, rdMolDescriptors, rdMolAlign, rdMolTransforms

 import numpy as np
 from tqdm.notebook import tqdm

 df = pd.read_csv('./ligs.smi', sep='\t')
 mols = []
 for smi in df['smiles']:
    mols.append(Chem.AddHs(Chem.MolFromSmiles(smi)))
    
    
 nconfs = 50
 for m in mols:
    AllChem.EmbedMultipleConfs(m, nconfs)
    
 #assume the zeroth conformer is the best one to initialize.  
 cids = [0]*len(mols)


 nits = 100 # iterations
 nmols = len(mols)


 # for logging purposes, calculate the total pairwise alignment score:
 score = 0
 for i in range(nmols):
    for j in range(i+1, nmols):
        l = rdMolAlign.GetO3A(mols[i], mols[j], prbCid=cids[i], refCid=cids[j])
        score += l.Score()
 print(score)

 # for each iteration we'll pick a random molecule, and update it's conformer Id
 # to the best alignment over all other 'best' compounds. 
 # the 'best' conformer ID is initialized as zero, but after each iteration
 # we expect the conformers to converge towards the single best alignment. 
 for i in tqdm(range(nits)):
    #ref molindex:
    rmid = np.random.choice(range(nmols))
    
    # create a matrix of shape nmols,nconfs. this will store scores.
    # once it's full of alignment scores for all conformers of 
    # this molecule vs all the other 'best' conformers, we'll update this molecule's best conformer ID
    # to give the maximum sum of alignment scores. 
    smat = np.zeros([nmols, nconfs])
    
    #test moleculec index
    for tmid in range(nmols):
        if tmid==rmid:
            continue
            
        #for each conformer of the 'ref' molindex.
        for cid in range(nconfs):
            l = rdMolAlign.GetO3A(mols[tmid], mols[rmid], prbCid=cids[tmid], refCid=cid)
            score = l.Score()
            smat[tmid, cid] = score
    
    # now update rmid's conformer to it's best alignment with all others. 
    scores = smat.sum(0)
    rmax = scores.argmax()
    cids[rmid] = int(rmax)
    
    # for logging purposes, calculate the total alignment score now:
    score = 0
    for i in range(nmols):
        for j in range(i+1, nmols):
            l = rdMolAlign.GetO3A(mols[i], mols[j], prbCid=cids[i], refCid=cids[j])
            score += l.Score()
    print(score)
    
 # use the first molecule as the original 'reference' coordinate set. 
 refmol = Chem.Mol(mols[0])
 refcid = cids[0]

 with Chem.SDWriter('./ligs.sdf') as writer:
    for m, cid in zip(mols, cids):
        
        l = rdMolAlign.GetO3A(m, refmol, prbCid=cid, refCid=refcid)
        l.Align() #update coordinates to alignment. 
        writer.write(m, confId=cid)
	import pandas as pd
	from rdkit import Chem

	from rdkit.Chem import AllChem
	from rdkit.Chem import rdMolDescriptors
	from rdkit.Chem import AllChem, rdMolDescriptors, rdMolAlign, rdMolTransforms

	import numpy as np
	from tqdm.notebook import tqdm

	df = pd.read_csv('./ligs.smi', sep='\t')
	mols = []
	for smi in df['smiles']:
	mols.append(Chem.AddHs(Chem.MolFromSmiles(smi)))


	nconfs = 50
	for m in mols:
	AllChem.EmbedMultipleConfs(m, nconfs)

	#assume the zeroth conformer is the best one to initialize.
	cids = [0]*len(mols)


	nits = 100 # iterations
	nmols = len(mols)


	# for logging purposes, calculate the total pairwise alignment score:
	score = 0
	for i in range(nmols):
	for j in range(i+1, nmols):
	l = rdMolAlign.GetO3A(mols[i], mols[j], prbCid=cids[i], refCid=cids[j])
	score += l.Score()
	print(score)

	# for each iteration we'll pick a random molecule, and update it's conformer Id
	# to the best alignment over all other 'best' compounds.
	# the 'best' conformer ID is initialized as zero, but after each iteration
	# we expect the conformers to converge towards the single best alignment.
	for i in tqdm(range(nits)):
	#ref molindex:
	rmid = np.random.choice(range(nmols))

	# create a matrix of shape nmols,nconfs. this will store scores.
	# once it's full of alignment scores for all conformers of
	# this molecule vs all the other 'best' conformers, we'll update this molecule's best conformer ID
	# to give the maximum sum of alignment scores.
	smat = np.zeros([nmols, nconfs])

	#test moleculec index
	for tmid in range(nmols):
	if tmid==rmid:
	continue

	#for each conformer of the 'ref' molindex.
	for cid in range(nconfs):
	l = rdMolAlign.GetO3A(mols[tmid], mols[rmid], prbCid=cids[tmid], refCid=cid)
	score = l.Score()
	smat[tmid, cid] = score

	# now update rmid's conformer to it's best alignment with all others.
	scores = smat.sum(0)
	rmax = scores.argmax()
	cids[rmid] = int(rmax)

	# for logging purposes, calculate the total alignment score now:
	score = 0
	for i in range(nmols):
	for j in range(i+1, nmols):
	l = rdMolAlign.GetO3A(mols[i], mols[j], prbCid=cids[i], refCid=cids[j])
	score += l.Score()
	print(score)

	# use the first molecule as the original 'reference' coordinate set.
	refmol = Chem.Mol(mols[0])
	refcid = cids[0]

	with Chem.SDWriter('./ligs.sdf') as writer:
	for m, cid in zip(mols, cids):

	l = rdMolAlign.GetO3A(m, refmol, prbCid=cid, refCid=refcid)
	l.Align() #update coordinates to alignment.
	writer.write(m, confId=cid)