Skip to content

Instantly share code, notes, and snippets.

@PatWalters
Last active February 20, 2019 01:05
Show Gist options
  • Save PatWalters/432c369ff33767b800faa99efcbe2fef to your computer and use it in GitHub Desktop.
Save PatWalters/432c369ff33767b800faa99efcbe2fef to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import sys
from rdkit import Chem
from tqdm import tqdm
import pandas as pd
import math
def in_place_rmsd(mol_1, mol_2):
tmp_mol = Chem.RemoveHs(mol_2)
conf_1 = mol_1.GetConformer()
conf_2 = mol_2.GetConformer()
heavy_atom_idx = [x.GetIdx() for x in mol_1.GetAtoms() if x.GetAtomicNum() > 1]
ref_coords = [conf_1.GetAtomPosition(x) for x in heavy_atom_idx]
matches = mol_1.GetSubstructMatches(tmp_mol, uniquify=False)
rms_list = []
for m in matches:
fit_coords = [conf_2.GetAtomPosition(x) for x in m]
delta_list = [(x - y).LengthSq() for x, y in zip(ref_coords, fit_coords)]
max_dev = math.sqrt(max(delta_list))
rms_list.append([math.sqrt(sum(delta_list) / len(delta_list)),max_dev])
rms_list.sort()
return rms_list[0]
def pairwise_rmsd(mol_list, rms_cutoff, dev_cutoff, in_place):
is_dupe = [False] * len(mol_list)
for i in range(0, len(mol_list)):
for j in range(0, i):
in_place_rmsd(mol_list[i], mol_list[j])
rms, max_dev = in_place_rmsd(mol_list[i], mol_list[j])
if rms < rms_cutoff and max_dev < dev_cutoff:
is_dupe[i] = True
break
return is_dupe
def remove_dupe_confs(infile, outfile, rms_cutoff, dev_cutoff, in_place):
suppl = Chem.SDMolSupplier(infile, removeHs=False)
writer = Chem.SDWriter(outfile)
data = []
for mol in tqdm(suppl, desc="Reading"):
name = mol.GetProp("_Name")
data.append([name, mol])
df = pd.DataFrame(data, columns=["Name", "Mol"])
initial_rows, _ = df.shape
dupe_list = []
for k, v in tqdm(df.groupby("Name"), desc="Removing Dupes"):
dupe_list += pairwise_rmsd(v["Mol"].values, rms_cutoff, dev_cutoff, in_place)
df["dupe"] = dupe_list
df = df.query("dupe == False")
final_rows, _ = df.shape
for mol in df.Mol.values:
writer.write(mol)
print("read %d conformers, wrote %d conformers" % (initial_rows, final_rows))
if __name__ == "__main__":
if len(sys.argv) != 5:
print("usage: %s infile.sdf outfile.sdf rms_cutoff max_deviation" % (sys.argv[0]))
else:
remove_dupe_confs(sys.argv[1], sys.argv[2], float(sys.argv[3]), float(sys.argv[4]), in_place=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment