Skip to content

Instantly share code, notes, and snippets.

@ljmartin
Created November 26, 2023 04:17
Show Gist options
  • Save ljmartin/600b6c74e77717ea71160580b99503d3 to your computer and use it in GitHub Desktop.
Save ljmartin/600b6c74e77717ea71160580b99503d3 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d25c26dc-715e-4083-ab7b-0af4b356ae37",
"metadata": {},
"outputs": [],
"source": [
"from rdkit import Chem\n",
"from rdkit.Chem import AllChem\n",
"from rdkit.Geometry import rdGeometry\n",
"\n",
"import itertools\n",
"import scipy "
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4b22d53c-80e1-4af9-bb08-b08c47b72af8",
"metadata": {},
"outputs": [],
"source": [
"def addNTerm(mol, residue, nterm_atom):\n",
" \"\"\"\n",
" Hardcoded addition of n-terminal ACE residue. \n",
" Performs no error checking at all. \n",
" Expects that the existing residue conforms to PDB spec\n",
" for amino acids (in particular, the backbone must be N-C-CA!)\n",
" \n",
" \"\"\"\n",
" \n",
" ## start by getting an actual ACE molecule - \n",
" ## with a two-atom overlap to it's parent.\n",
" namesdict = dict(zip([fetchAtName(at) for at in residue], range(len(residue))))\n",
" #becuase it's N-term, we know this must be an ACE.\n",
" ace = Chem.MolFromSmiles('O=C(C)[N@@H]C')\n",
" ace = Chem.AddHs(ace)\n",
"\n",
" #order is N-C. The N comes first. \n",
" matching_atoms = Chem.MolFromSmarts('[#7D3]-[#6D4]')\n",
" ace.GetSubstructMatch(matching_atoms)\n",
" \n",
" ## embed the molecule in 3D, using the two matching\n",
" ## atoms as constraints.\n",
" #first, generate the constraint dict. \n",
" cmap = {}\n",
" ace_match_atoms = ace.GetSubstructMatch(matching_atoms)\n",
" for idx, name in zip(ace_match_atoms, ['N', 'CA']):\n",
" #cmap[idx] = mol.GetConformer().GetAtomPosition(idx)\n",
" parent_idx = namesdict[name]\n",
" cmap[idx] = mol.GetConformer().GetAtomPosition(parent_idx)\n",
" \n",
" #embed 3d distances to match parent:\n",
" AllChem.EmbedMolecule(ace, coordMap=cmap, enforceChirality=True)\n",
"\n",
" #align (transform and rotate) to the parent residue:\n",
" AllChem.AlignMol(\n",
" ace,\n",
" mol,\n",
" atomMap = list(zip(\n",
" ace_match_atoms, \n",
" [residue[namesdict['N']].GetIdx(), residue[namesdict['CA']].GetIdx()]\n",
" )))\n",
" \n",
" \n",
" ## now comes the joing process. We can remove some superfluos atoms\n",
" ## and the remaining atoms must be added. \n",
" ## note this is all hardcoded - the exact same atoms get added in every ACE,\n",
" ## because we only aligned to the backbone. \n",
" \n",
" ace = Chem.RemoveHs(ace)\n",
" \n",
" #remove the superfluous atoms (i.e. duplicate H, C, N)\n",
" indices = set([i for i in range(ace.GetNumAtoms())])\n",
" indices_to_remove = set(ace_match_atoms)\n",
" indices = list(indices.difference(indices_to_remove))\n",
" \n",
" #find the bond indices so we can do a chem.pathtosubmol\n",
" bonds = []\n",
" for pair in itertools.combinations(indices, 2):\n",
" bond = ace.GetBondBetweenAtoms(pair[0], pair[1])\n",
" if bond is not None:\n",
" #atoms are bonded! keep the\n",
" bonds.append(bond.GetIdx())\n",
" \n",
" #now... we have the ACE atoms that we need to add!\n",
" #turn those into a separate molecule:\n",
" ats_to_add = Chem.PathToSubmol(ace, bonds)\n",
" add_conf = ats_to_add.GetConformer(0)\n",
" \n",
" ## the last step is cumbersome. We individually add each atom \n",
" ## in the ACE residue - alpha carbon, carbonyl O, and terminal methyl\n",
" ## we start by generating a read-write mol, and must set\n",
" ## the PDB info for each added atom, then add a bond, and set the \n",
" ## conformer positions. \n",
" \n",
" rwmol = Chem.RWMol(mol)\n",
" \n",
" ##build out the ACE, bond by bond. \n",
"\n",
" #first the ACE alpha carbon. \n",
" ac = Chem.MolFromSmarts('[$([#6]=[#8])]')\n",
" idx = ats_to_add.GetSubstructMatch(ac)[0]\n",
" at = ats_to_add.GetAtomWithIdx(idx)\n",
"\n",
" newinf = Chem.rdchem.AtomPDBResidueInfo()\n",
" newinf.SetResidueName(\"ACE\")\n",
" newinf.SetName(\" C \")\n",
" newinf.SetResidueNumber(int(fetchResNum(nterm_atom))-1)\n",
" newinf.SetChainId('A')\n",
" at.SetPDBResidueInfo(newinf)\n",
"\n",
" alpha_carbon_idx = rwmol.AddAtom(at)\n",
" rwmol.AddBond(alpha_carbon_idx, nterm_atom.GetIdx())\n",
" conformer = rwmol.GetConformer(0)\n",
" conformer.SetAtomPosition(alpha_carbon_idx, add_conf.GetAtomPosition(idx))\n",
"\n",
"\n",
" #fix the carbonyl next. \n",
" carbonyl = Chem.MolFromSmarts(\"[#8D1]\")\n",
" idx = ats_to_add.GetSubstructMatch(carbonyl)[0]\n",
" at = ats_to_add.GetAtomWithIdx(idx)\n",
"\n",
" newinf = Chem.rdchem.AtomPDBResidueInfo()\n",
" newinf.SetResidueName(\"ACE\")\n",
" newinf.SetName(\" O \")\n",
" newinf.SetResidueNumber(int(fetchResNum(nterm_atom))-1)\n",
" newinf.SetChainId('A')\n",
" at.SetPDBResidueInfo(newinf)\n",
"\n",
" aidx = rwmol.AddAtom(at)\n",
" rwmol.AddBond(aidx, alpha_carbon_idx)\n",
" conformer = rwmol.GetConformer(0)\n",
" conformer.SetAtomPosition(aidx, add_conf.GetAtomPosition(idx))\n",
"\n",
"\n",
" #next the ACE beta carbon. \n",
" bc = Chem.MolFromSmarts('[$([#6D1]-[#6D2])]')\n",
" idx = ats_to_add.GetSubstructMatch(bc)[0]\n",
" at = ats_to_add.GetAtomWithIdx(idx)\n",
"\n",
" newinf = Chem.rdchem.AtomPDBResidueInfo()\n",
" newinf.SetResidueName(\"ACE\")\n",
" newinf.SetName(\" CH3\")\n",
" newinf.SetResidueNumber(int(fetchResNum(nterm_atom))-1)\n",
" newinf.SetChainId('A')\n",
" at.SetPDBResidueInfo(newinf)\n",
"\n",
" aidx = rwmol.AddAtom(at)\n",
" rwmol.AddBond(aidx, alpha_carbon_idx)\n",
" conformer = rwmol.GetConformer(0)\n",
" conformer.SetAtomPosition(aidx, add_conf.GetAtomPosition(idx))\n",
"\n",
" return Chem.Mol(rwmol)\n",
"\n",
"def addCTerm(mol, residue, cterm_atom):\n",
" ## start by getting an actual ACE molecule - \n",
" ## with a two-atom overlap to it's parent.\n",
" namesdict = dict(zip([fetchAtName(at) for at in residue], range(len(residue))))\n",
" \n",
" #becuase it's C-term, we know this must be an NME.\n",
" nme = Chem.MolFromSmiles('N(C(C(N))=O)C')\n",
" nme = Chem.AddHs(nme)\n",
" \n",
" matching_atoms = Chem.MolFromSmarts('[#8D1]=[#6D3]-[#6]-[#7]')\n",
" \n",
" \n",
" ## embed the molecule in 3D, using the THREE matching\n",
" ## atoms as constraints.\n",
" #first, generate the constraint dict. \n",
" cmap = {}\n",
" nme_match_atoms = nme.GetSubstructMatch(matching_atoms)\n",
" for idx, name in zip(nme_match_atoms, ['O', 'C', 'CA', 'N']):\n",
" parent_idx = namesdict[name]\n",
" cmap[idx] = mol.GetConformer().GetAtomPosition(parent_idx)\n",
" \n",
" #embed 3d distances to match parent:\n",
" AllChem.EmbedMolecule(nme, coordMap=cmap, )\n",
"\n",
" #align (transform and rotate) to the parent residue:\n",
" AllChem.AlignMol(\n",
" nme,\n",
" mol,\n",
" atomMap = list(zip(\n",
" nme_match_atoms, \n",
" [residue[namesdict[i]].GetIdx() for i in ['O', 'C', 'CA', 'N']]\n",
"\n",
" )))\n",
" \n",
" ## now comes the joing process. We can remove some superfluos atoms\n",
" ## and the remaining atoms must be added. \n",
" ## note this is all hardcoded - the exact same atoms get added in every ACE,\n",
" ## because we only aligned to the backbone. \n",
"\n",
" nme = Chem.RemoveHs(nme)\n",
" \n",
" #remove the superfluous atoms (i.e. duplicate H, C, N)\n",
" indices = set([i for i in range(nme.GetNumAtoms())])\n",
" indices_to_remove = set(nme_match_atoms)\n",
" indices = list(indices.difference(indices_to_remove))\n",
"\n",
" #find the bond indices so we can do a chem.pathtosubmol\n",
" bonds = []\n",
" for pair in itertools.combinations(indices, 2):\n",
" bond = nme.GetBondBetweenAtoms(pair[0], pair[1])\n",
" if bond is not None:\n",
" #atoms are bonded! keep the\n",
" bonds.append(bond.GetIdx())\n",
" \n",
" #now... we have the NME atoms that we need to add!\n",
" #turn those into a separate molecule:\n",
" ats_to_add = Chem.PathToSubmol(nme, bonds)\n",
" add_conf = ats_to_add.GetConformer(0)\n",
" \n",
" ## the last step is cumbersome. We individually add each atom \n",
" ## in the ACE residue - alpha carbon, carbonyl O, and terminal methyl\n",
" ## we start by generating a read-write mol, and must set\n",
" ## the PDB info for each added atom, then add a bond, and set the \n",
" ## conformer positions. \n",
"\n",
" rwmol = Chem.RWMol(mol)\n",
"\n",
" ##build out the NME, bond by bond. \n",
" \n",
" #first, select to-be-added nitrogen.\n",
" nm = Chem.MolFromSmarts('[$([#7]-[#6])]')\n",
" idx = ats_to_add.GetSubstructMatch(nm)[0]\n",
" at = ats_to_add.GetAtomWithIdx(idx)\n",
"\n",
" newinf = Chem.rdchem.AtomPDBResidueInfo()\n",
" newinf.SetResidueName(\"NME\")\n",
" newinf.SetName(\" N \")\n",
" newinf.SetResidueNumber(int(fetchResNum(cterm_atom))+1)\n",
" newinf.SetChainId('A')\n",
" at.SetPDBResidueInfo(newinf)\n",
" \n",
" nmethyl_idx = rwmol.AddAtom(at)\n",
" rwmol.AddBond(nmethyl_idx, cterm_atom.GetIdx())\n",
" conformer = rwmol.GetConformer(0)\n",
" conformer.SetAtomPosition(nmethyl_idx, add_conf.GetAtomPosition(idx))\n",
"\n",
"\n",
" #fix the terminal methyl next. \n",
" methyl = Chem.MolFromSmarts(\"[#6]\")\n",
" idx = ats_to_add.GetSubstructMatch(methyl)[0]\n",
" at = ats_to_add.GetAtomWithIdx(idx)\n",
"\n",
" newinf = Chem.rdchem.AtomPDBResidueInfo()\n",
" newinf.SetResidueName(\"NME\")\n",
" newinf.SetName(\" C \")\n",
" newinf.SetResidueNumber(int(fetchResNum(cterm_atom))+1)\n",
" newinf.SetChainId('A')\n",
" at.SetPDBResidueInfo(newinf)\n",
"\n",
" aidx = rwmol.AddAtom(at)\n",
" rwmol.AddBond(aidx, nmethyl_idx)\n",
" conformer = rwmol.GetConformer(0)\n",
" conformer.SetAtomPosition(aidx, add_conf.GetAtomPosition(idx))\n",
"\n",
" return Chem.Mol(rwmol)\n",
"\n",
"def terminalCapsTest(residues):\n",
" for residue in residues:\n",
" resdict = dict(zip([fetchAtName(at) for at in residue] , residue))\n",
"\n",
" nterm_atom = resdict['N']\n",
" cterm_atom = resdict['C']\n",
"\n",
" #check nterminus:\n",
" nterm_neighbours = nterm_atom.GetNeighbors()\n",
" if 'C' not in [fetchAtName(at) for at in nterm_neighbours]:\n",
" print(fetchResName(nterm_atom), fetchResNum(nterm_atom), 'N-term is missing a neighbouring residue!')\n",
" return 'N', residue, nterm_atom\n",
"\n",
" #check cterminus:\n",
" cterm_neighbours = cterm_atom.GetNeighbors()\n",
" if 'N' not in [fetchAtName(at) for at in cterm_neighbours]:\n",
" print(fetchResName(cterm_atom), fetchResNum(cterm_atom), 'C-term is missing a neighbouring residue!')\n",
" return 'C', residue, cterm_atom\n",
" return None\n",
"\n",
"def getResidues(mol):\n",
" residues = []\n",
" curr_resnum = -1000\n",
" residue = []\n",
"\n",
" for atom in mol.GetAtoms():\n",
" pdb_info = atom.GetPDBResidueInfo()\n",
" resnum = pdb_info.GetResidueNumber()\n",
" resname = pdb_info.GetResidueName()\n",
" if resname in ['ACE', 'NME']:\n",
" continue\n",
"\n",
" if resnum!=curr_resnum:\n",
" if len(residue)>0:\n",
" residues.append(residue)\n",
" residue = []\n",
" curr_resnum = resnum\n",
" else:\n",
" pass\n",
"\n",
" residue.append(atom)\n",
" if len(residue)>0:\n",
" residues.append(residue)\n",
" return residues\n",
"\n",
"def fetchAtName(atom):\n",
" return atom.GetPDBResidueInfo().GetName().strip()\n",
"def fetchResName(atom):\n",
" return atom.GetPDBResidueInfo().GetResidueName().strip()\n",
"def fetchResNum(atom):\n",
" return atom.GetPDBResidueInfo().GetResidueNumber()\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5db363ff-c103-48d5-ae85-a6ce08d73e26",
"metadata": {},
"outputs": [],
"source": [
"mol = Chem.MolFromPDBFile('../pocket.pdb', \n",
" #removeHs=False\n",
" )\n",
"\n",
"residues = getResidues(mol)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a0f07131-66da-447e-acd8-e2fc82913f96",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TYR 514 N-term is missing a neighbouring residue!\n",
"110\n",
"TYR 514 C-term is missing a neighbouring residue!\n",
"LEU 625 N-term is missing a neighbouring residue!\n",
"115\n",
"LEU 625 C-term is missing a neighbouring residue!\n",
"ASP 664 N-term is missing a neighbouring residue!\n",
"120\n",
"LEU 665 C-term is missing a neighbouring residue!\n",
"SER 667 N-term is missing a neighbouring residue!\n",
"125\n",
"VAL 668 C-term is missing a neighbouring residue!\n",
"ILE 682 N-term is missing a neighbouring residue!\n",
"130\n",
"TYR 683 C-term is missing a neighbouring residue!\n",
"PHE 686 N-term is missing a neighbouring residue!\n",
"135\n",
"PHE 686 C-term is missing a neighbouring residue!\n",
"MET 703 N-term is missing a neighbouring residue!\n",
"140\n",
"MET 703 C-term is missing a neighbouring residue!\n",
"GLY 715 N-term is missing a neighbouring residue!\n",
"145\n",
"GLN 716 C-term is missing a neighbouring residue!\n",
"PHE 719 N-term is missing a neighbouring residue!\n",
"150\n",
"PHE 719 C-term is missing a neighbouring residue!\n"
]
}
],
"source": [
"while (test := terminalCapsTest(residues)) is not None:\n",
" end, residue, atom = test\n",
" resdict = dict(zip([fetchAtName(at) for at in residue] , residue))\n",
" \n",
" if end=='N':\n",
" nterm_atom = atom\n",
" mol = addNTerm(mol, residue, nterm_atom)\n",
" print(mol.GetNumBonds())\n",
" if end=='C':\n",
" cterm_atom = atom\n",
" mol = addCTerm(mol, residue, cterm_atom)\n",
" residues = getResidues(mol)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "007581b0-d731-4167-bd09-03d626b1bd78",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A A A A A A A A A A A A B B B B B B B B C C C C C C C C C C C C C C C C D D D D D D D D D D D D D E E E E E E E E E E E E E E E E E E E E F F F F F F F F F F F G G G G G G G G H H H H H H H H H H H H H I I I I I I I I I I I A A A A A B B B B B C C C C C D D D D D E E E E E F F F F F G G G G G H H H H H I I I I I "
]
}
],
"source": [
"# reset the residue information\n",
"# and make each bonded component a separate chain\n",
"adj = Chem.GetAdjacencyMatrix(mol)\n",
"\n",
"nchains, chain_indices = scipy.sparse.csgraph.connected_components(adj)\n",
"\n",
"chains = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'\n",
"\n",
"for atom, chain_id in zip(mol.GetAtoms(), chain_indices):\n",
" pdb_info = atom.GetPDBResidueInfo()\n",
" chain = chains[chain_id]\n",
" \n",
" pdb_info.SetChainId(chain)\n",
" print(pdb_info.GetChainId(), end=' ')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "96ce35ae-85f1-45d8-8f09-f15b6069950f",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"new_order = pd.DataFrame({'resid':[fetchResNum(at) for at in mol.GetAtoms()],\n",
" 'atnum':range(mol.GetNumAtoms())}).sort_values(['resid', 'atnum'])['atnum']\n",
"\n",
"reordered = Chem.RenumberAtoms(mol, [int(i) for i in new_order])\n",
"Chem.MolToPDBFile(reordered,'pdb3.pdb', )"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "fdbb0f0a-d709-4d98-87dd-64d7f055ce1e",
"metadata": {},
"outputs": [],
"source": [
"from openmm import app\n",
"from openmm import unit\n",
"import openmm\n",
"import pdbfixer\n",
"fixer = pdbfixer.PDBFixer('./pdb3.pdb')\n",
"#fixer.findMissingResidues()\n",
"#fixer.findMissingAtoms()\n",
"#fixer.addMissingAtoms()\n",
"fixer.addMissingHydrogens(7.0)\n",
"app.PDBFile.writeFile(fixer.topology, fixer.positions, open('output.pdb', 'w'))\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "23bf57ee-a372-48ff-b4a4-0296a2b4b539",
"metadata": {},
"outputs": [],
"source": [
"forcefield = app.ForceField('amber14-all.xml', 'amber14/tip3pfb.xml')\n",
"system = forcefield.createSystem(\n",
" fixer.topology,\n",
" nonbondedMethod=app.CutoffNonPeriodic,\n",
" nonbondedCutoff=0.9*unit.nanometer\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "a8c79fa3-c7bb-4e64-b351-43c755abd845",
"metadata": {},
"outputs": [],
"source": [
"integrator = openmm.LangevinIntegrator(\n",
" 298*unit.kelvin,\n",
" 1/unit.picosecond,\n",
" 1*unit.femtosecond\n",
" )\n",
"platform = openmm.Platform.getPlatformByName('CPU')\n",
"simulation = app.Simulation(fixer.topology, system, integrator, platform)\n",
"simulation.context.setPositions(fixer.positions)\n",
"simulation.minimizeEnergy()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "1768799b-7f60-4ced-b196-240e8b100913",
"metadata": {},
"outputs": [],
"source": [
"simulation.step(1000)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "3190e0be-b661-4be5-93d2-d0874a7a990c",
"metadata": {},
"outputs": [],
"source": [
"app.PDBFile.writeFile(fixer.topology,\n",
" simulation.context.getState(getPositions=True).getPositions(),\n",
" open('./minimized.pdb', 'w'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "64a41e3d-cd0c-4028-998b-98f94cd46658",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment