ljmartin · November 26, 2023 04:17
diff --git a/addCaps-start-to-finish.ipynb b/addCaps-start-to-finish.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d25c26dc-715e-4083-ab7b-0af4b356ae37",
   "metadata": {},
   "outputs": [],
   "source": [
    "from rdkit import Chem\n",
    "from rdkit.Chem import AllChem\n",
    "from rdkit.Geometry import rdGeometry\n",
    "\n",
    "import itertools\n",
    "import scipy "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4b22d53c-80e1-4af9-bb08-b08c47b72af8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def addNTerm(mol, residue, nterm_atom):\n",
    "    \"\"\"\n",
    "    Hardcoded addition of n-terminal ACE residue. \n",
    "    Performs no error checking at all. \n",
    "    Expects that the existing residue conforms to PDB spec\n",
    "    for amino acids (in particular, the backbone must be N-C-CA!)\n",
    "    \n",
    "    \"\"\"\n",
    "    \n",
    "    ## start by getting an actual ACE molecule - \n",
    "    ## with a two-atom overlap to it's parent.\n",
    "    namesdict = dict(zip([fetchAtName(at) for at in residue], range(len(residue))))\n",
    "    #becuase it's N-term, we know this must be an ACE.\n",
    "    ace = Chem.MolFromSmiles('O=C(C)[N@@H]C')\n",
    "    ace = Chem.AddHs(ace)\n",
    "\n",
    "    #order is N-C. The N comes first. \n",
    "    matching_atoms = Chem.MolFromSmarts('[#7D3]-[#6D4]')\n",
    "    ace.GetSubstructMatch(matching_atoms)\n",
    "    \n",
    "    ## embed the molecule in 3D, using the two matching\n",
    "    ## atoms as constraints.\n",
    "    #first, generate the constraint dict. \n",
    "    cmap = {}\n",
    "    ace_match_atoms = ace.GetSubstructMatch(matching_atoms)\n",
    "    for idx, name in zip(ace_match_atoms, ['N', 'CA']):\n",
    "        #cmap[idx] = mol.GetConformer().GetAtomPosition(idx)\n",
    "        parent_idx = namesdict[name]\n",
    "        cmap[idx] = mol.GetConformer().GetAtomPosition(parent_idx)\n",
    "        \n",
    "    #embed 3d distances to match parent:\n",
    "    AllChem.EmbedMolecule(ace, coordMap=cmap, enforceChirality=True)\n",
    "\n",
    "    #align (transform and rotate) to the parent residue:\n",
    "    AllChem.AlignMol(\n",
    "        ace,\n",
    "        mol,\n",
    "        atomMap = list(zip(\n",
    "            ace_match_atoms, \n",
    "            [residue[namesdict['N']].GetIdx(), residue[namesdict['CA']].GetIdx()]\n",
    "        )))\n",
    "    \n",
    "    \n",
    "    ## now comes the joing process. We can remove some superfluos atoms\n",
    "    ## and the remaining atoms must be added. \n",
    "    ## note this is all hardcoded - the exact same atoms get added in every ACE,\n",
    "    ## because we only aligned to the backbone. \n",
    "    \n",
    "    ace = Chem.RemoveHs(ace)\n",
    "    \n",
    "    #remove the superfluous atoms (i.e. duplicate H, C, N)\n",
    "    indices = set([i for i in range(ace.GetNumAtoms())])\n",
    "    indices_to_remove = set(ace_match_atoms)\n",
    "    indices = list(indices.difference(indices_to_remove))\n",
    "    \n",
    "    #find the bond indices so we can do a chem.pathtosubmol\n",
    "    bonds = []\n",
    "    for pair in itertools.combinations(indices, 2):\n",
    "        bond = ace.GetBondBetweenAtoms(pair[0], pair[1])\n",
    "        if bond is not None:\n",
    "            #atoms are bonded! keep the\n",
    "            bonds.append(bond.GetIdx())\n",
    "            \n",
    "    #now... we have the ACE atoms that we need to add!\n",
    "    #turn those into a separate molecule:\n",
    "    ats_to_add = Chem.PathToSubmol(ace, bonds)\n",
    "    add_conf = ats_to_add.GetConformer(0)\n",
    "    \n",
    "    ## the last step is cumbersome. We individually add each atom \n",
    "    ## in the ACE residue - alpha carbon, carbonyl O, and terminal methyl\n",
    "    ## we start by generating a read-write mol, and must set\n",
    "    ## the PDB info for each added atom, then add a bond, and set the \n",
    "    ## conformer positions. \n",
    "    \n",
    "    rwmol = Chem.RWMol(mol)\n",
    "    \n",
    "    ##build out the ACE, bond by bond. \n",
    "\n",
    "    #first the ACE alpha carbon. \n",
    "    ac = Chem.MolFromSmarts('[$([#6]=[#8])]')\n",
    "    idx = ats_to_add.GetSubstructMatch(ac)[0]\n",
    "    at = ats_to_add.GetAtomWithIdx(idx)\n",
    "\n",
    "    newinf = Chem.rdchem.AtomPDBResidueInfo()\n",
    "    newinf.SetResidueName(\"ACE\")\n",
    "    newinf.SetName(\" C  \")\n",
    "    newinf.SetResidueNumber(int(fetchResNum(nterm_atom))-1)\n",
    "    newinf.SetChainId('A')\n",
    "    at.SetPDBResidueInfo(newinf)\n",
    "\n",
    "    alpha_carbon_idx = rwmol.AddAtom(at)\n",
    "    rwmol.AddBond(alpha_carbon_idx, nterm_atom.GetIdx())\n",
    "    conformer = rwmol.GetConformer(0)\n",
    "    conformer.SetAtomPosition(alpha_carbon_idx, add_conf.GetAtomPosition(idx))\n",
    "\n",
    "\n",
    "    #fix the carbonyl next. \n",
    "    carbonyl = Chem.MolFromSmarts(\"[#8D1]\")\n",
    "    idx = ats_to_add.GetSubstructMatch(carbonyl)[0]\n",
    "    at = ats_to_add.GetAtomWithIdx(idx)\n",
    "\n",
    "    newinf = Chem.rdchem.AtomPDBResidueInfo()\n",
    "    newinf.SetResidueName(\"ACE\")\n",
    "    newinf.SetName(\" O  \")\n",
    "    newinf.SetResidueNumber(int(fetchResNum(nterm_atom))-1)\n",
    "    newinf.SetChainId('A')\n",
    "    at.SetPDBResidueInfo(newinf)\n",
    "\n",
    "    aidx = rwmol.AddAtom(at)\n",
    "    rwmol.AddBond(aidx, alpha_carbon_idx)\n",
    "    conformer = rwmol.GetConformer(0)\n",
    "    conformer.SetAtomPosition(aidx, add_conf.GetAtomPosition(idx))\n",
    "\n",
    "\n",
    "    #next the ACE beta carbon. \n",
    "    bc = Chem.MolFromSmarts('[$([#6D1]-[#6D2])]')\n",
    "    idx = ats_to_add.GetSubstructMatch(bc)[0]\n",
    "    at = ats_to_add.GetAtomWithIdx(idx)\n",
    "\n",
    "    newinf = Chem.rdchem.AtomPDBResidueInfo()\n",
    "    newinf.SetResidueName(\"ACE\")\n",
    "    newinf.SetName(\" CH3\")\n",
    "    newinf.SetResidueNumber(int(fetchResNum(nterm_atom))-1)\n",
    "    newinf.SetChainId('A')\n",
    "    at.SetPDBResidueInfo(newinf)\n",
    "\n",
    "    aidx = rwmol.AddAtom(at)\n",
    "    rwmol.AddBond(aidx, alpha_carbon_idx)\n",
    "    conformer = rwmol.GetConformer(0)\n",
    "    conformer.SetAtomPosition(aidx, add_conf.GetAtomPosition(idx))\n",
    "\n",
    "    return Chem.Mol(rwmol)\n",
    "\n",
    "def addCTerm(mol, residue, cterm_atom):\n",
    "    ## start by getting an actual ACE molecule - \n",
    "    ## with a two-atom overlap to it's parent.\n",
    "    namesdict = dict(zip([fetchAtName(at) for at in residue], range(len(residue))))\n",
    "    \n",
    "    #becuase it's C-term, we know this must be an NME.\n",
    "    nme = Chem.MolFromSmiles('N(C(C(N))=O)C')\n",
    "    nme = Chem.AddHs(nme)\n",
    "    \n",
    "    matching_atoms = Chem.MolFromSmarts('[#8D1]=[#6D3]-[#6]-[#7]')\n",
    "    \n",
    "    \n",
    "    ## embed the molecule in 3D, using the THREE matching\n",
    "    ## atoms as constraints.\n",
    "    #first, generate the constraint dict. \n",
    "    cmap = {}\n",
    "    nme_match_atoms = nme.GetSubstructMatch(matching_atoms)\n",
    "    for idx, name in zip(nme_match_atoms, ['O', 'C', 'CA', 'N']):\n",
    "        parent_idx = namesdict[name]\n",
    "        cmap[idx] = mol.GetConformer().GetAtomPosition(parent_idx)\n",
    "        \n",
    "    #embed 3d distances to match parent:\n",
    "    AllChem.EmbedMolecule(nme, coordMap=cmap, )\n",
    "\n",
    "    #align (transform and rotate) to the parent residue:\n",
    "    AllChem.AlignMol(\n",
    "        nme,\n",
    "        mol,\n",
    "        atomMap = list(zip(\n",
    "            nme_match_atoms, \n",
    "            [residue[namesdict[i]].GetIdx() for i in ['O', 'C', 'CA', 'N']]\n",
    "\n",
    "        )))\n",
    "    \n",
    "    ## now comes the joing process. We can remove some superfluos atoms\n",
    "    ## and the remaining atoms must be added. \n",
    "    ## note this is all hardcoded - the exact same atoms get added in every ACE,\n",
    "    ## because we only aligned to the backbone. \n",
    "\n",
    "    nme = Chem.RemoveHs(nme)\n",
    "    \n",
    "    #remove the superfluous atoms (i.e. duplicate H, C, N)\n",
    "    indices = set([i for i in range(nme.GetNumAtoms())])\n",
    "    indices_to_remove = set(nme_match_atoms)\n",
    "    indices = list(indices.difference(indices_to_remove))\n",
    "\n",
    "    #find the bond indices so we can do a chem.pathtosubmol\n",
    "    bonds = []\n",
    "    for pair in itertools.combinations(indices, 2):\n",
    "        bond = nme.GetBondBetweenAtoms(pair[0], pair[1])\n",
    "        if bond is not None:\n",
    "            #atoms are bonded! keep the\n",
    "            bonds.append(bond.GetIdx())\n",
    "            \n",
    "    #now... we have the NME atoms that we need to add!\n",
    "    #turn those into a separate molecule:\n",
    "    ats_to_add = Chem.PathToSubmol(nme, bonds)\n",
    "    add_conf = ats_to_add.GetConformer(0)\n",
    "    \n",
    "    ## the last step is cumbersome. We individually add each atom \n",
    "    ## in the ACE residue - alpha carbon, carbonyl O, and terminal methyl\n",
    "    ## we start by generating a read-write mol, and must set\n",
    "    ## the PDB info for each added atom, then add a bond, and set the \n",
    "    ## conformer positions. \n",
    "\n",
    "    rwmol = Chem.RWMol(mol)\n",
    "\n",
    "    ##build out the NME, bond by bond. \n",
    "    \n",
    "    #first, select to-be-added nitrogen.\n",
    "    nm = Chem.MolFromSmarts('[$([#7]-[#6])]')\n",
    "    idx = ats_to_add.GetSubstructMatch(nm)[0]\n",
    "    at = ats_to_add.GetAtomWithIdx(idx)\n",
    "\n",
    "    newinf = Chem.rdchem.AtomPDBResidueInfo()\n",
    "    newinf.SetResidueName(\"NME\")\n",
    "    newinf.SetName(\" N  \")\n",
    "    newinf.SetResidueNumber(int(fetchResNum(cterm_atom))+1)\n",
    "    newinf.SetChainId('A')\n",
    "    at.SetPDBResidueInfo(newinf)\n",
    "    \n",
    "    nmethyl_idx = rwmol.AddAtom(at)\n",
    "    rwmol.AddBond(nmethyl_idx, cterm_atom.GetIdx())\n",
    "    conformer = rwmol.GetConformer(0)\n",
    "    conformer.SetAtomPosition(nmethyl_idx, add_conf.GetAtomPosition(idx))\n",
    "\n",
    "\n",
    "    #fix the terminal methyl next. \n",
    "    methyl = Chem.MolFromSmarts(\"[#6]\")\n",
    "    idx = ats_to_add.GetSubstructMatch(methyl)[0]\n",
    "    at = ats_to_add.GetAtomWithIdx(idx)\n",
    "\n",
    "    newinf = Chem.rdchem.AtomPDBResidueInfo()\n",
    "    newinf.SetResidueName(\"NME\")\n",
    "    newinf.SetName(\" C  \")\n",
    "    newinf.SetResidueNumber(int(fetchResNum(cterm_atom))+1)\n",
    "    newinf.SetChainId('A')\n",
    "    at.SetPDBResidueInfo(newinf)\n",
    "\n",
    "    aidx = rwmol.AddAtom(at)\n",
    "    rwmol.AddBond(aidx, nmethyl_idx)\n",
    "    conformer = rwmol.GetConformer(0)\n",
    "    conformer.SetAtomPosition(aidx, add_conf.GetAtomPosition(idx))\n",
    "\n",
    "    return Chem.Mol(rwmol)\n",
    "\n",
    "def terminalCapsTest(residues):\n",
    "    for residue in residues:\n",
    "        resdict = dict(zip([fetchAtName(at) for at in residue] , residue))\n",
    "\n",
    "        nterm_atom = resdict['N']\n",
    "        cterm_atom = resdict['C']\n",
    "\n",
    "        #check nterminus:\n",
    "        nterm_neighbours = nterm_atom.GetNeighbors()\n",
    "        if 'C' not in [fetchAtName(at) for at in nterm_neighbours]:\n",
    "            print(fetchResName(nterm_atom), fetchResNum(nterm_atom), 'N-term is missing a neighbouring residue!')\n",
    "            return 'N', residue, nterm_atom\n",
    "\n",
    "        #check cterminus:\n",
    "        cterm_neighbours = cterm_atom.GetNeighbors()\n",
    "        if 'N' not in [fetchAtName(at) for at in cterm_neighbours]:\n",
    "            print(fetchResName(cterm_atom), fetchResNum(cterm_atom), 'C-term is missing a neighbouring residue!')\n",
    "            return 'C', residue, cterm_atom\n",
    "    return None\n",
    "\n",
    "def getResidues(mol):\n",
    "    residues = []\n",
    "    curr_resnum = -1000\n",
    "    residue = []\n",
    "\n",
    "    for atom in mol.GetAtoms():\n",
    "        pdb_info = atom.GetPDBResidueInfo()\n",
    "        resnum = pdb_info.GetResidueNumber()\n",
    "        resname = pdb_info.GetResidueName()\n",
    "        if resname in ['ACE', 'NME']:\n",
    "            continue\n",
    "\n",
    "        if resnum!=curr_resnum:\n",
    "            if len(residue)>0:\n",
    "                residues.append(residue)\n",
    "            residue = []\n",
    "            curr_resnum = resnum\n",
    "        else:\n",
    "            pass\n",
    "\n",
    "        residue.append(atom)\n",
    "    if len(residue)>0:\n",
    "        residues.append(residue)\n",
    "    return residues\n",
    "\n",
    "def fetchAtName(atom):\n",
    "    return atom.GetPDBResidueInfo().GetName().strip()\n",
    "def fetchResName(atom):\n",
    "    return atom.GetPDBResidueInfo().GetResidueName().strip()\n",
    "def fetchResNum(atom):\n",
    "    return atom.GetPDBResidueInfo().GetResidueNumber()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5db363ff-c103-48d5-ae85-a6ce08d73e26",
   "metadata": {},
   "outputs": [],
   "source": [
    "mol = Chem.MolFromPDBFile('../pocket.pdb', \n",
    "                          #removeHs=False\n",
    "                         )\n",
    "\n",
    "residues = getResidues(mol)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a0f07131-66da-447e-acd8-e2fc82913f96",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TYR 514 N-term is missing a neighbouring residue!\n",
      "110\n",
      "TYR 514 C-term is missing a neighbouring residue!\n",
      "LEU 625 N-term is missing a neighbouring residue!\n",
      "115\n",
      "LEU 625 C-term is missing a neighbouring residue!\n",
      "ASP 664 N-term is missing a neighbouring residue!\n",
      "120\n",
      "LEU 665 C-term is missing a neighbouring residue!\n",
      "SER 667 N-term is missing a neighbouring residue!\n",
      "125\n",
      "VAL 668 C-term is missing a neighbouring residue!\n",
      "ILE 682 N-term is missing a neighbouring residue!\n",
      "130\n",
      "TYR 683 C-term is missing a neighbouring residue!\n",
      "PHE 686 N-term is missing a neighbouring residue!\n",
      "135\n",
      "PHE 686 C-term is missing a neighbouring residue!\n",
      "MET 703 N-term is missing a neighbouring residue!\n",
      "140\n",
      "MET 703 C-term is missing a neighbouring residue!\n",
      "GLY 715 N-term is missing a neighbouring residue!\n",
      "145\n",
      "GLN 716 C-term is missing a neighbouring residue!\n",
      "PHE 719 N-term is missing a neighbouring residue!\n",
      "150\n",
      "PHE 719 C-term is missing a neighbouring residue!\n"
     ]
    }
   ],
   "source": [
    "while (test := terminalCapsTest(residues)) is not None:\n",
    "    end, residue, atom = test\n",
    "    resdict = dict(zip([fetchAtName(at) for at in residue] , residue))\n",
    "    \n",
    "    if end=='N':\n",
    "        nterm_atom = atom\n",
    "        mol = addNTerm(mol, residue, nterm_atom)\n",
    "        print(mol.GetNumBonds())\n",
    "    if end=='C':\n",
    "        cterm_atom = atom\n",
    "        mol = addCTerm(mol, residue, cterm_atom)\n",
    "    residues = getResidues(mol)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "007581b0-d731-4167-bd09-03d626b1bd78",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A A A A A A A A A A A A B B B B B B B B C C C C C C C C C C C C C C C C D D D D D D D D D D D D D E E E E E E E E E E E E E E E E E E E E F F F F F F F F F F F G G G G G G G G H H H H H H H H H H H H H I I I I I I I I I I I A A A A A B B B B B C C C C C D D D D D E E E E E F F F F F G G G G G H H H H H I I I I I "
     ]
    }
   ],
   "source": [
    "# reset the residue information\n",
    "# and make each bonded component a separate chain\n",
    "adj = Chem.GetAdjacencyMatrix(mol)\n",
    "\n",
    "nchains, chain_indices = scipy.sparse.csgraph.connected_components(adj)\n",
    "\n",
    "chains = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'\n",
    "\n",
    "for atom, chain_id in zip(mol.GetAtoms(), chain_indices):\n",
    "    pdb_info = atom.GetPDBResidueInfo()\n",
    "    chain = chains[chain_id]\n",
    "    \n",
    "    pdb_info.SetChainId(chain)\n",
    "    print(pdb_info.GetChainId(), end=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "96ce35ae-85f1-45d8-8f09-f15b6069950f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "new_order = pd.DataFrame({'resid':[fetchResNum(at) for at in mol.GetAtoms()],\n",
    "             'atnum':range(mol.GetNumAtoms())}).sort_values(['resid', 'atnum'])['atnum']\n",
    "\n",
    "reordered = Chem.RenumberAtoms(mol, [int(i) for i in new_order])\n",
    "Chem.MolToPDBFile(reordered,'pdb3.pdb', )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "fdbb0f0a-d709-4d98-87dd-64d7f055ce1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from openmm import app\n",
    "from openmm import unit\n",
    "import openmm\n",
    "import pdbfixer\n",
    "fixer = pdbfixer.PDBFixer('./pdb3.pdb')\n",
    "#fixer.findMissingResidues()\n",
    "#fixer.findMissingAtoms()\n",
    "#fixer.addMissingAtoms()\n",
    "fixer.addMissingHydrogens(7.0)\n",
    "app.PDBFile.writeFile(fixer.topology, fixer.positions, open('output.pdb', 'w'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "23bf57ee-a372-48ff-b4a4-0296a2b4b539",
   "metadata": {},
   "outputs": [],
   "source": [
    "forcefield = app.ForceField('amber14-all.xml', 'amber14/tip3pfb.xml')\n",
    "system = forcefield.createSystem(\n",
    "        fixer.topology,\n",
    "        nonbondedMethod=app.CutoffNonPeriodic,\n",
    "        nonbondedCutoff=0.9*unit.nanometer\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "a8c79fa3-c7bb-4e64-b351-43c755abd845",
   "metadata": {},
   "outputs": [],
   "source": [
    "integrator = openmm.LangevinIntegrator(\n",
    "        298*unit.kelvin,\n",
    "        1/unit.picosecond,\n",
    "        1*unit.femtosecond\n",
    "    )\n",
    "platform = openmm.Platform.getPlatformByName('CPU')\n",
    "simulation = app.Simulation(fixer.topology, system, integrator, platform)\n",
    "simulation.context.setPositions(fixer.positions)\n",
    "simulation.minimizeEnergy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "1768799b-7f60-4ced-b196-240e8b100913",
   "metadata": {},
   "outputs": [],
   "source": [
    "simulation.step(1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "3190e0be-b661-4be5-93d2-d0874a7a990c",
   "metadata": {},
   "outputs": [],
   "source": [
    "app.PDBFile.writeFile(fixer.topology,\n",
    "                      simulation.context.getState(getPositions=True).getPositions(),\n",
    "                      open('./minimized.pdb', 'w'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64a41e3d-cd0c-4028-998b-98f94cd46658",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "d25c26dc-715e-4083-ab7b-0af4b356ae37",
	"metadata": {},
	"outputs": [],
	"source": [
	"from rdkit import Chem\n",
	"from rdkit.Chem import AllChem\n",
	"from rdkit.Geometry import rdGeometry\n",
	"\n",
	"import itertools\n",
	"import scipy "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "4b22d53c-80e1-4af9-bb08-b08c47b72af8",
	"metadata": {},
	"outputs": [],
	"source": [
	"def addNTerm(mol, residue, nterm_atom):\n",
	" \"\"\"\n",
	" Hardcoded addition of n-terminal ACE residue. \n",
	" Performs no error checking at all. \n",
	" Expects that the existing residue conforms to PDB spec\n",
	" for amino acids (in particular, the backbone must be N-C-CA!)\n",
	" \n",
	" \"\"\"\n",
	" \n",
	" ## start by getting an actual ACE molecule - \n",
	" ## with a two-atom overlap to it's parent.\n",
	" namesdict = dict(zip([fetchAtName(at) for at in residue], range(len(residue))))\n",
	" #becuase it's N-term, we know this must be an ACE.\n",
	" ace = Chem.MolFromSmiles('O=C(C)[N@@H]C')\n",
	" ace = Chem.AddHs(ace)\n",
	"\n",
	" #order is N-C. The N comes first. \n",
	" matching_atoms = Chem.MolFromSmarts('[#7D3]-[#6D4]')\n",
	" ace.GetSubstructMatch(matching_atoms)\n",
	" \n",
	" ## embed the molecule in 3D, using the two matching\n",
	" ## atoms as constraints.\n",
	" #first, generate the constraint dict. \n",
	" cmap = {}\n",
	" ace_match_atoms = ace.GetSubstructMatch(matching_atoms)\n",
	" for idx, name in zip(ace_match_atoms, ['N', 'CA']):\n",
	" #cmap[idx] = mol.GetConformer().GetAtomPosition(idx)\n",
	" parent_idx = namesdict[name]\n",
	" cmap[idx] = mol.GetConformer().GetAtomPosition(parent_idx)\n",
	" \n",
	" #embed 3d distances to match parent:\n",
	" AllChem.EmbedMolecule(ace, coordMap=cmap, enforceChirality=True)\n",
	"\n",
	" #align (transform and rotate) to the parent residue:\n",
	" AllChem.AlignMol(\n",
	" ace,\n",
	" mol,\n",
	" atomMap = list(zip(\n",
	" ace_match_atoms, \n",
	" [residue[namesdict['N']].GetIdx(), residue[namesdict['CA']].GetIdx()]\n",
	" )))\n",
	" \n",
	" \n",
	" ## now comes the joing process. We can remove some superfluos atoms\n",
	" ## and the remaining atoms must be added. \n",
	" ## note this is all hardcoded - the exact same atoms get added in every ACE,\n",
	" ## because we only aligned to the backbone. \n",
	" \n",
	" ace = Chem.RemoveHs(ace)\n",
	" \n",
	" #remove the superfluous atoms (i.e. duplicate H, C, N)\n",
	" indices = set([i for i in range(ace.GetNumAtoms())])\n",
	" indices_to_remove = set(ace_match_atoms)\n",
	" indices = list(indices.difference(indices_to_remove))\n",
	" \n",
	" #find the bond indices so we can do a chem.pathtosubmol\n",
	" bonds = []\n",
	" for pair in itertools.combinations(indices, 2):\n",
	" bond = ace.GetBondBetweenAtoms(pair[0], pair[1])\n",
	" if bond is not None:\n",
	" #atoms are bonded! keep the\n",
	" bonds.append(bond.GetIdx())\n",
	" \n",
	" #now... we have the ACE atoms that we need to add!\n",
	" #turn those into a separate molecule:\n",
	" ats_to_add = Chem.PathToSubmol(ace, bonds)\n",
	" add_conf = ats_to_add.GetConformer(0)\n",
	" \n",
	" ## the last step is cumbersome. We individually add each atom \n",
	" ## in the ACE residue - alpha carbon, carbonyl O, and terminal methyl\n",
	" ## we start by generating a read-write mol, and must set\n",
	" ## the PDB info for each added atom, then add a bond, and set the \n",
	" ## conformer positions. \n",
	" \n",
	" rwmol = Chem.RWMol(mol)\n",
	" \n",
	" ##build out the ACE, bond by bond. \n",
	"\n",
	" #first the ACE alpha carbon. \n",
	" ac = Chem.MolFromSmarts('[$([#6]=[#8])]')\n",
	" idx = ats_to_add.GetSubstructMatch(ac)[0]\n",
	" at = ats_to_add.GetAtomWithIdx(idx)\n",
	"\n",
	" newinf = Chem.rdchem.AtomPDBResidueInfo()\n",
	" newinf.SetResidueName(\"ACE\")\n",
	" newinf.SetName(\" C \")\n",
	" newinf.SetResidueNumber(int(fetchResNum(nterm_atom))-1)\n",
	" newinf.SetChainId('A')\n",
	" at.SetPDBResidueInfo(newinf)\n",
	"\n",
	" alpha_carbon_idx = rwmol.AddAtom(at)\n",
	" rwmol.AddBond(alpha_carbon_idx, nterm_atom.GetIdx())\n",
	" conformer = rwmol.GetConformer(0)\n",
	" conformer.SetAtomPosition(alpha_carbon_idx, add_conf.GetAtomPosition(idx))\n",
	"\n",
	"\n",
	" #fix the carbonyl next. \n",
	" carbonyl = Chem.MolFromSmarts(\"[#8D1]\")\n",
	" idx = ats_to_add.GetSubstructMatch(carbonyl)[0]\n",
	" at = ats_to_add.GetAtomWithIdx(idx)\n",
	"\n",
	" newinf = Chem.rdchem.AtomPDBResidueInfo()\n",
	" newinf.SetResidueName(\"ACE\")\n",
	" newinf.SetName(\" O \")\n",
	" newinf.SetResidueNumber(int(fetchResNum(nterm_atom))-1)\n",
	" newinf.SetChainId('A')\n",
	" at.SetPDBResidueInfo(newinf)\n",
	"\n",
	" aidx = rwmol.AddAtom(at)\n",
	" rwmol.AddBond(aidx, alpha_carbon_idx)\n",
	" conformer = rwmol.GetConformer(0)\n",
	" conformer.SetAtomPosition(aidx, add_conf.GetAtomPosition(idx))\n",
	"\n",
	"\n",
	" #next the ACE beta carbon. \n",
	" bc = Chem.MolFromSmarts('[$([#6D1]-[#6D2])]')\n",
	" idx = ats_to_add.GetSubstructMatch(bc)[0]\n",
	" at = ats_to_add.GetAtomWithIdx(idx)\n",
	"\n",
	" newinf = Chem.rdchem.AtomPDBResidueInfo()\n",
	" newinf.SetResidueName(\"ACE\")\n",
	" newinf.SetName(\" CH3\")\n",
	" newinf.SetResidueNumber(int(fetchResNum(nterm_atom))-1)\n",
	" newinf.SetChainId('A')\n",
	" at.SetPDBResidueInfo(newinf)\n",
	"\n",
	" aidx = rwmol.AddAtom(at)\n",
	" rwmol.AddBond(aidx, alpha_carbon_idx)\n",
	" conformer = rwmol.GetConformer(0)\n",
	" conformer.SetAtomPosition(aidx, add_conf.GetAtomPosition(idx))\n",
	"\n",
	" return Chem.Mol(rwmol)\n",
	"\n",
	"def addCTerm(mol, residue, cterm_atom):\n",
	" ## start by getting an actual ACE molecule - \n",
	" ## with a two-atom overlap to it's parent.\n",
	" namesdict = dict(zip([fetchAtName(at) for at in residue], range(len(residue))))\n",
	" \n",
	" #becuase it's C-term, we know this must be an NME.\n",
	" nme = Chem.MolFromSmiles('N(C(C(N))=O)C')\n",
	" nme = Chem.AddHs(nme)\n",
	" \n",
	" matching_atoms = Chem.MolFromSmarts('[#8D1]=[#6D3]-[#6]-[#7]')\n",
	" \n",
	" \n",
	" ## embed the molecule in 3D, using the THREE matching\n",
	" ## atoms as constraints.\n",
	" #first, generate the constraint dict. \n",
	" cmap = {}\n",
	" nme_match_atoms = nme.GetSubstructMatch(matching_atoms)\n",
	" for idx, name in zip(nme_match_atoms, ['O', 'C', 'CA', 'N']):\n",
	" parent_idx = namesdict[name]\n",
	" cmap[idx] = mol.GetConformer().GetAtomPosition(parent_idx)\n",
	" \n",
	" #embed 3d distances to match parent:\n",
	" AllChem.EmbedMolecule(nme, coordMap=cmap, )\n",
	"\n",
	" #align (transform and rotate) to the parent residue:\n",
	" AllChem.AlignMol(\n",
	" nme,\n",
	" mol,\n",
	" atomMap = list(zip(\n",
	" nme_match_atoms, \n",
	" [residue[namesdict[i]].GetIdx() for i in ['O', 'C', 'CA', 'N']]\n",
	"\n",
	" )))\n",
	" \n",
	" ## now comes the joing process. We can remove some superfluos atoms\n",
	" ## and the remaining atoms must be added. \n",
	" ## note this is all hardcoded - the exact same atoms get added in every ACE,\n",
	" ## because we only aligned to the backbone. \n",
	"\n",
	" nme = Chem.RemoveHs(nme)\n",
	" \n",
	" #remove the superfluous atoms (i.e. duplicate H, C, N)\n",
	" indices = set([i for i in range(nme.GetNumAtoms())])\n",
	" indices_to_remove = set(nme_match_atoms)\n",
	" indices = list(indices.difference(indices_to_remove))\n",
	"\n",
	" #find the bond indices so we can do a chem.pathtosubmol\n",
	" bonds = []\n",
	" for pair in itertools.combinations(indices, 2):\n",
	" bond = nme.GetBondBetweenAtoms(pair[0], pair[1])\n",
	" if bond is not None:\n",
	" #atoms are bonded! keep the\n",
	" bonds.append(bond.GetIdx())\n",
	" \n",
	" #now... we have the NME atoms that we need to add!\n",
	" #turn those into a separate molecule:\n",
	" ats_to_add = Chem.PathToSubmol(nme, bonds)\n",
	" add_conf = ats_to_add.GetConformer(0)\n",
	" \n",
	" ## the last step is cumbersome. We individually add each atom \n",
	" ## in the ACE residue - alpha carbon, carbonyl O, and terminal methyl\n",
	" ## we start by generating a read-write mol, and must set\n",
	" ## the PDB info for each added atom, then add a bond, and set the \n",
	" ## conformer positions. \n",
	"\n",
	" rwmol = Chem.RWMol(mol)\n",
	"\n",
	" ##build out the NME, bond by bond. \n",
	" \n",
	" #first, select to-be-added nitrogen.\n",
	" nm = Chem.MolFromSmarts('[$([#7]-[#6])]')\n",
	" idx = ats_to_add.GetSubstructMatch(nm)[0]\n",
	" at = ats_to_add.GetAtomWithIdx(idx)\n",
	"\n",
	" newinf = Chem.rdchem.AtomPDBResidueInfo()\n",
	" newinf.SetResidueName(\"NME\")\n",
	" newinf.SetName(\" N \")\n",
	" newinf.SetResidueNumber(int(fetchResNum(cterm_atom))+1)\n",
	" newinf.SetChainId('A')\n",
	" at.SetPDBResidueInfo(newinf)\n",
	" \n",
	" nmethyl_idx = rwmol.AddAtom(at)\n",
	" rwmol.AddBond(nmethyl_idx, cterm_atom.GetIdx())\n",
	" conformer = rwmol.GetConformer(0)\n",
	" conformer.SetAtomPosition(nmethyl_idx, add_conf.GetAtomPosition(idx))\n",
	"\n",
	"\n",
	" #fix the terminal methyl next. \n",
	" methyl = Chem.MolFromSmarts(\"[#6]\")\n",
	" idx = ats_to_add.GetSubstructMatch(methyl)[0]\n",
	" at = ats_to_add.GetAtomWithIdx(idx)\n",
	"\n",
	" newinf = Chem.rdchem.AtomPDBResidueInfo()\n",
	" newinf.SetResidueName(\"NME\")\n",
	" newinf.SetName(\" C \")\n",
	" newinf.SetResidueNumber(int(fetchResNum(cterm_atom))+1)\n",
	" newinf.SetChainId('A')\n",
	" at.SetPDBResidueInfo(newinf)\n",
	"\n",
	" aidx = rwmol.AddAtom(at)\n",
	" rwmol.AddBond(aidx, nmethyl_idx)\n",
	" conformer = rwmol.GetConformer(0)\n",
	" conformer.SetAtomPosition(aidx, add_conf.GetAtomPosition(idx))\n",
	"\n",
	" return Chem.Mol(rwmol)\n",
	"\n",
	"def terminalCapsTest(residues):\n",
	" for residue in residues:\n",
	" resdict = dict(zip([fetchAtName(at) for at in residue] , residue))\n",
	"\n",
	" nterm_atom = resdict['N']\n",
	" cterm_atom = resdict['C']\n",
	"\n",
	" #check nterminus:\n",
	" nterm_neighbours = nterm_atom.GetNeighbors()\n",
	" if 'C' not in [fetchAtName(at) for at in nterm_neighbours]:\n",
	" print(fetchResName(nterm_atom), fetchResNum(nterm_atom), 'N-term is missing a neighbouring residue!')\n",
	" return 'N', residue, nterm_atom\n",
	"\n",
	" #check cterminus:\n",
	" cterm_neighbours = cterm_atom.GetNeighbors()\n",
	" if 'N' not in [fetchAtName(at) for at in cterm_neighbours]:\n",
	" print(fetchResName(cterm_atom), fetchResNum(cterm_atom), 'C-term is missing a neighbouring residue!')\n",
	" return 'C', residue, cterm_atom\n",
	" return None\n",
	"\n",
	"def getResidues(mol):\n",
	" residues = []\n",
	" curr_resnum = -1000\n",
	" residue = []\n",
	"\n",
	" for atom in mol.GetAtoms():\n",
	" pdb_info = atom.GetPDBResidueInfo()\n",
	" resnum = pdb_info.GetResidueNumber()\n",
	" resname = pdb_info.GetResidueName()\n",
	" if resname in ['ACE', 'NME']:\n",
	" continue\n",
	"\n",
	" if resnum!=curr_resnum:\n",
	" if len(residue)>0:\n",
	" residues.append(residue)\n",
	" residue = []\n",
	" curr_resnum = resnum\n",
	" else:\n",
	" pass\n",
	"\n",
	" residue.append(atom)\n",
	" if len(residue)>0:\n",
	" residues.append(residue)\n",
	" return residues\n",
	"\n",
	"def fetchAtName(atom):\n",
	" return atom.GetPDBResidueInfo().GetName().strip()\n",
	"def fetchResName(atom):\n",
	" return atom.GetPDBResidueInfo().GetResidueName().strip()\n",
	"def fetchResNum(atom):\n",
	" return atom.GetPDBResidueInfo().GetResidueNumber()\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "5db363ff-c103-48d5-ae85-a6ce08d73e26",
	"metadata": {},
	"outputs": [],
	"source": [
	"mol = Chem.MolFromPDBFile('../pocket.pdb', \n",
	" #removeHs=False\n",
	" )\n",
	"\n",
	"residues = getResidues(mol)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "a0f07131-66da-447e-acd8-e2fc82913f96",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"TYR 514 N-term is missing a neighbouring residue!\n",
	"110\n",
	"TYR 514 C-term is missing a neighbouring residue!\n",
	"LEU 625 N-term is missing a neighbouring residue!\n",
	"115\n",
	"LEU 625 C-term is missing a neighbouring residue!\n",
	"ASP 664 N-term is missing a neighbouring residue!\n",
	"120\n",
	"LEU 665 C-term is missing a neighbouring residue!\n",
	"SER 667 N-term is missing a neighbouring residue!\n",
	"125\n",
	"VAL 668 C-term is missing a neighbouring residue!\n",
	"ILE 682 N-term is missing a neighbouring residue!\n",
	"130\n",
	"TYR 683 C-term is missing a neighbouring residue!\n",
	"PHE 686 N-term is missing a neighbouring residue!\n",
	"135\n",
	"PHE 686 C-term is missing a neighbouring residue!\n",
	"MET 703 N-term is missing a neighbouring residue!\n",
	"140\n",
	"MET 703 C-term is missing a neighbouring residue!\n",
	"GLY 715 N-term is missing a neighbouring residue!\n",
	"145\n",
	"GLN 716 C-term is missing a neighbouring residue!\n",
	"PHE 719 N-term is missing a neighbouring residue!\n",
	"150\n",
	"PHE 719 C-term is missing a neighbouring residue!\n"
	]
	}
	],
	"source": [
	"while (test := terminalCapsTest(residues)) is not None:\n",
	" end, residue, atom = test\n",
	" resdict = dict(zip([fetchAtName(at) for at in residue] , residue))\n",
	" \n",
	" if end=='N':\n",
	" nterm_atom = atom\n",
	" mol = addNTerm(mol, residue, nterm_atom)\n",
	" print(mol.GetNumBonds())\n",
	" if end=='C':\n",
	" cterm_atom = atom\n",
	" mol = addCTerm(mol, residue, cterm_atom)\n",
	" residues = getResidues(mol)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "007581b0-d731-4167-bd09-03d626b1bd78",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"A A A A A A A A A A A A B B B B B B B B C C C C C C C C C C C C C C C C D D D D D D D D D D D D D E E E E E E E E E E E E E E E E E E E E F F F F F F F F F F F G G G G G G G G H H H H H H H H H H H H H I I I I I I I I I I I A A A A A B B B B B C C C C C D D D D D E E E E E F F F F F G G G G G H H H H H I I I I I "
	]
	}
	],
	"source": [
	"# reset the residue information\n",
	"# and make each bonded component a separate chain\n",
	"adj = Chem.GetAdjacencyMatrix(mol)\n",
	"\n",
	"nchains, chain_indices = scipy.sparse.csgraph.connected_components(adj)\n",
	"\n",
	"chains = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'\n",
	"\n",
	"for atom, chain_id in zip(mol.GetAtoms(), chain_indices):\n",
	" pdb_info = atom.GetPDBResidueInfo()\n",
	" chain = chains[chain_id]\n",
	" \n",
	" pdb_info.SetChainId(chain)\n",
	" print(pdb_info.GetChainId(), end=' ')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"id": "96ce35ae-85f1-45d8-8f09-f15b6069950f",
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"new_order = pd.DataFrame({'resid':[fetchResNum(at) for at in mol.GetAtoms()],\n",
	" 'atnum':range(mol.GetNumAtoms())}).sort_values(['resid', 'atnum'])['atnum']\n",
	"\n",
	"reordered = Chem.RenumberAtoms(mol, [int(i) for i in new_order])\n",
	"Chem.MolToPDBFile(reordered,'pdb3.pdb', )"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"id": "fdbb0f0a-d709-4d98-87dd-64d7f055ce1e",
	"metadata": {},
	"outputs": [],
	"source": [
	"from openmm import app\n",
	"from openmm import unit\n",
	"import openmm\n",
	"import pdbfixer\n",
	"fixer = pdbfixer.PDBFixer('./pdb3.pdb')\n",
	"#fixer.findMissingResidues()\n",
	"#fixer.findMissingAtoms()\n",
	"#fixer.addMissingAtoms()\n",
	"fixer.addMissingHydrogens(7.0)\n",
	"app.PDBFile.writeFile(fixer.topology, fixer.positions, open('output.pdb', 'w'))\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"id": "23bf57ee-a372-48ff-b4a4-0296a2b4b539",
	"metadata": {},
	"outputs": [],
	"source": [
	"forcefield = app.ForceField('amber14-all.xml', 'amber14/tip3pfb.xml')\n",
	"system = forcefield.createSystem(\n",
	" fixer.topology,\n",
	" nonbondedMethod=app.CutoffNonPeriodic,\n",
	" nonbondedCutoff=0.9*unit.nanometer\n",
	" )"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"id": "a8c79fa3-c7bb-4e64-b351-43c755abd845",
	"metadata": {},
	"outputs": [],
	"source": [
	"integrator = openmm.LangevinIntegrator(\n",
	" 298*unit.kelvin,\n",
	" 1/unit.picosecond,\n",
	" 1*unit.femtosecond\n",
	" )\n",
	"platform = openmm.Platform.getPlatformByName('CPU')\n",
	"simulation = app.Simulation(fixer.topology, system, integrator, platform)\n",
	"simulation.context.setPositions(fixer.positions)\n",
	"simulation.minimizeEnergy()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"id": "1768799b-7f60-4ced-b196-240e8b100913",
	"metadata": {},
	"outputs": [],
	"source": [
	"simulation.step(1000)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"id": "3190e0be-b661-4be5-93d2-d0874a7a990c",
	"metadata": {},
	"outputs": [],
	"source": [
	"app.PDBFile.writeFile(fixer.topology,\n",
	" simulation.context.getState(getPositions=True).getPositions(),\n",
	" open('./minimized.pdb', 'w'))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "64a41e3d-cd0c-4028-998b-98f94cd46658",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.10.13"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}