Created
June 10, 2024 12:51
-
-
Save matteoferla/b517f1a9b6c196e1312ce1511ad0407c to your computer and use it in GitHub Desktop.
MMFF94 AtomTypes from RDKit
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
The values from https://towhee.sourceforge.net/forcefields/mmff94.html
were extracted and compared to https://raw.githubusercontent.com/rdkit/rdkit/master/Code/GraphMol/ForceFieldHelpers/MMFF/AtomTyper.cpp
NB. Do not ask ChatGPT-4 for the atom types as they will be wrong.
""" | |
# Imported here because the annotations below are evaluated at import time and
# these names are needed before the file's later import section is reached.
from typing import Dict, List

# MMFF94 numeric atom type -> {symbolic type name: description}.
# Keys are ints so that the numbers returned by ``GetMMFFAtomType`` can be used
# for lookup directly. The symbolic names keep their trailing padding and the
# descriptions are kept verbatim from the transcribed table.
# NOTE: only the numeric types listed here are covered; looking up any other
# type number raises ``KeyError``.
atomtype2namedex: Dict[int, Dict[str, str]] = {
    1: {"CR ": "ALKYL CARBON, SP3"},
    2: {"C=C ": "VINYLIC CARBON, SP2", "CSP2": "GENERIC SP2 CARBON"},
    3: {
        "C=O ": "GENERAL CARBONYL CARBON",
        "C=N ": "SP2 CARBON IN C=N",
        "CGD ": "GUANIDINE CARBON, DOUBLY BONDED TO N",
        "C=OR": "KETONE OR ALDEHYDE CARBONYL CARBON",
        "C=ON": "AMIDE CARBONYL CARBON",
        "CONN": "UREA CARBONYL CARBON",
        "COO ": "CARBOXYLIC ACID OR ESTER CARBONYL CARBON",
        "COON": "CARBAMATE CARBONYL CARBON",
        "COOO": "C ARBONIC ACID OR ESTER CARBONYL CARBON",
        "C=OS": "THIOESTER CARBONYL CARBON, DOUBLE BONDED TO O",
        "C=S ": "THIOESTER CARBON, DOUBLY BONDED TO S",
        "C=SN": "THIOAMIDE, CARBON, DOUBLY BONDED TO S",
        "CSO2": "CARBON IN >C=SO2",
        "CS=O": "CARBON IN >C=S=O (SULFINYL GROUP)",
        "CSS ": "THIOCARBOXYLIC ACID OR ESTER CARBONYL CARBON",
        "C=P ": "CARBON DOUBLE BONDED TO PHOSPHOROUS",
    },
    4: {"CSP ": "ACETYLENIC CARBON", "=C= ": "ALLENIC CARBON"},
    5: {"HC ": "H ATTACHED TO C", "HSI ": "H ATTACHED TO SI"},
    11: {"F ": "FLUORINE"},
    12: {"CL ": "CHLORINE"},
    13: {"BR ": "BROMINE"},
    14: {"I ": "IODINE"},
    20: {"CR4R": "CARBON IN 4-MEMBERED RINGS"},
    21: {
        "HOR ": "HYDROGEN IN ALCOHOLS",
        "HO ": "GENERAL H ON OXYGEN",
        "HOM ": "HYDROGEN IN HYDROXIDE ANION",
    },
    22: {"CR3R": "CARBON IN A 3-MEMBERED RING"},
    23: {
        "HNR ": "H-N(SP3)",
        "H3N ": "H-N(SP3), AMMONIA",
        "HPYL": "H-N IN PYRROLE",
        "HNOX": "H-N IN IN A N-OXIDE",
        "HNM ": "H ON DICOORD, NEGATIVELY CHARGED NITROGEN",
        "HN ": "GENERAL H ON NITROGEN",
    },
    24: {
        "HOCO": "H-O IN CARBOXYLIC ACIDS",
        "HOP ": "HYDROGEN ON OXYGEN ATTACHED TO PHOSPHOROUS",
    },
    27: {"HN=N": "AZO HYDROGEN", "HN=C": "IMINE HYDROGEN"},
    28: {
        "HNCO": "AMIDE HYDROGEN",
        "HNCS": "THIOAMIDE HYDROGEN",
        "HNCC": "H-N IN ENAMINES",
        "HNCN": "H-N IN H-N-C=N",
        "HNNC": "H-N IN H-N-N=C",
        "HNNN": "H-N IN H-N-N=N",
        "HNSO": "H-N IN SULFONAMIDE",
        "HNPO": "H-N IN PHOSPHONAMIDE",
        "HNC%": "HYDROGEN ON N ATTACHED TO TRIPLY BONDED CARBON",
        "HSP2": "GENERAL H ON SP2 NITROGEN",
    },
    29: {"HOCC": "H-O IN ENOLS AND PHENOLS", "HOCN": "H-O IN HO-C=N"},
    30: {"CE4R": "OLEFINIC CARBON IN 4-MEMBERED RINGS"},
    31: {"HOH ": "HYDROGEN IN H2O"},
    33: {"HOS ": "H ON OXYGEN ATTACHED TO SULFUR"},
    36: {
        "HNR+": "H ON QUATERNARY NITROGEN",
        "HIM+": "H ON IMIDAZOLIUM-TYPE NITROGEN",
        "HPD+": "H ON PROTONATED PYRIDINE NITROGEN",
        "HNN+": "H ON AMIDINIUM-TYPE NITROGEN",
        "HNC+": "H ON PROTONATED IMINE NITROGEN",
        "HGD+": "H ON GUANIDINIUM-TYPE NITROGEN",
        "HN5+": "H ON N5+, N5A+ OR N5B+",
    },
    37: {"CB ": "CARBON AS IN BENZENE, PYRROLE"},
    41: {
        "CO2M": "CARBOXYLATE ANION CARBON (base charge -0.5)",
        "CS2M": "CARBON IN THIOCARBOXYLATE ANION",
    },
    50: {"HO+ ": "HYDROGEN ON O+ OXYGEN"},
    52: {"HO=+": "HYDROGEN ON OXENIUM OXYGEN"},
    57: {
        "CGD+": "GUANIDINIUM CARBON",
        "CNN+": "C IN +N=C-N RESONANCE STRUCTURES",
    },
    60: {"C% ": "ISONITRILE CARBON"},
    63: {"C5A ": "ALPHA CARBON IN 5-MEMBERED HETEROAROMATIC RING"},
    64: {"C5B ": "BETA CARBON IN 5-MEMBERED HETEROAROMATIC RING"},
    71: {
        "HS ": "H ATTACHED TO DIVALENT, DICOORDINATE S",
        "HS=N": "H ATTACHED TO TETRAVALENT, TRICOODR S DBL BONDED",
        "HP ": "H ATTACHED TO TRI- OR TETRACOORDINATE PHOSPHORUS",
    },
    77: {"CLO4": "CHLORINE IN PERCHLORATE ANION, CLO4(-)"},
    78: {"C5 ": "GENERAL CARBON IN 5-MEMBERED HETEROAROMATIC RING"},
    80: {"CIM+": "C IN N-C-N IN IMIDAZOLIUM ION"},
    87: {"FE+2": "IRON +2 CATION"},
    88: {"FE+3": "IROM +3 CATION"},
    89: {"F- ": "FLUORIDE ANION"},
    90: {"CL- ": "CHLORIDE ANION"},
    91: {"BR- ": "BROMIDE ANION"},
    96: {"CA+2": "DIPOSITIVE CALCIUM"},
    97: {"CU+1": "MONOPOSITIVE COPPER"},
    98: {"CU+2": "DIPOSITIVE COPPER"},
}

# MMFF94 numeric atom type -> element name (the values are full element names
# such as "Carbon", despite the variable being called "symbol").
atomtype2symbol: Dict[int, str] = {
    1: "Carbon",
    2: "Carbon",
    3: "Carbon",
    4: "Carbon",
    5: "Hydrogen",
    11: "Fluorine",
    12: "Chlorine",
    13: "Bromine",
    14: "Iodine",
    20: "Carbon",
    21: "Hydrogen",
    22: "Carbon",
    23: "Hydrogen",
    24: "Hydrogen",
    27: "Hydrogen",
    28: "Hydrogen",
    29: "Hydrogen",
    30: "Carbon",
    31: "Hydrogen",
    33: "Hydrogen",
    36: "Hydrogen",
    37: "Carbon",
    41: "Carbon",
    50: "Hydrogen",
    52: "Hydrogen",
    57: "Carbon",
    60: "Carbon",
    63: "Carbon",
    64: "Carbon",
    71: "Hydrogen",
    77: "Chlorine",
    78: "Carbon",
    80: "Carbon",
    87: "Iron",
    88: "Iron",
    89: "Fluorine",
    90: "Chlorine",
    91: "Bromine",
    96: "Calcium",
    97: "Copper",
    98: "Copper",
}

# MMFF94 numeric atom type -> list of symbolic names with the padding stripped.
atomtype2shortnames: Dict[int, List[str]] = {
    number: [name.strip() for name in names]
    for number, names in atomtype2namedex.items()
}

# MMFF94 numeric atom type -> the same stripped names joined with '|'.
atomtype2shortname: Dict[int, str] = {
    number: '|'.join(names) for number, names in atomtype2shortnames.items()
}
# ------------------------------------------------------------------ | |
from rdkit import Chem | |
from rdkit.Chem import AllChem | |
from typing import Sequence | |
def get_atomtypes(mol: Chem.Mol) -> Sequence[int]:
    """
    Return the MMFF94 numeric atom types of the atoms of ``mol``.

    Hydrogens are added to a copy of the molecule before typing, but only the
    types of the atoms present in the input are returned, in atom-index order.
    The numbers can be used as keys of ``atomtype2shortnames``, say.

    :param mol: the molecule to type; it is not modified.
    :return: a tuple with one MMFF94 atom type number per atom of ``mol``.
    :raises ValueError: if RDKit cannot set up MMFF94 properties for ``mol``.
    """
    n_atoms = mol.GetNumAtoms()
    # AddHs returns a new molecule with the hydrogens appended after the
    # existing atoms, so indices 0..n_atoms-1 still refer to the input atoms.
    hydrogenated = Chem.AddHs(mol)
    # No conformer is embedded: the atom types are assigned from the
    # molecular graph, so 3D coordinates are not required.
    props = AllChem.MMFFGetMoleculeProperties(hydrogenated, 'MMFF94')
    if props is None:
        # RDKit signals an untypeable molecule by returning None.
        raise ValueError('MMFF94 atom types could not be assigned to the molecule')
    # GetMMFFAtomType returns a number, not a symbolic name.
    return tuple(props.GetMMFFAtomType(idx) for idx in range(n_atoms))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment