Created
December 8, 2021 13:19
-
-
Save adalke/e62df8774032560fef750fa9c88b6516 to your computer and use it in GitHub Desktop.
Program to generate SMILES strings for short chain chlorinated paraffins (SCCP)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Program to generate SMILES strings for short chain chlorinated paraffins (SCCP). | |
# These are alkanes with the molecular formula C_{x} H_{2x-y+2} Cl_{y} | |
# This implementation finds all unique ways to assign the `y` | |
# chlorines into `x` bin such that no bin has more than 2 chlorines, | |
# except the first and last may have 3 chlorines, then use those | |
# counts to create a SMILES string. | |
## Step 1: Assign chlorine counts for each carbon atom | |
# The implementation is recursive. | |
# For each of the up to 3 ways to assign chlorine counts to the first atom, | |
# find all of the ways to assign counts for the remaining part of the chain. | |
# For each of the up to 2 ways to assign chlorine counts to a non-termainal | |
# chain atom, find the ways to assign counts for the remaining part. | |
# The base case is either the terminal atom, or no remaining carbons. | |
# This algorithm does not generate two identical bin assignments. | |
## Step 2: Determine which counts are unique | |
# The algorithm in step 1 with num_carbons = 3 and num_chlorines = 6 | |
# creates the following carbon counts: | |
# A) (1, 2, 3) | |
# B) (2, 1, 3) | |
# C) (2, 2, 2) | |
# D) (3, 0, 3) | |
# E) (3, 1, 2) | |
# F) (3, 2, 1) | |
# The molecules are symmetrical, which means A) and F) describe the | |
# same molecule, just starting from the other end. Same for B) and E). | |
# The uniqueness algorithm first sorts the chlorine count assignments | |
# (which ensures the output is in a well-defined order) then selects | |
# each assignment in turn, so long as the flipped version has not | |
# already been picked. | |
## Step 3: Convert into a SMILES string | |
# Given chlorine counts for each atom, like (3, 0, 1, 2) the | |
# corresponding SMILES string can be created syntactically as | |
# the terms | |
# "C(Cl)(Cl)(Cl)" + | |
# "C" + | |
# "C(Cl)" + | |
# "C(Cl)(Cl)" | |
# to give: | |
# "C(Cl)(Cl)(Cl)CC(Cl)C(Cl)(Cl)" | |
# The hydrogen counts are determined by the implicit hydrogen count | |
# rule of SMILES. | |
# This can further (and optionally) be canonicalized with RDKit, giving: | |
# "ClC(Cl)C(Cl)CC(Cl)(Cl)Cl" | |
import sys | |
import functools | |
# In my tests, caching (or "memoizing") is about 33% faster | |
if hasattr(functools, "cache"): # Added to Python 3.9 | |
cache = functools.cache | |
else: | |
cache = functools.lru_cache(maxsize=None) | |
@cache | |
def _sccp_chain(num_carbons, num_chlorines, max_chlorines_per_atom): | |
# Figure out the base case, which stops the recursion | |
if num_carbons == 1: | |
return [(num_chlorines,)] | |
if num_chlorines == 0: | |
return [(0,) * num_carbons] | |
# This is not terminal, so the max possible chlorine count is 2 or max_chlorines_per_atom | |
max_num_chlorines_on_atom = min(num_chlorines, 2, max_chlorines_per_atom) | |
# Can't let the surplus of chlorine atoms become too large | |
min_num_chlorines_on_atom = max(0, num_chlorines - 2*num_carbons + 1) | |
chains = [] | |
# For each possible chlorine count assignment for this atom ... | |
for num_chlorines_on_atom in range( | |
min_num_chlorines_on_atom, | |
max_num_chlorines_on_atom + 1): | |
# .. recursively extend the chain | |
for subchain in _sccp_chain( | |
num_carbons - 1, | |
num_chlorines - num_chlorines_on_atom, | |
max_chlorines_per_atom, | |
): | |
chain = (num_chlorines_on_atom,) + subchain | |
chains.append(chain) | |
return chains | |
def get_nonunique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom=3): | |
# Check for possible out-of-range conditions | |
if num_carbons < 1: | |
raise ValueError("Must have at least one carbon") | |
if num_chlorines < 0: | |
raise ValueError("May not have a negative number of chlorines") | |
if (2*num_carbons - num_chlorines + 2) < 0: | |
raise ValueError("Too many chlorines for the number of carbons") | |
# Special case a single carbon (all chlorines on that atom) | |
if num_carbons == 1: | |
return [(num_chlorines,)] | |
# Special case no chlorines | |
if num_chlorines == 0: | |
return [(0,) * num_carbons] | |
# The first atom has a max possible chlorine count of 3 or max_chlorines_per_atom | |
max_num_chorines_on_atom = min(num_chlorines, 3, max_chlorines_per_atom) | |
# Can't let the surplus of chlorine atoms become too large | |
min_num_chlorines_on_atom = max(0, num_chlorines - 2*num_carbons + 1) | |
chains = [] | |
# For each possible chlorine count assignment for the first atom ... | |
for num_chlorines_on_first_atom in range( | |
min_num_chlorines_on_atom, | |
max_num_chorines_on_atom + 1): | |
# .. extend by the possible ways to make an sscp chain which | |
# is one carbon shorter and without the chlorines assigned | |
# to the first atom. | |
for subchain in _sccp_chain( | |
num_carbons - 1, | |
num_chlorines - num_chlorines_on_first_atom, | |
max_chlorines_per_atom): | |
chain = (num_chlorines_on_first_atom,) + subchain | |
chains.append(chain) | |
return chains | |
def get_unique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom=3): | |
nonunique_chains = get_nonunique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom) | |
# Sort them so the highest chlorine counts are on the right. | |
# (This is mostly to make the result be visually pleasing.) | |
nonunique_chains.sort() | |
unique_chains = [] | |
seen = set() | |
for chain in nonunique_chains: | |
# Ensure uniqueness, including symmetry | |
if chain in seen: | |
continue | |
# Exclude its flipped version | |
seen.add(chain[::-1]) | |
unique_chains.append(chain) | |
return unique_chains | |
def get_sccp_smiles(num_carbons, num_chlorines, max_chlorines_per_atom=3, canonicalize=True): | |
if canonicalize: | |
# Only need RDKit if we want to generate canonical SMILES. | |
from rdkit.Chem import CanonSmiles | |
smiles_list = [] | |
unique_chains = get_unique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom) | |
for chain in unique_chains: | |
smiles_terms = [] | |
for num_chlorines_on_atom in chain: | |
smiles_terms.append("C" + "(Cl)" * num_chlorines_on_atom) | |
smiles = "".join(smiles_terms) | |
if canonicalize: | |
smiles = CanonSmiles(smiles) | |
smiles_list.append(smiles) | |
if canonicalize: | |
# Double-check the unique generation is actually unique | |
assert len(smiles_list) == len(set(smiles_list)) | |
return smiles_list | |
import argparse | |
parser = argparse.ArgumentParser( | |
description = "Generate SCCP SMILES given the number of carbons and chlorines", | |
) | |
parser.add_argument( | |
"--min-C", | |
"--C", | |
metavar = "N", | |
type = int, | |
help = "The minimum number of carbons", | |
) | |
parser.add_argument( | |
"--max-C", | |
metavar = "N", | |
type = int, | |
help = "The maximum number of carbons", | |
) | |
parser.add_argument( | |
"--min-Cl", | |
"--Cl", | |
metavar = "N", | |
type = int, | |
help = "The minimum number of chlorines", | |
) | |
parser.add_argument( | |
"--max-Cl", | |
metavar = "N", | |
type = int, | |
help = "The maximum number of chlorines", | |
) | |
parser.add_argument( | |
"--max-Cl-per-atom", | |
type = int, | |
choices = (1, 2, 3, 4), | |
default = 4, | |
help = "maximum number of chlorines per atom", | |
) | |
parser.add_argument( | |
"--min-content", | |
"--content", | |
metavar = "FLT", | |
type = float, | |
help = "The minimum chlorine content by mass, as a float", | |
) | |
parser.add_argument( | |
"--max-content", | |
metavar = "FLT", | |
type = float, | |
help = "The maximum chlorine content by mass, as a float", | |
) | |
parser.add_argument( | |
"--no-canonicalize", | |
action = "store_false", | |
default = True, | |
dest = "canonicalize", | |
help = "Use --no-canonicalize to skip SMILES canonicalization", | |
) | |
def check_C_range(parser, min_C, max_C): | |
if min_C is None: | |
if max_C is None: | |
sys.stderr.write("C range not specified. Using --min-C 10 and --max-C 10\n") | |
min_C = max_C = 10 | |
elif max_C < 1: | |
parser.error(f"--max-C must be positive") | |
else: | |
min_C = 1 | |
else: | |
if min_C < 1: | |
parser.error(f"--min-C must be positive") | |
if max_C is None: | |
max_C = min_C | |
elif max_C < min_C: | |
parser.error(f"--min-C must not be larger than --max-C") | |
return min_C, max_C | |
def check_Cl_range(parser, min_Cl, max_Cl, max_C): | |
if min_Cl is None: | |
if max_Cl is None: | |
# Use the maximum possible range | |
min_Cl = 1 | |
max_Cl = 2*max_C + 2 | |
elif max_Cl < 1: | |
parser.error(f"--max-Cl must be positive") | |
else: | |
min_Cl = 1 | |
else: | |
if min_Cl < 0: | |
parser.error(f"--min-Cl must be non-negative") | |
if max_Cl is None: | |
max_Cl = min_Cl | |
elif max_Cl < min_Cl: | |
parser.error(f"--min-Cl must not be larger than --max-Cl") | |
return min_Cl, max_Cl | |
def check_content_range(parser, min_content, max_content): | |
if min_content is None: | |
if max_content is None: | |
min_content, max_content = 0.4, 0.7 | |
sys.stderr.write( | |
f"Content range not specified. Using --min-content {min_content} " | |
f"and --max-content {max_content}.\n") | |
else: | |
min_content = 0.0 | |
sys.stderr.write( | |
f"--min-content not specified. Using {min_content}.\n" | |
) | |
elif max_content is None: | |
max_content = 1.0 | |
sys.stderr.write( | |
f"--max-content not specified. Using {max_content}.\n" | |
) | |
if not (0.0 <= min_content <= 1.0): | |
parser.error("--min-content must be betweeen 0.0 and 1.0, inclusive") | |
if not (0.0 <= max_content <= 1.0): | |
parser.error("--max-content must be betweeen 0.0 and 1.0, inclusive") | |
if not (min_content <= max_content): | |
parser.error("--min-content must not be larger than --max-content") | |
return min_content, max_content | |
WEIGHT_C = 12.011 | |
WEIGHT_H = 1.008 | |
WEIGHT_Cl = 35.45 | |
def get_chlorine_content(num_carbons, num_chlorines): | |
num_hydrogens = 2*num_carbons - num_chlorines + 2 | |
mw_Cl = WEIGHT_Cl * num_chlorines | |
mw_total = (WEIGHT_C * num_carbons + WEIGHT_H * num_hydrogens + mw_Cl) | |
return mw_Cl / mw_total | |
def main(): | |
args = parser.parse_args() | |
min_C, max_C = check_C_range(parser, args.min_C, args.max_C) | |
min_Cl, max_Cl = check_Cl_range(parser, args.min_Cl, args.max_Cl, max_C) | |
min_content, max_content = check_content_range(parser, args.min_content, args.max_content) | |
for num_carbons in range(min_C, max_C+1): | |
for num_chlorines in range(min_Cl, max_Cl+1): | |
if (2*num_carbons - num_chlorines + 2) < 0: | |
continue | |
content = get_chlorine_content(num_carbons, num_chlorines) | |
if content > max_content: | |
break | |
if content < min_content: | |
continue | |
smiles_list = get_sccp_smiles( | |
num_carbons, | |
num_chlorines, | |
args.max_Cl_per_atom, | |
args.canonicalize, | |
) | |
for smiles in smiles_list: | |
print(smiles) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment