Skip to content

Instantly share code, notes, and snippets.

@adalke
Created December 8, 2021 13:19
Show Gist options
  • Save adalke/e62df8774032560fef750fa9c88b6516 to your computer and use it in GitHub Desktop.
Save adalke/e62df8774032560fef750fa9c88b6516 to your computer and use it in GitHub Desktop.
Program to generate SMILES strings for short chain chlorinated paraffins (SCCP)
# Program to generate SMILES strings for short chain chlorinated paraffins (SCCP).
# These are alkanes with the molecular formula C_{x} H_{2x-y+2} Cl_{y}
# This implementation finds all unique ways to assign the `y`
# chlorines into `x` bin such that no bin has more than 2 chlorines,
# except the first and last may have 3 chlorines, then use those
# counts to create a SMILES string.
## Step 1: Assign chlorine counts for each carbon atom
# The implementation is recursive.
# For each of the up to 3 ways to assign chlorine counts to the first atom,
# find all of the ways to assign counts for the remaining part of the chain.
# For each of the up to 2 ways to assign chlorine counts to a non-termainal
# chain atom, find the ways to assign counts for the remaining part.
# The base case is either the terminal atom, or no remaining carbons.
# This algorithm does not generate two identical bin assignments.
## Step 2: Determine which counts are unique
# The algorithm in step 1 with num_carbons = 3 and num_chlorines = 6
# creates the following carbon counts:
# A) (1, 2, 3)
# B) (2, 1, 3)
# C) (2, 2, 2)
# D) (3, 0, 3)
# E) (3, 1, 2)
# F) (3, 2, 1)
# The molecules are symmetrical, which means A) and F) describe the
# same molecule, just starting from the other end. Same for B) and E).
# The uniqueness algorithm first sorts the chlorine count assignments
# (which ensures the output is in a well-defined order) then selects
# each assignment in turn, so long as the flipped version has not
# already been picked.
## Step 3: Convert into a SMILES string
# Given chlorine counts for each atom, like (3, 0, 1, 2) the
# corresponding SMILES string can be created syntactically as
# the terms
# "C(Cl)(Cl)(Cl)" +
# "C" +
# "C(Cl)" +
# "C(Cl)(Cl)"
# to give:
# "C(Cl)(Cl)(Cl)CC(Cl)C(Cl)(Cl)"
# The hydrogen counts are determined by the implicit hydrogen count
# rule of SMILES.
# This can further (and optionally) be canonicalized with RDKit, giving:
# "ClC(Cl)C(Cl)CC(Cl)(Cl)Cl"
import sys
import functools
# In my tests, caching (or "memoizing") is about 33% faster
if hasattr(functools, "cache"): # Added to Python 3.9
cache = functools.cache
else:
cache = functools.lru_cache(maxsize=None)
@cache
def _sccp_chain(num_carbons, num_chlorines, max_chlorines_per_atom):
# Figure out the base case, which stops the recursion
if num_carbons == 1:
return [(num_chlorines,)]
if num_chlorines == 0:
return [(0,) * num_carbons]
# This is not terminal, so the max possible chlorine count is 2 or max_chlorines_per_atom
max_num_chlorines_on_atom = min(num_chlorines, 2, max_chlorines_per_atom)
# Can't let the surplus of chlorine atoms become too large
min_num_chlorines_on_atom = max(0, num_chlorines - 2*num_carbons + 1)
chains = []
# For each possible chlorine count assignment for this atom ...
for num_chlorines_on_atom in range(
min_num_chlorines_on_atom,
max_num_chlorines_on_atom + 1):
# .. recursively extend the chain
for subchain in _sccp_chain(
num_carbons - 1,
num_chlorines - num_chlorines_on_atom,
max_chlorines_per_atom,
):
chain = (num_chlorines_on_atom,) + subchain
chains.append(chain)
return chains
def get_nonunique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom=3):
# Check for possible out-of-range conditions
if num_carbons < 1:
raise ValueError("Must have at least one carbon")
if num_chlorines < 0:
raise ValueError("May not have a negative number of chlorines")
if (2*num_carbons - num_chlorines + 2) < 0:
raise ValueError("Too many chlorines for the number of carbons")
# Special case a single carbon (all chlorines on that atom)
if num_carbons == 1:
return [(num_chlorines,)]
# Special case no chlorines
if num_chlorines == 0:
return [(0,) * num_carbons]
# The first atom has a max possible chlorine count of 3 or max_chlorines_per_atom
max_num_chorines_on_atom = min(num_chlorines, 3, max_chlorines_per_atom)
# Can't let the surplus of chlorine atoms become too large
min_num_chlorines_on_atom = max(0, num_chlorines - 2*num_carbons + 1)
chains = []
# For each possible chlorine count assignment for the first atom ...
for num_chlorines_on_first_atom in range(
min_num_chlorines_on_atom,
max_num_chorines_on_atom + 1):
# .. extend by the possible ways to make an sscp chain which
# is one carbon shorter and without the chlorines assigned
# to the first atom.
for subchain in _sccp_chain(
num_carbons - 1,
num_chlorines - num_chlorines_on_first_atom,
max_chlorines_per_atom):
chain = (num_chlorines_on_first_atom,) + subchain
chains.append(chain)
return chains
def get_unique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom=3):
nonunique_chains = get_nonunique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom)
# Sort them so the highest chlorine counts are on the right.
# (This is mostly to make the result be visually pleasing.)
nonunique_chains.sort()
unique_chains = []
seen = set()
for chain in nonunique_chains:
# Ensure uniqueness, including symmetry
if chain in seen:
continue
# Exclude its flipped version
seen.add(chain[::-1])
unique_chains.append(chain)
return unique_chains
def get_sccp_smiles(num_carbons, num_chlorines, max_chlorines_per_atom=3, canonicalize=True):
if canonicalize:
# Only need RDKit if we want to generate canonical SMILES.
from rdkit.Chem import CanonSmiles
smiles_list = []
unique_chains = get_unique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom)
for chain in unique_chains:
smiles_terms = []
for num_chlorines_on_atom in chain:
smiles_terms.append("C" + "(Cl)" * num_chlorines_on_atom)
smiles = "".join(smiles_terms)
if canonicalize:
smiles = CanonSmiles(smiles)
smiles_list.append(smiles)
if canonicalize:
# Double-check the unique generation is actually unique
assert len(smiles_list) == len(set(smiles_list))
return smiles_list
import argparse
parser = argparse.ArgumentParser(
description = "Generate SCCP SMILES given the number of carbons and chlorines",
)
parser.add_argument(
"--min-C",
"--C",
metavar = "N",
type = int,
help = "The minimum number of carbons",
)
parser.add_argument(
"--max-C",
metavar = "N",
type = int,
help = "The maximum number of carbons",
)
parser.add_argument(
"--min-Cl",
"--Cl",
metavar = "N",
type = int,
help = "The minimum number of chlorines",
)
parser.add_argument(
"--max-Cl",
metavar = "N",
type = int,
help = "The maximum number of chlorines",
)
parser.add_argument(
"--max-Cl-per-atom",
type = int,
choices = (1, 2, 3, 4),
default = 4,
help = "maximum number of chlorines per atom",
)
parser.add_argument(
"--min-content",
"--content",
metavar = "FLT",
type = float,
help = "The minimum chlorine content by mass, as a float",
)
parser.add_argument(
"--max-content",
metavar = "FLT",
type = float,
help = "The maximum chlorine content by mass, as a float",
)
parser.add_argument(
"--no-canonicalize",
action = "store_false",
default = True,
dest = "canonicalize",
help = "Use --no-canonicalize to skip SMILES canonicalization",
)
def check_C_range(parser, min_C, max_C):
if min_C is None:
if max_C is None:
sys.stderr.write("C range not specified. Using --min-C 10 and --max-C 10\n")
min_C = max_C = 10
elif max_C < 1:
parser.error(f"--max-C must be positive")
else:
min_C = 1
else:
if min_C < 1:
parser.error(f"--min-C must be positive")
if max_C is None:
max_C = min_C
elif max_C < min_C:
parser.error(f"--min-C must not be larger than --max-C")
return min_C, max_C
def check_Cl_range(parser, min_Cl, max_Cl, max_C):
if min_Cl is None:
if max_Cl is None:
# Use the maximum possible range
min_Cl = 1
max_Cl = 2*max_C + 2
elif max_Cl < 1:
parser.error(f"--max-Cl must be positive")
else:
min_Cl = 1
else:
if min_Cl < 0:
parser.error(f"--min-Cl must be non-negative")
if max_Cl is None:
max_Cl = min_Cl
elif max_Cl < min_Cl:
parser.error(f"--min-Cl must not be larger than --max-Cl")
return min_Cl, max_Cl
def check_content_range(parser, min_content, max_content):
if min_content is None:
if max_content is None:
min_content, max_content = 0.4, 0.7
sys.stderr.write(
f"Content range not specified. Using --min-content {min_content} "
f"and --max-content {max_content}.\n")
else:
min_content = 0.0
sys.stderr.write(
f"--min-content not specified. Using {min_content}.\n"
)
elif max_content is None:
max_content = 1.0
sys.stderr.write(
f"--max-content not specified. Using {max_content}.\n"
)
if not (0.0 <= min_content <= 1.0):
parser.error("--min-content must be betweeen 0.0 and 1.0, inclusive")
if not (0.0 <= max_content <= 1.0):
parser.error("--max-content must be betweeen 0.0 and 1.0, inclusive")
if not (min_content <= max_content):
parser.error("--min-content must not be larger than --max-content")
return min_content, max_content
WEIGHT_C = 12.011
WEIGHT_H = 1.008
WEIGHT_Cl = 35.45
def get_chlorine_content(num_carbons, num_chlorines):
num_hydrogens = 2*num_carbons - num_chlorines + 2
mw_Cl = WEIGHT_Cl * num_chlorines
mw_total = (WEIGHT_C * num_carbons + WEIGHT_H * num_hydrogens + mw_Cl)
return mw_Cl / mw_total
def main():
args = parser.parse_args()
min_C, max_C = check_C_range(parser, args.min_C, args.max_C)
min_Cl, max_Cl = check_Cl_range(parser, args.min_Cl, args.max_Cl, max_C)
min_content, max_content = check_content_range(parser, args.min_content, args.max_content)
for num_carbons in range(min_C, max_C+1):
for num_chlorines in range(min_Cl, max_Cl+1):
if (2*num_carbons - num_chlorines + 2) < 0:
continue
content = get_chlorine_content(num_carbons, num_chlorines)
if content > max_content:
break
if content < min_content:
continue
smiles_list = get_sccp_smiles(
num_carbons,
num_chlorines,
args.max_Cl_per_atom,
args.canonicalize,
)
for smiles in smiles_list:
print(smiles)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment