adalke · December 8, 2021 13:19
diff --git a/sccp_smiles.py b/sccp_smiles.py
 # Program to generate SMILES strings for short chain chlorinated paraffins (SCCP).

 # These are alkanes with the molecular formula C_{x} H_{2x-y+2} Cl_{y}

 # This implementation finds all unique ways to assign the `y`
 # chlorines into `x` bin such that no bin has more than 2 chlorines,
 # except the first and last may have 3 chlorines, then use those
 # counts to create a SMILES string.

 ## Step 1: Assign chlorine counts for each carbon atom

 # The implementation is recursive.

 # For each of the up to 3 ways to assign chlorine counts to the first atom,
 # find all of the ways to assign counts for the remaining part of the chain.

 # For each of the up to 2 ways to assign chlorine counts to a non-termainal
 # chain atom, find the ways to assign counts for the remaining part.

 # The base case is either the terminal atom, or no remaining carbons.

 # This algorithm does not generate two identical bin assignments.

 ## Step 2: Determine which counts are unique

 # The algorithm in step 1 with num_carbons = 3 and num_chlorines = 6
 # creates the following carbon counts:
 #    A)   (1, 2, 3)
 #    B)   (2, 1, 3)
 #    C)   (2, 2, 2)
 #    D)   (3, 0, 3)
 #    E)   (3, 1, 2)
 #    F)   (3, 2, 1)

 # The molecules are symmetrical, which means A) and F) describe the
 # same molecule, just starting from the other end. Same for B) and E).

 # The uniqueness algorithm first sorts the chlorine count assignments
 # (which ensures the output is in a well-defined order) then selects
 # each assignment in turn, so long as the flipped version has not
 # already been picked.

 ## Step 3: Convert into a SMILES string

 # Given chlorine counts for each atom, like (3, 0, 1, 2) the
 # corresponding SMILES string can be created syntactically as
 # the terms
 #   "C(Cl)(Cl)(Cl)" +
 #   "C" +
 #   "C(Cl)" +
 #   "C(Cl)(Cl)"
 # to give:
 #   "C(Cl)(Cl)(Cl)CC(Cl)C(Cl)(Cl)"

 # The hydrogen counts are determined by the implicit hydrogen count
 # rule of SMILES.

 # This can further (and optionally) be canonicalized with RDKit, giving:
 #   "ClC(Cl)C(Cl)CC(Cl)(Cl)Cl"

 import sys
 import functools

 # In my tests, caching (or "memoizing") is about 33% faster
 if hasattr(functools, "cache"): # Added to Python 3.9
    cache = functools.cache
 else:
    cache = functools.lru_cache(maxsize=None)
    
 @cache
 def _sccp_chain(num_carbons, num_chlorines, max_chlorines_per_atom):
    # Figure out the base case, which stops the recursion
    if num_carbons == 1:
        return [(num_chlorines,)]
    if num_chlorines == 0:
        return [(0,) * num_carbons]
    
    # This is not terminal, so the max possible chlorine count is 2 or max_chlorines_per_atom
    max_num_chlorines_on_atom = min(num_chlorines, 2, max_chlorines_per_atom)
    # Can't let the surplus of chlorine atoms become too large
    min_num_chlorines_on_atom = max(0, num_chlorines - 2*num_carbons + 1)

    chains = []

    # For each possible chlorine count assignment for this atom ...
    for num_chlorines_on_atom in range(
            min_num_chlorines_on_atom,
            max_num_chlorines_on_atom + 1):
        # .. recursively extend the chain
        for subchain in _sccp_chain(
                num_carbons - 1,
                num_chlorines - num_chlorines_on_atom,
                max_chlorines_per_atom,
                ):
            chain = (num_chlorines_on_atom,) + subchain
            chains.append(chain)
            
    return chains

 def get_nonunique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom=3):
    # Check for possible out-of-range conditions
    if num_carbons < 1:
        raise ValueError("Must have at least one carbon")
    if num_chlorines < 0:
        raise ValueError("May not have a negative number of chlorines")
    if (2*num_carbons - num_chlorines + 2) < 0:
        raise ValueError("Too many chlorines for the number of carbons")

    # Special case a single carbon (all chlorines on that atom)
    if num_carbons == 1:
        return [(num_chlorines,)]
    # Special case no chlorines
    if num_chlorines == 0:
        return [(0,) * num_carbons]

    # The first atom has a max possible chlorine count of 3 or max_chlorines_per_atom
    max_num_chorines_on_atom = min(num_chlorines, 3, max_chlorines_per_atom)
    # Can't let the surplus of chlorine atoms become too large
    min_num_chlorines_on_atom = max(0, num_chlorines - 2*num_carbons + 1)
    
    chains = []
    
    # For each possible chlorine count assignment for the first atom ...
    for num_chlorines_on_first_atom in range(
            min_num_chlorines_on_atom,
            max_num_chorines_on_atom + 1):
        # .. extend by the possible ways to make an sscp chain which
        # is one carbon shorter and without the chlorines assigned
        # to the first atom.
        for subchain in _sccp_chain(
                num_carbons - 1,
                num_chlorines - num_chlorines_on_first_atom,
                max_chlorines_per_atom):
            chain = (num_chlorines_on_first_atom,) + subchain
            chains.append(chain)

    return chains

 def get_unique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom=3):
    nonunique_chains = get_nonunique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom)
    # Sort them so the highest chlorine counts are on the right.
    # (This is mostly to make the result be visually pleasing.)
    nonunique_chains.sort()
    
    unique_chains = []
    seen = set()
    for chain in nonunique_chains:
        # Ensure uniqueness, including symmetry
        if chain in seen:
            continue
        # Exclude its flipped version
        seen.add(chain[::-1])
        
        unique_chains.append(chain)

    return unique_chains

 def get_sccp_smiles(num_carbons, num_chlorines, max_chlorines_per_atom=3, canonicalize=True):
    if canonicalize:
        # Only need RDKit if we want to generate canonical SMILES.
        from rdkit.Chem import CanonSmiles

    smiles_list = []
    unique_chains = get_unique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom)
    for chain in unique_chains:
        smiles_terms = []
        for num_chlorines_on_atom in chain:
            smiles_terms.append("C" + "(Cl)" * num_chlorines_on_atom)
        smiles = "".join(smiles_terms)
        
        if canonicalize:
            smiles = CanonSmiles(smiles)
            
        smiles_list.append(smiles)

    if canonicalize:
        # Double-check the unique generation is actually unique
        assert len(smiles_list) == len(set(smiles_list))
        
    return smiles_list

 import argparse
 parser = argparse.ArgumentParser(
    description = "Generate SCCP SMILES given the number of carbons and chlorines",
    )
 parser.add_argument(
    "--min-C",
    "--C",
    metavar = "N",
    type = int,
    help = "The minimum number of carbons",
    )
 parser.add_argument(
    "--max-C",
    metavar = "N",
    type = int,
    help = "The maximum number of carbons",
    )
 parser.add_argument(
    "--min-Cl",
    "--Cl",
    metavar = "N",
    type = int,
    help = "The minimum number of chlorines",
    )
 parser.add_argument(
    "--max-Cl",
    metavar = "N",
    type = int,
    help = "The maximum number of chlorines",
    )
 parser.add_argument(
    "--max-Cl-per-atom",
    type = int,
    choices = (1, 2, 3, 4),
    default = 4,
    help = "maximum number of chlorines per atom",
    )
    
 parser.add_argument(
    "--min-content",
    "--content",
    metavar = "FLT",
    type = float,
    help = "The minimum chlorine content by mass, as a float",
    )
 parser.add_argument(
    "--max-content",
    metavar = "FLT",
    type = float,
    help = "The maximum chlorine content by mass, as a float",
    )
 parser.add_argument(
    "--no-canonicalize",
    action = "store_false",
    default = True,
    dest = "canonicalize",
    help = "Use --no-canonicalize to skip SMILES canonicalization",
    )

 def check_C_range(parser, min_C, max_C):
    if min_C is None:
        if max_C is None:
            sys.stderr.write("C range not specified. Using --min-C 10 and --max-C 10\n")
            min_C = max_C = 10
        elif max_C < 1:
            parser.error(f"--max-C must be positive")
        else:
            min_C = 1
    else:
        if min_C < 1:
            parser.error(f"--min-C must be positive")
        if max_C is None:
            max_C = min_C
        elif max_C < min_C:
            parser.error(f"--min-C must not be larger than --max-C")

    return min_C, max_C

 def check_Cl_range(parser, min_Cl, max_Cl, max_C):
    if min_Cl is None:
        if max_Cl is None:
            # Use the maximum possible range
            min_Cl = 1
            max_Cl = 2*max_C + 2
        elif max_Cl < 1:
            parser.error(f"--max-Cl must be positive")
        else:
            min_Cl = 1
    else:
        if min_Cl < 0:
            parser.error(f"--min-Cl must be non-negative")
        if max_Cl is None:
            max_Cl = min_Cl
        elif max_Cl < min_Cl:
            parser.error(f"--min-Cl must not be larger than --max-Cl")

    return min_Cl, max_Cl

 def check_content_range(parser, min_content, max_content):
    if min_content is None:
        if max_content is None:
            min_content, max_content = 0.4, 0.7
            sys.stderr.write(
                f"Content range not specified. Using --min-content {min_content} "
                f"and --max-content {max_content}.\n")
        else:
            min_content = 0.0
            sys.stderr.write(
                f"--min-content not specified. Using {min_content}.\n"
                )
    elif max_content is None:
        max_content = 1.0
        sys.stderr.write(
            f"--max-content not specified. Using {max_content}.\n"
            )
        
    if not (0.0 <= min_content <= 1.0):
        parser.error("--min-content must be betweeen 0.0 and 1.0, inclusive")
    if not (0.0 <= max_content <= 1.0):
        parser.error("--max-content must be betweeen 0.0 and 1.0, inclusive")
    if not (min_content <= max_content):
        parser.error("--min-content must not be larger than --max-content")

    return min_content, max_content

 WEIGHT_C = 12.011
 WEIGHT_H = 1.008
 WEIGHT_Cl = 35.45

 def get_chlorine_content(num_carbons, num_chlorines):
    num_hydrogens = 2*num_carbons - num_chlorines + 2
    mw_Cl = WEIGHT_Cl * num_chlorines
    mw_total = (WEIGHT_C * num_carbons + WEIGHT_H * num_hydrogens + mw_Cl)
    return mw_Cl / mw_total

 def main():
    args = parser.parse_args()
    min_C, max_C = check_C_range(parser, args.min_C, args.max_C)
    min_Cl, max_Cl = check_Cl_range(parser, args.min_Cl, args.max_Cl, max_C)
    min_content, max_content = check_content_range(parser, args.min_content, args.max_content)

    for num_carbons in range(min_C, max_C+1):
        for num_chlorines in range(min_Cl, max_Cl+1):
            if (2*num_carbons - num_chlorines + 2) < 0:
                continue
            content = get_chlorine_content(num_carbons, num_chlorines)
            if content > max_content:
                break
            if content < min_content:
                continue
            smiles_list = get_sccp_smiles(
                num_carbons,
                num_chlorines,
                args.max_Cl_per_atom,
                args.canonicalize,
                )
            for smiles in smiles_list:
                print(smiles)

 if __name__ == "__main__":
    main()
	# Program to generate SMILES strings for short chain chlorinated paraffins (SCCP).

	# These are alkanes with the molecular formula C_{x} H_{2x-y+2} Cl_{y}

	# This implementation finds all unique ways to assign the `y`
	# chlorines into `x` bin such that no bin has more than 2 chlorines,
	# except the first and last may have 3 chlorines, then use those
	# counts to create a SMILES string.

	## Step 1: Assign chlorine counts for each carbon atom

	# The implementation is recursive.

	# For each of the up to 3 ways to assign chlorine counts to the first atom,
	# find all of the ways to assign counts for the remaining part of the chain.

	# For each of the up to 2 ways to assign chlorine counts to a non-termainal
	# chain atom, find the ways to assign counts for the remaining part.

	# The base case is either the terminal atom, or no remaining carbons.

	# This algorithm does not generate two identical bin assignments.

	## Step 2: Determine which counts are unique

	# The algorithm in step 1 with num_carbons = 3 and num_chlorines = 6
	# creates the following carbon counts:
	# A) (1, 2, 3)
	# B) (2, 1, 3)
	# C) (2, 2, 2)
	# D) (3, 0, 3)
	# E) (3, 1, 2)
	# F) (3, 2, 1)

	# The molecules are symmetrical, which means A) and F) describe the
	# same molecule, just starting from the other end. Same for B) and E).

	# The uniqueness algorithm first sorts the chlorine count assignments
	# (which ensures the output is in a well-defined order) then selects
	# each assignment in turn, so long as the flipped version has not
	# already been picked.

	## Step 3: Convert into a SMILES string

	# Given chlorine counts for each atom, like (3, 0, 1, 2) the
	# corresponding SMILES string can be created syntactically as
	# the terms
	# "C(Cl)(Cl)(Cl)" +
	# "C" +
	# "C(Cl)" +
	# "C(Cl)(Cl)"
	# to give:
	# "C(Cl)(Cl)(Cl)CC(Cl)C(Cl)(Cl)"

	# The hydrogen counts are determined by the implicit hydrogen count
	# rule of SMILES.

	# This can further (and optionally) be canonicalized with RDKit, giving:
	# "ClC(Cl)C(Cl)CC(Cl)(Cl)Cl"

	import sys
	import functools

	# In my tests, caching (or "memoizing") is about 33% faster
	if hasattr(functools, "cache"): # Added to Python 3.9
	cache = functools.cache
	else:
	cache = functools.lru_cache(maxsize=None)

	@cache
	def _sccp_chain(num_carbons, num_chlorines, max_chlorines_per_atom):
	# Figure out the base case, which stops the recursion
	if num_carbons == 1:
	return [(num_chlorines,)]
	if num_chlorines == 0:
	return [(0,) * num_carbons]

	# This is not terminal, so the max possible chlorine count is 2 or max_chlorines_per_atom
	max_num_chlorines_on_atom = min(num_chlorines, 2, max_chlorines_per_atom)
	# Can't let the surplus of chlorine atoms become too large
	min_num_chlorines_on_atom = max(0, num_chlorines - 2*num_carbons + 1)

	chains = []

	# For each possible chlorine count assignment for this atom ...
	for num_chlorines_on_atom in range(
	min_num_chlorines_on_atom,
	max_num_chlorines_on_atom + 1):
	# .. recursively extend the chain
	for subchain in _sccp_chain(
	num_carbons - 1,
	num_chlorines - num_chlorines_on_atom,
	max_chlorines_per_atom,
	):
	chain = (num_chlorines_on_atom,) + subchain
	chains.append(chain)

	return chains

	def get_nonunique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom=3):
	# Check for possible out-of-range conditions
	if num_carbons < 1:
	raise ValueError("Must have at least one carbon")
	if num_chlorines < 0:
	raise ValueError("May not have a negative number of chlorines")
	if (2*num_carbons - num_chlorines + 2) < 0:
	raise ValueError("Too many chlorines for the number of carbons")

	# Special case a single carbon (all chlorines on that atom)
	if num_carbons == 1:
	return [(num_chlorines,)]
	# Special case no chlorines
	if num_chlorines == 0:
	return [(0,) * num_carbons]

	# The first atom has a max possible chlorine count of 3 or max_chlorines_per_atom
	max_num_chorines_on_atom = min(num_chlorines, 3, max_chlorines_per_atom)
	# Can't let the surplus of chlorine atoms become too large
	min_num_chlorines_on_atom = max(0, num_chlorines - 2*num_carbons + 1)

	chains = []

	# For each possible chlorine count assignment for the first atom ...
	for num_chlorines_on_first_atom in range(
	min_num_chlorines_on_atom,
	max_num_chorines_on_atom + 1):
	# .. extend by the possible ways to make an sscp chain which
	# is one carbon shorter and without the chlorines assigned
	# to the first atom.
	for subchain in _sccp_chain(
	num_carbons - 1,
	num_chlorines - num_chlorines_on_first_atom,
	max_chlorines_per_atom):
	chain = (num_chlorines_on_first_atom,) + subchain
	chains.append(chain)

	return chains

	def get_unique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom=3):
	nonunique_chains = get_nonunique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom)
	# Sort them so the highest chlorine counts are on the right.
	# (This is mostly to make the result be visually pleasing.)
	nonunique_chains.sort()

	unique_chains = []
	seen = set()
	for chain in nonunique_chains:
	# Ensure uniqueness, including symmetry
	if chain in seen:
	continue
	# Exclude its flipped version
	seen.add(chain[::-1])

	unique_chains.append(chain)

	return unique_chains

	def get_sccp_smiles(num_carbons, num_chlorines, max_chlorines_per_atom=3, canonicalize=True):
	if canonicalize:
	# Only need RDKit if we want to generate canonical SMILES.
	from rdkit.Chem import CanonSmiles

	smiles_list = []
	unique_chains = get_unique_sccp_chains(num_carbons, num_chlorines, max_chlorines_per_atom)
	for chain in unique_chains:
	smiles_terms = []
	for num_chlorines_on_atom in chain:
	smiles_terms.append("C" + "(Cl)" * num_chlorines_on_atom)
	smiles = "".join(smiles_terms)

	if canonicalize:
	smiles = CanonSmiles(smiles)

	smiles_list.append(smiles)

	if canonicalize:
	# Double-check the unique generation is actually unique
	assert len(smiles_list) == len(set(smiles_list))

	return smiles_list

	import argparse
	parser = argparse.ArgumentParser(
	description = "Generate SCCP SMILES given the number of carbons and chlorines",
	)
	parser.add_argument(
	"--min-C",
	"--C",
	metavar = "N",
	type = int,
	help = "The minimum number of carbons",
	)
	parser.add_argument(
	"--max-C",
	metavar = "N",
	type = int,
	help = "The maximum number of carbons",
	)
	parser.add_argument(
	"--min-Cl",
	"--Cl",
	metavar = "N",
	type = int,
	help = "The minimum number of chlorines",
	)
	parser.add_argument(
	"--max-Cl",
	metavar = "N",
	type = int,
	help = "The maximum number of chlorines",
	)
	parser.add_argument(
	"--max-Cl-per-atom",
	type = int,
	choices = (1, 2, 3, 4),
	default = 4,
	help = "maximum number of chlorines per atom",
	)

	parser.add_argument(
	"--min-content",
	"--content",
	metavar = "FLT",
	type = float,
	help = "The minimum chlorine content by mass, as a float",
	)
	parser.add_argument(
	"--max-content",
	metavar = "FLT",
	type = float,
	help = "The maximum chlorine content by mass, as a float",
	)
	parser.add_argument(
	"--no-canonicalize",
	action = "store_false",
	default = True,
	dest = "canonicalize",
	help = "Use --no-canonicalize to skip SMILES canonicalization",
	)

	def check_C_range(parser, min_C, max_C):
	if min_C is None:
	if max_C is None:
	sys.stderr.write("C range not specified. Using --min-C 10 and --max-C 10\n")
	min_C = max_C = 10
	elif max_C < 1:
	parser.error(f"--max-C must be positive")
	else:
	min_C = 1
	else:
	if min_C < 1:
	parser.error(f"--min-C must be positive")
	if max_C is None:
	max_C = min_C
	elif max_C < min_C:
	parser.error(f"--min-C must not be larger than --max-C")

	return min_C, max_C

	def check_Cl_range(parser, min_Cl, max_Cl, max_C):
	if min_Cl is None:
	if max_Cl is None:
	# Use the maximum possible range
	min_Cl = 1
	max_Cl = 2*max_C + 2
	elif max_Cl < 1:
	parser.error(f"--max-Cl must be positive")
	else:
	min_Cl = 1
	else:
	if min_Cl < 0:
	parser.error(f"--min-Cl must be non-negative")
	if max_Cl is None:
	max_Cl = min_Cl
	elif max_Cl < min_Cl:
	parser.error(f"--min-Cl must not be larger than --max-Cl")

	return min_Cl, max_Cl

	def check_content_range(parser, min_content, max_content):
	if min_content is None:
	if max_content is None:
	min_content, max_content = 0.4, 0.7
	sys.stderr.write(
	f"Content range not specified. Using --min-content {min_content} "
	f"and --max-content {max_content}.\n")
	else:
	min_content = 0.0
	sys.stderr.write(
	f"--min-content not specified. Using {min_content}.\n"
	)
	elif max_content is None:
	max_content = 1.0
	sys.stderr.write(
	f"--max-content not specified. Using {max_content}.\n"
	)

	if not (0.0 <= min_content <= 1.0):
	parser.error("--min-content must be betweeen 0.0 and 1.0, inclusive")
	if not (0.0 <= max_content <= 1.0):
	parser.error("--max-content must be betweeen 0.0 and 1.0, inclusive")
	if not (min_content <= max_content):
	parser.error("--min-content must not be larger than --max-content")

	return min_content, max_content

	WEIGHT_C = 12.011
	WEIGHT_H = 1.008
	WEIGHT_Cl = 35.45

	def get_chlorine_content(num_carbons, num_chlorines):
	num_hydrogens = 2*num_carbons - num_chlorines + 2
	mw_Cl = WEIGHT_Cl * num_chlorines
	mw_total = (WEIGHT_C * num_carbons + WEIGHT_H * num_hydrogens + mw_Cl)
	return mw_Cl / mw_total

	def main():
	args = parser.parse_args()
	min_C, max_C = check_C_range(parser, args.min_C, args.max_C)
	min_Cl, max_Cl = check_Cl_range(parser, args.min_Cl, args.max_Cl, max_C)
	min_content, max_content = check_content_range(parser, args.min_content, args.max_content)

	for num_carbons in range(min_C, max_C+1):
	for num_chlorines in range(min_Cl, max_Cl+1):
	if (2*num_carbons - num_chlorines + 2) < 0:
	continue
	content = get_chlorine_content(num_carbons, num_chlorines)
	if content > max_content:
	break
	if content < min_content:
	continue
	smiles_list = get_sccp_smiles(
	num_carbons,
	num_chlorines,
	args.max_Cl_per_atom,
	args.canonicalize,
	)
	for smiles in smiles_list:
	print(smiles)

	if __name__ == "__main__":
	main()