Created
February 13, 2019 14:29
-
-
Save ccwang002/c6fca09bf010bbc20060c194a56697b7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itertools import combinations, product | |
def gen_pos_sets_to_sub(barcode, max_sub=1): | |
""" | |
Generate all the possible position combinations (sets) within | |
the given maximal number of substitutions. | |
Examples: | |
>>> list(gen_pos_sets_to_sub('ATG', 1)) | |
[(0,), (1,), (2,)] | |
>>> list(gen_pos_sets_to_sub('ATG', 2)) | |
[(0,), (1,), (2,), (0, 1), (0, 2), (1, 2)] | |
""" | |
assert max_sub >= 0 | |
all_pos = range(len(barcode)) | |
for num_sub in range(1, max_sub + 1): | |
yield from combinations(all_pos, num_sub) | |
def sub_barcode(barcode, pos_set): | |
""" | |
Generate all possible barcode sequences with the substitutions | |
at the given set of indices. | |
Examples: | |
>>> list(sub_barcode('AT', (0,))) | |
['TT', 'CT', 'GT'] | |
>>> list(sub_barcode('AT', (0, 1))) | |
['TA', 'TC', 'TG', 'CA', 'CC', 'CG', 'GA', 'GC', 'GG'] | |
""" | |
# Convert the barcode string into a list of nts | |
barcode = list(barcode) | |
# Make the possible nucleotide per position | |
possible_nts_per_pos = [ | |
# Exclude the original nucleotide | |
[nt for nt in 'ATCG' if nt != barcode[pos]] | |
for pos in pos_set | |
] | |
# The first for loop assign the new nucleotide | |
# for each position to be replaced | |
for new_nts in product(*possible_nts_per_pos): | |
# Copy the original barcode (list of nts) | |
barcode_with_sub = barcode[:] | |
# Replace the barcode sequence with the new nucleotides | |
# at the given corresponding positions | |
for ix, nt in zip(pos_set, new_nts): | |
barcode_with_sub[ix] = nt | |
# Combine the new barcode into a single string | |
yield ''.join(barcode_with_sub) | |
def gen_all_barcodes_with_sub(barcode, max_sub=1): | |
""" | |
Generate all possible barcode sequences within the given number of | |
substitutions. | |
Examples: | |
>>> list(gen_all_barcodes_with_sub('AT', 1)) | |
['TT', 'CT', 'GT', 'AA', 'AC', 'AG'] | |
A more realistic example using a longer barcode: | |
>>> barcode = 'ATATCGATCG'; len(barcode) | |
10 | |
Count the number of all possible barcodes with <= 2 substitutions | |
>>> len(list(gen_all_barcodes_with_sub(barcode, 2))) | |
435 | |
Calculate the theoretical number of barcodes | |
>>> 10 * (10 - 1) // 2 * ((4 - 1)**2) + len(barcode) * (4 - 1) | |
435 | |
>>> len(list(gen_all_barcodes_with_sub(barcode, len(barcode)))) == 4 ** len(barcode) - 1 | |
True | |
""" | |
barcode = barcode.upper() | |
assert all(nt in 'ATCG' for nt in barcode) | |
for pos_set in gen_pos_sets_to_sub(barcode, max_sub): | |
yield from sub_barcode(barcode, pos_set) | |
if __name__ == '__main__': | |
# Run all the examples | |
import doctest | |
doctest.testmod(verbose=True) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment