Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Neemox/a1286bce62ebab3f9ebd to your computer and use it in GitHub Desktop.
Save Neemox/a1286bce62ebab3f9ebd to your computer and use it in GitHub Desktop.
import csv
import itertools
import re
# Matches any single degenerate nucleotide code (lowercase only); the
# concrete bases a/c/g/t are intentionally not part of the character class.
re_degenerates = re.compile(r"[wsmkrybdhvn]")
def get_patterns(filename):
    """Yield ``(index, pattern)`` pairs read from a CSV file.

    The first two lines of the file are treated as non-data and skipped.
    For every remaining row, column 0 is the pattern id and column 1 holds
    the whole nucleotide pattern as a single string (not one nucleotide per
    column).

    Blank rows (for example a trailing empty line) are ignored instead of
    raising ``IndexError``.

    Args:
        filename: Path of the CSV file to read.

    Yields:
        Tuples ``(index, pattern)``, both strings, in file order.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank row has fewer than two columns.
    """
    # newline="" lets the csv module do its own line-ending handling, as the
    # csv documentation recommends for files passed to csv.reader.
    with open(filename, newline="") as f:
        for _ in range(2):
            # Skip one non-data line; the default keeps short files harmless.
            next(f, None)
        for record in csv.reader(f):
            if not record:
                # csv.reader produces an empty list for a blank line.
                continue
            yield record[0], record[1]
def has_degenerate(pattern):
    """Tell whether pattern contains at least one degenerate nucleotide code.

    Returns True when the module-level ``re_degenerates`` regex finds a match
    anywhere in ``pattern``, otherwise False.
    """
    first_hit = re_degenerates.search(pattern)
    if first_hit is None:
        return False
    return True
def get_permutations(pattern):
    """Yield every concrete interpretation of a degenerate pattern.

    Each degenerate code in ``pattern`` is expanded to each of the bases it
    stands for; all other characters are passed through unchanged. Results
    are produced lazily, ordered so that the leftmost degenerate position
    varies slowest. A pattern without degenerate codes yields itself once.

    pattern is a string that matches /[actgwsmkrybdhvn]+/

    Usage:
    >>> tuple(get_permutations("acww"))
    ('acaa', 'acat', 'acta', 'actt')
    """
    degeneracy = {
        "w": ("a", "t"),
        "s": ("c", "g"),
        "m": ("a", "c"),
        "k": ("g", "t"),
        "r": ("a", "g"),
        "y": ("c", "t"),
        "b": ("c", "g", "t"),
        "d": ("a", "g", "t"),
        "h": ("a", "c", "t"),
        "v": ("a", "c", "g"),
        "n": ("a", "c", "g", "t"),
    }
    # One tuple of candidate bases per position; a non-degenerate character
    # is its own single candidate. The Cartesian product of these tuples is
    # exactly the set of resolved patterns, without recursion or repeated
    # string scanning.
    choices = [degeneracy.get(nuc, (nuc,)) for nuc in pattern]
    for combination in itertools.product(*choices):
        yield "".join(combination)
def find_sequence(sequence, degenerate_pattern):
    """Return True if sequence occurs in any permutation of degenerate_pattern.

    Every permutation of ``degenerate_pattern`` is searched; for each one
    that contains ``sequence`` a line is printed showing the permutation
    with the matched span case-swapped, together with the span itself.

    sequence must consist only of real base pairs (it must match /[actg]+/).
    It is compiled as a regular expression, so other characters would be
    interpreted as regex syntax.

    Returns:
        True when at least one permutation matched, otherwise False.
    """
    re_sequence = re.compile(sequence)
    found = False
    for permutation in get_permutations(degenerate_pattern):
        match = re_sequence.search(permutation)
        if match:
            # Remember the hit; previously the function always returned
            # False, so callers reported "no matches" even after printing one.
            found = True
            print("Matched sequence '{0}' in permutation '{1}' at position {2}".format(sequence, swaprange(permutation, *match.span()), match.span()))
    return found
def main(sequence, filename="data.csv"):
    """Search every pattern in a CSV file for sequence and print the results.

    For each ``(id, pattern)`` pair produced by ``get_patterns(filename)`` a
    header line is printed, then ``find_sequence`` reports any matches; when
    it reports none, "Found no matches" is printed. A blank line separates
    the output of consecutive patterns.

    Args:
        sequence: The base-pair sequence to look for.
        filename: CSV file holding the patterns; defaults to "data.csv".
    """
    # pattern_id rather than id, so the builtin id() is not shadowed.
    for pattern_id, pattern in get_patterns(filename):
        print("Permuting pattern {0}".format(pattern_id))
        if not find_sequence(sequence, pattern):
            print("Found no matches")
        print()
def swaprange(s, left, right):
    """Return s with the case of the slice s[left:right] swapped.

    The text before ``left`` and from ``right`` onward is kept as is.
    """
    head = s[:left]
    middle = s[left:right]
    tail = s[right:]
    return "".join((head, middle.swapcase(), tail))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment