Created
August 27, 2022 11:26
-
-
Save conchoecia/dd858a46cd7d8dfb492570e4b0185f31 to your computer and use it in GitHub Desktop.
these functions take a DNA sequence and return the coordinates (python syntax) of the contigs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
myseq = "ACNNNNNNNNNNNNNCATGTACTTGGATCTATCGGTGATCGGATCGGTATGCGTACGAGTGTCAGTCNNNNNNNNNNNNNNNACGTGGTATGCGGCATGCGTAGCGTCAGCTAGCTGATATTGCGTAGCNNNNNNNNNNGGTATGCGTG" | |
def contig_ranges(seq, sub): | |
indices = list(find_all(seq, sub)) | |
gap_starts = [] | |
gap_stops = [] | |
gap_ranges = [] | |
prev = -1 | |
# get the starts of the gaps | |
for i in range(len(indices)): | |
if prev != indices[i] - 1: | |
gap_starts.append(indices[i]) | |
prev = indices[i] | |
# get the indices of the ends of the gaps | |
prev = 999999999999 | |
for i in range(len(indices) - 1, -1, -1): | |
if prev != indices[i] + 1: | |
gap_stops.append(indices[i]) | |
prev = indices[i] | |
gap_stops = [x + len(sub) for x in gap_stops[::-1]] | |
contigs = [] | |
# now get the ranges | |
for i in range(len(gap_stops)): | |
gap_ranges.append([gap_starts[i], gap_stops[i]]) | |
if i == 0: | |
contigs.append([0, gap_starts[i]]) | |
contigs.append([gap_stops[i], gap_starts[i+1]]) | |
elif i == (len(gap_stops) -1): | |
contigs.append([gap_stops[i], len(seq)]) | |
else: | |
contigs.append([gap_stops[i], gap_starts[i+1]]) | |
return contigs | |
def find_all(seq, sub): | |
start = 0 | |
indices = [] | |
while True: | |
start = seq.find(sub, start) | |
if start == -1: return | |
yield start | |
#start += len(sub) # use start += 1 to find overlapping matches | |
start += 1 # use start += 1 to find overlapping matches | |
return indices | |
# first argument is the sequence | |
# second sequence is the minimum number of Ns in a gap | |
ranges = contig_ranges(myseq, "NNNN") | |
print(ranges) | |
print(myseq) | |
for start, stop in ranges: | |
print(myseq) | |
print("{}{}".format("".join([" "] * start), myseq[start:stop])) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment