-
-
Save sayan01/af701ccacc115598cd6aaba71c16fef7 to your computer and use it in GitHub Desktop.
"""Identify whose DNA a sequence belongs to by comparing STR run lengths.

Usage: python dna.py data.csv sequence.txt

    first arg  = database csv (header row: name column, then one column per STR)
    second arg = text file containing the DNA sequence
"""
import csv
import sys


def _has_exact_run(seq, pattern, count):
    """Return True if `pattern` repeated `count` times occurs in `seq`
    but `pattern` repeated `count + 1` times does not."""
    # "At least count repeats" and "not at least count + 1 repeats"
    # together mean the longest consecutive run is exactly `count`.
    return pattern * count in seq and pattern * (count + 1) not in seq


def find_match(header, rows, seq):
    """Return the name (first column) of the first row matching `seq`.

    header -- the database's heading row; header[1:] are the STR patterns.
    rows   -- the remaining database rows; row[0] is the person's name and
              row[i] is the repeat count for header[i], stored as text.
    seq    -- the DNA sequence to search.

    Returns None when no row matches. Raises ValueError if a count is not
    an integer, as int() does.
    """
    for row in rows:
        # A blank line in the CSV is parsed as an empty row. It has no STR
        # counts to fail on and no name to report, so skip it rather than
        # treating it as a match and crashing on row[0].
        if not row:
            continue
        # Column 0 is the name, so pair STR patterns with counts from column 1.
        if all(
            _has_exact_run(seq, pattern, int(count))
            for pattern, count in zip(header[1:], row[1:])
        ):
            return row[0]
    return None


def main(argv):
    """Run the matcher for command-line arguments `argv`; return the exit code."""
    # if user does not adhere to usage, quit
    if len(argv) != 3:
        print("Usage: python dna.py data.csv sequence.txt")
        return 1

    # open the passed files and close them after work done
    # (newline="" is what the csv module asks for when opening csv files)
    with open(argv[1], newline="") as dataf, open(argv[2]) as seqf:
        reader = csv.reader(dataf)
        header = next(reader, [])  # an empty database has no header row
        rows = list(reader)
        seq = seqf.read()  # read entire sequence into one string

    name = find_match(header, rows, seq)
    print("No match" if name is None else name)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
for i in range(1, len(row)): # for all the STR mentioned in database, check current person with seq
    this = this and header[i]*int(row[i]) in seq and header[i]*(int(row[i])+1) not in seq

Hey man, would you mind explaining this line of code for me? I'm new to Python and am confused. Thanks!
@averageandyyy
The for loop iterates i from 1 to len(row) - 1 (we skip 0 because the 0th column is the person's name, not a repetition count).
In each iteration of this loop we get a number in row[i]. This number is how many times the header[i] string should be repeated back-to-back in seq (the sequence that was entered), no more and no less. Before starting the loop we set a variable this = True. This variable is a flag for whether the current person (row) matches the sequence. Inside the for loop we check whether each of the patterns is present in seq or not.
Writing `this = this and ...` makes sure that `this` is still True at the end of the loop only if all the patterns in the row match.
header[i] is the pattern whose repeat count we want to check against row[i]. In Python, multiplying a string repeats it: "ab"*2 gives "abab". The CSV stores the counts as text, so we convert with int(row[i]) first; header[i]*int(row[i]) then gives us the exact run of repeats we want to find in the sequence.
In Python, `in` is an operator that checks whether something is present in a list or string. `header[i]*int(row[i]) in seq` means that the run we want is a substring of seq. But this is also true if the pattern is repeated more than row[i] times: for example, "abab" in "abababc" evaluates to True, which we don't want. We want the pattern to be repeated EXACTLY row[i] times in a row, so we also check that the pattern is NOT present row[i]+1 times. Logically, if the pattern is present at least row[i] times but not at least row[i]+1 times, then its longest run is EXACTLY row[i] repeats.
We `and` both of these conditions because we want both of them to be true (the pattern appears EXACTLY row[i] times, neither fewer nor more). We also `and` the result with `this` itself, because we want `this` to be True if and only if ALL the patterns match (every header[i] is present row[i] times); only then is the current person (row) a valid match. If `this` is still True at the end of the loop, we know all the patterns matched and this DNA is a match, so we print the person's name (row[0]) and stop the program.
DAMN!!
for i in range(1, len(row)): # for all the STR mentioned in database, check current person with seq
    this = this and header[i]*int(row[i]) in seq and header[i]*(int(row[i])+1) not in seq
hey man wld u mind explaining this line of code for me? im new to python and am confused. thanks!