Created
August 9, 2016 10:39
-
-
Save klprint/936adc536f828011f1ea39346427b0a7 to your computer and use it in GitHub Desktop.
Searches in files with the same extension for sequences being provided by a fasta file. The script picks 15-16 base-pairs in the middle of each sequence in the fasta file and uses grep to find the number of occurrences.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import subprocess | |
import glob | |
# Functions | |
def read_multifasta(file_path): | |
is_entry = False | |
fasta_dict = {} | |
sequence = [] | |
key = None | |
with open(file_path, 'r') as fasta: | |
for line in fasta: | |
if line[0] == '>': | |
if is_entry: | |
fasta_dict[key] = ''.join(sequence) | |
sequence = [] | |
key = line.split(' ')[0][1:] | |
is_entry = True | |
else: | |
if line[0] != '\n': | |
sequence.append(line.rstrip()) | |
fasta_dict[key] = ''.join(sequence) | |
return(fasta_dict) | |
def subset_seq(sequence, q_len): | |
mid = int(len(sequence)/2) | |
start = mid - int(q_len/2) | |
end = mid + int(q_len/2) | |
sub_seq = sequence[start-1:end] | |
return(sub_seq) | |
def grep(subj_file, querry): | |
cmd = ['grep', '-c', querry, subj_file] | |
count = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0] | |
count = count.decode('utf-8') | |
count = int(count) | |
print(count) | |
return(count) | |
def main(): | |
ext = input('Extension of Read-Files (without \'.\'): ') | |
files = glob.glob('*.'+ext) | |
seqs = input('Path to querry fasta: ') | |
seqs = read_multifasta(seqs) | |
all_keys = len(seqs.keys()) | |
print(str(all_keys) + ' sequences loaded.') | |
querry_seqs = {} | |
for seqID in seqs.keys(): | |
full_seq = seqs[seqID] | |
sub_seq = subset_seq(full_seq, 15) | |
querry_seqs[seqID] = sub_seq | |
with open('countfile.out', 'w') as outfile: | |
outfile.write('ReadFile\t') | |
id_count = 0 | |
qIDs = querry_seqs.keys() # determines correct order | |
for id in qIDs: | |
id_count += 1 | |
outfile.write(id) | |
if id_count == len(qIDs): | |
outfile.write('\n') | |
else: | |
outfile.write('\t') | |
searched_keys = 1 | |
for file in files: | |
print('Started search for sequences in ' + file) | |
outfile.write(file + '\t') | |
for querryID in qIDs: | |
querry = querry_seqs[querryID] | |
print(querryID) | |
count = grep(file, querry=querry) | |
outfile.write(str(count) + '\t') | |
print(str((searched_keys/all_keys)*100) +'% searched') | |
searched_keys += 1 | |
searched_keys = 0 | |
outfile.write('\n') | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment