Created
December 17, 2019 12:55
-
-
Save philippbayer/2a7bc2a8275086945d028a80abaf3341 to your computer and use it in GitHub Desktop.
trims fastas of several sequences/genes using a single known sequence/gene
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import subprocess | |
| from Bio.Seq import Seq | |
| from Bio import SeqIO | |
| trimal = '/path/to/trimal' | |
| muscle = '/path/to/muscle3.8.31_i86linux64' | |
| # this is the fasta containing the single reference gene we use for trimming all of our seqeunces | |
| ref_gene = '/path/to/single/ref.fasta' | |
| all_out = open('file_to_fix.Trimmed.fasta', 'w') | |
| # this is the fasta containing all the sequences we want to trim | |
| for s in SeqIO.parse('file_to_fix.fasta', 'fasta'): | |
| # concatenate with single gene, align using muscle, trim beguinning and endnig gaps with trimAl | |
| with open('Both.fasta', 'w') as out: | |
| out.write(s.format('fasta')) | |
| cmd0 = 'cat %s >> Both.fasta'%ref_gene | |
| p = subprocess.Popen(cmd0, stdout=subprocess.PIPE, shell=True) | |
| (output, err) = p.communicate() | |
| p_status = p.wait() | |
| cmd1 = '%s -in Both.fasta -out Result.fasta'%muscle | |
| p = subprocess.Popen(cmd1, stdout=subprocess.PIPE, shell=True) | |
| (output, err) = p.communicate() | |
| p_status = p.wait() | |
| cmd2 = '%s -in Result.fasta -nogaps -keepheader -terminalonly > Result_Trimmed.fasta'%trimal | |
| p = subprocess.Popen(cmd2, stdout=subprocess.PIPE, shell=True) | |
| (output, err) = p.communicate() | |
| p_status = p.wait() | |
| x = [y for y in SeqIO.parse('Result_Trimmed.fasta', 'fasta')] | |
| x = x[0] | |
| x.seq = Seq(str(x.seq).replace("-", "")) | |
| all_out.write(x.format('fasta')) | |
| all_out.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment