Last active
December 17, 2015 21:59
-
-
Save blahah/5678470 to your computer and use it in GitHub Desktop.
Snippet to extract consensus joined sequences from Sanger sequencing with forward and reverse primers (parsing data in the format provided by Source Bioscience), and some extra code to reconstruct the original fragment sequences for this specific case (virus protein sequencing with forward and revers primers from two overlapping fragments from e…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'bio' | |
# load sequences, reverse-complement-align the pairs | |
seqs = {} | |
# get all .seq files | |
Dir.chdir('../raw') do | |
Dir['*.seq'].each do |file| | |
# load as fasta format | |
Bio::FastaFormat.open(file).each do |f| | |
# grab each fasta entry and extract metadata from id | |
virus, gene, orientation = f.entry_id.scan(/[0-9]+_([0-9]+)_([0-9][Ab])([FR])_.+/).first | |
seqs[virus] = {} if !seqs.has_key? virus | |
seqs[virus][gene] = {} if !seqs[virus].has_key? gene | |
# store in hash | |
seqs[virus][gene][orientation] = f.to_seq | |
end | |
end | |
end | |
def stripN(seq) | |
# this is ugly - a recursive end-search would be much prettier | |
seq = seq.gsub(/^[atgc]{0,3}n+[atgc]{0,3}n*[atgc]{0,3}n*[atgc]{0,3}n*[atgc]{0,3}n*[atgc]{0,3}n*/, '') | |
seq = seq.gsub(/n+[atgc]{0,3}n*[atgc]{0,3}n*[atgc]{0,3}n*[atgc]{0,3}n*[atgc]{0,3}n*[atgc]{0,3}$/, '') | |
return seq | |
end | |
def checkSequence(seq, cutoff) | |
# return true if sequence < cutoff% N | |
c = seq.composition | |
c = c['n'] / c.values.reduce(:+).to_f | |
if c < cutoff | |
return true | |
else | |
return false | |
end | |
end | |
# store final sequences | |
final4a = {} | |
final2b = {} | |
def storeSeq(key, seq, hash) | |
hash[key] = seq | |
end | |
# for each virus | |
seqs.each_pair do |virus, vdata| | |
# for each gene | |
vdata.each_pair do |gene, gdata| | |
output = gene == '4A' ? final4a : final2b | |
# get F and R | |
f, r = gdata['F'], gdata['R'] | |
f.na; r.na | |
f, r = stripN(f), stripN(r).complement | |
keepf, keepr = checkSequence(f, 0.2), checkSequence(r, 0.2) | |
if keepf && keepr | |
# both sequences good | |
# perform alignment of the two ends | |
clustal = Bio::ClustalW.new | |
aln = clustal.query_alignment([f, r.complement]).alignment | |
seq = Bio::Sequence::NA.new(aln.consensus_string(threshold=0.5, :gap_mode=>-1)) | |
storeSeq(virus, seq, output) | |
elsif keepf | |
puts "reverse #{virus}:#{gene} sequence was low quality" | |
storeSeq(virus, f, output) | |
elsif keepr | |
puts "forward #{virus}:#{gene} sequence was low quality" | |
storeSeq(virus, r, output) | |
else | |
puts "both #{virus}:#{gene} sequences were low quality" | |
end | |
end | |
end | |
File.open('../joined_sequences/joined.fa', 'w') do |out| | |
final4a.each_pair do |virus, foura| | |
twob = final2b[virus] | |
clustal = Bio::ClustalW.new | |
aln = clustal.query_alignment([twob, foura]).alignment | |
seq = Bio::Sequence::NA.new(aln.consensus_string(threshold=0.5, :gap_mode=>-1)) | |
out.puts seq.to_fasta(virus, 60) | |
end | |
end | |
File.open('../joined_sequences/4a.fa', 'w') do |out| | |
final4a.each_pair do |virus, seq| | |
out.puts seq.to_fasta(virus, 60) | |
end | |
end | |
File.open('../joined_sequences/2b.fa', 'w') do |out| | |
final2b.each_pair do |virus, seq| | |
out.puts seq.to_fasta(virus, 60) | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment