Created
February 21, 2013 16:18
-
-
Save radaniba/5005864 to your computer and use it in GitHub Desktop.
A number of programs won't handle sequences with gaps or ambiguous characters. This then is a script that reports those problems (e.g. telling you the problem is at residue x in sequence y)and can optionally patch it with the consensus sequence at that location.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Check alignment for gaps or ambiguous characters and possibly fix them. | |
### IMPORTS | |
require 'bio' | |
require 'test/unit/assertions' | |
require 'optparse' | |
require 'pp' | |
include Test::Unit::Assertions | |
### IMPLEMENTATION | |
def each_match (str, re, &block) | |
str.scan(re) { | |
block.call($~) | |
} | |
end | |
# Check a sequence for unclear characters | |
# | |
def check_seq(seq) | |
seq_str = seq.to_str.downcase() | |
puts "= Checking #{seq.entry_id} ..." | |
"Length: #{seq.length}" | |
#illegal_chars = seqdata.split(//).uniq() - %w[a c g t A C G T] | |
#if (0 < illegal_chars.length) | |
# puts "#{title} has ambigs or gaps '#{illegal_chars.join()}' ..." | |
#end | |
probs = [] | |
each_match(seq_str, /[^\-acgt]+/) { |m| probs << m } | |
if (0 < probs.length) | |
puts "= #{seq.entry_id} has has ambigs or gaps ..." | |
probs.each { |m| | |
puts "- '#{m[0]}' at #{m.offset(0)[0]}-#{m.offset(0)[1]}" | |
} | |
end | |
return probs | |
end | |
### MAIN | |
# Parse commandline arguments. | |
# | |
def parse_clargs(arg_arr) | |
clopts = { | |
:repair_with_conc => false, | |
:overwrite => false, | |
} | |
OptionParser.new { |opts| | |
opts.banner = "Usage: checkseqs.rb [options] <seqfile1> [seqfile2] ..." | |
opts.on('-h', '--help', 'Display this screen') { | |
puts opts | |
exit | |
} | |
opts.on('', '--repair-with-conc', | |
"'Repair' any ambiguous sites with the consensus sequence and save results") { | |
clopts[:repair_with_conc] = true | |
} | |
opts.on('', '--overwrite', | |
"If saving output, overwrite pre-existing files") { | |
clopts[:overwrite] = true | |
} | |
#opts.on("-v", "--[no-]verbose", "Run verbosely") { |v| | |
# options[:verbose] = v | |
#} | |
opts.parse!() | |
} | |
pargs = ARGV | |
assert(0 < pargs.length, "need at least one file to work on") | |
return clopts, pargs | |
end | |
# Main script functionality. | |
# | |
def main() | |
clopts, infiles = parse_clargs(ARGV) | |
infiles.each { |f| | |
puts "== Checking file #{f} ..." | |
# gather and report sequence problems | |
all_seqs = [] | |
seq_probs = {} | |
Bio::FlatFile.auto(f) { |rec| | |
rec.each { |entry| | |
seq = entry.to_seq() | |
all_seqs << seq | |
probs = check_seq(seq) | |
if (0 < probs.length) | |
seq_probs[seq] = probs | |
end | |
} | |
} | |
# correct seq if requested | |
if (clopts[:repair_with_conc]) | |
# generate conc | |
aln = Bio::Alignment.new(all_seqs) | |
conc = aln.consensus_string(0.5, :gap_mode=>-1) | |
# repair seqs | |
all_seqs.each { |s| | |
probs = seq_probs.fetch(s, nil) | |
if probs | |
probs.each { |m| | |
offset = m.offset(0) | |
s[offset[0]...offset[1]] = conc[offset[0]...offset[1]] | |
} | |
end | |
} | |
outpath = File.basename(f, File.extname(f)) + ".clean.fasta" | |
if clopts[:overwrite] | |
assert(! File.exists?(outpath)) | |
end | |
hndl = File.open(outpath, 'wb') | |
all_seqs.each { |s| | |
hndl.write(s.output()) | |
} | |
hndl.close() | |
end | |
} | |
puts "Finished." | |
end | |
if $0 == __FILE__ | |
main() | |
end | |
### END |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment