Created
November 29, 2012 17:22
-
-
Save radaniba/4170540 to your computer and use it in GitHub Desktop.
Some times you need sequences that are unambiguous (i.e. only 'ACGT', lacking gaps). This script reads in a sequence file of any format, reports the location and nature of ambiguous characters, and optionally corrects these from a consensus sequence and s
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
checkseqs [options] INFILE [INFILE, INFILE ...] | |
where options are: | |
--repair-with-conc: patch ambiguous characters with the consensus sequence and save | |
--overwrite: newly created files can write over pre-existing ones | |
Consensus is calculated on a 50% threshold. | |
#!/usr/bin/env ruby | |
# Check alignment for gaps or ambiguous characters and possibly fix them. | |
### IMPORTS | |
require 'bio' | |
require 'test/unit/assertions' | |
require 'optparse' | |
require 'pp' | |
include Test::Unit::Assertions | |
### IMPLEMENTATION | |
def each_match (str, re, &block) | |
str.scan(re) { | |
block.call($~) | |
} | |
end | |
# Check a sequence for unclear characters | |
# | |
def check_seq(seq) | |
seq_str = seq.to_str.downcase() | |
puts "= Checking #{seq.entry_id} ..." | |
"Length: #{seq.length}" | |
#illegal_chars = seqdata.split(//).uniq() - %w[a c g t A C G T] | |
#if (0 < illegal_chars.length) | |
# puts "#{title} has ambigs or gaps '#{illegal_chars.join()}' ..." | |
#end | |
probs = [] | |
each_match(seq_str, /[^\-acgt]+/) { |m| probs << m } | |
if (0 < probs.length) | |
puts "= #{seq.entry_id} has has ambigs or gaps ..." | |
probs.each { |m| | |
puts "- '#{m[0]}' at #{m.offset(0)[0]}-#{m.offset(0)[1]}" | |
} | |
end | |
return probs | |
end | |
### MAIN | |
# Parse commandline arguments. | |
# | |
def parse_clargs(arg_arr) | |
clopts = { | |
:repair_with_conc => false, | |
:overwrite => false, | |
} | |
OptionParser.new { |opts| | |
opts.banner = "Usage: checkseqs.rb [options] <seqfile1> [seqfile2] ..." | |
opts.on('-h', '--help', 'Display this screen') { | |
puts opts | |
exit | |
} | |
opts.on('', '--repair-with-conc', | |
"'Repair' any ambiguous sites with the consensus sequence and save results") { | |
clopts[:repair_with_conc] = true | |
} | |
opts.on('', '--overwrite', | |
"If saving output, overwrite pre-existing files") { | |
clopts[:overwrite] = true | |
} | |
#opts.on("-v", "--[no-]verbose", "Run verbosely") { |v| | |
# options[:verbose] = v | |
#} | |
opts.parse!() | |
} | |
pargs = ARGV | |
assert(0 < pargs.length, "need at least one file to work on") | |
return clopts, pargs | |
end | |
# Main script functionality. | |
# | |
def main() | |
clopts, infiles = parse_clargs(ARGV) | |
infiles.each { |f| | |
puts "== Checking file #{f} ..." | |
# gather and report sequence problems | |
all_seqs = [] | |
seq_probs = {} | |
Bio::FlatFile.auto(f) { |rec| | |
rec.each { |entry| | |
seq = entry.to_seq() | |
all_seqs << seq | |
probs = check_seq(seq) | |
if (0 < probs.length) | |
seq_probs[seq] = probs | |
end | |
} | |
} | |
# correct seq if requested | |
if (clopts[:repair_with_conc]) | |
# generate conc | |
aln = Bio::Alignment.new(all_seqs) | |
conc = aln.consensus_string(0.5, :gap_mode=>-1) | |
# repair seqs | |
all_seqs.each { |s| | |
probs = seq_probs.fetch(s, nil) | |
if probs | |
probs.each { |m| | |
offset = m.offset(0) | |
s[offset[0]...offset[1]] = conc[offset[0]...offset[1]] | |
} | |
end | |
} | |
outpath = File.basename(f, File.extname(f)) + ".clean.fasta" | |
if clopts[:overwrite] | |
assert(! File.exists?(outpath)) | |
end | |
hndl = File.open(outpath, 'wb') | |
all_seqs.each { |s| | |
hndl.write(s.output()) | |
} | |
hndl.close() | |
end | |
} | |
puts "Finished." | |
end | |
if $0 == __FILE__ | |
main() | |
end | |
### END |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment