Skip to content

Instantly share code, notes, and snippets.

@blahah
Created July 29, 2014 22:30
Show Gist options
  • Save blahah/7647b6b8ab908eb7f064 to your computer and use it in GitHub Desktop.
Save blahah/7647b6b8ab908eb7f064 to your computer and use it in GitHub Desktop.
Trimmomatic wrapper script
#!/usr/bin/env ruby
#
# trim-batch
#
# Take in a set of fastq files and run trimmomatic on each
#
# You MUST specify the location of the trimmomatic JAR
#
# Richard Smith-Unna ([email protected])
#
require 'rubygems'
require 'trollop'
require 'csv'
TRIMPREFIX = 't.'
UNPAIREDPREFIX = 'u.'
opts = Trollop::options do
version "v0.0.1a"
banner <<-EOS
trim-batch: run trimmomatic on multiple fastq files.
Single-end and/or paired-end can be included.
Trimmed files are outputted as '#{TRIMPREFIX}<input filename>'.
Paired reads whose pair is discarded are outputted as '#{TRIMPREFIX}#{UNPAIREDPREFIX}<input filename>'.
Log will be printed to <data><time>.trim.csv
Make sure Trimmomatic is installed
EOS
opt :pairedfile, "A file of paired-end FASTQ file paths, 1 per line, paired files in F, R order", :type => String
opt :paired, "A list of colon separated paired input FASTQ file paths, paired files in F, R order", :type => String
opt :singlefile, "A file of single-end FASTQ file paths, 1 per line", :type => String
opt :single, "A list of colon separated single-end input FASTQ file paths", :type => String
opt :jar, "Location of the trimmomatic jar file", :required => true, :type => String
opt :adapters, "Path to adapter FASTA file. If provided, adapters will be trimmed", :type => String
opt :phred, "Quality encoding. Either 33 or 64", :type => :int, :required => true
opt :leading, "Minimum quality required to keep a leading base", :default => 15, :type => Integer
opt :trailing, "Minimum quality required to keep a trailing base", :default => 15, :type => Integer
opt :windowsize, "Size of sliding window across which to average quality", :default => 4, :type => Integer
opt :quality, "Quality cutoff to use in sliding window trimming", :default => 15, :type => Integer
opt :minlen, "Minimum length of reads (any shorter than this after trimming are discarded)", :default => 60, :type => Integer
opt :threads, "How many threads to use for trimmomatic", :default => 1, :type => :int
opt :cleanup, "Remove input files after they are processed"
opt :output, "Directory the output files go to", :type => String
opt :test, "Don't actually run anything"
opt :verbose, "Be verbose"
end
Trollop::die :phred, "must be 33 or 64" if (opts[:phred]!=33 and opts[:phred]!=64) if opts[:phred]
t0 = Time.now
pairedlist, singlelist = [], []
# check inputs
if (opts.pairedfile && opts.paired) || (opts.singlefile && opts.single)
abort "Choose either file or list input but not both"
end
# check list file and load if OK
def check_list_file(file, outlist)
unless File.exists?(file)
raise "Can't find file \"#{opts.input}\""
end
File.open(file, "r").each_line do |line|
unless line.nil?
outlist << line.chomp
end
end
end
check_list_file(opts.pairedfile, pairedlist) if opts.pairedfile
check_list_file(opts.singlefile, singlelist) if opts.singlefile
# check list and load if OK
def check_list(inlist, outlist)
p inlist, outlist
outlist += inlist.split(":")
p outlist
outlist.map! { |file| File.expand_path(file) }
outlist.each do |file|
unless File.exists?(file)
raise "Can't find file \"#{file}\""
end
end
end
check_list(opts.paired, pairedlist) if opts.paired
check_list(opts.single, singlelist) if opts.single
phred = "-phred#{opts.phred}"
# build command(s)
pairedcmd, singlecmd = nil, nil
if opts.paired || opts.pairedfile
pairedcmd = "java -jar #{opts.jar} PE #{phred} -threads #{opts.threads} INFILEF INFILER OUTFILEF OUTFILEFU OUTFILER OUTFILERU"
pairedcmd += " ILLUMINACLIP:#{opts.adapters}:2:40:15" if opts.adapters
pairedcmd += " LEADING:#{opts.leading} TRAILING:#{opts.trailing} SLIDINGWINDOW:#{opts.windowsize}:#{opts.quality} MINLEN:#{opts.minlen}"
end
if opts.single || opts.singlefile
singlecmd = "java -jar #{opts.jar} SE #{phred} -threads #{opts.threads} INFILE OUTFILE"
singlecmd += " ILLUMINACLIP:#{opts.adapters}:2:40:15" if opts.adapters
singlecmd += " LEADING:#{opts.leading} TRAILING:#{opts.trailing} SLIDINGWINDOW:#{opts.windowsize}:#{opts.quality} MINLEN:#{opts.minlen}"
end
paired_trimlog = []
unpaired_trimlog = []
inpathf=""
# trim
pairedlist.each_slice(2) do |infilef, infiler|
cmd = pairedcmd
cmd = cmd.gsub(/INFILEF/, infilef)
cmd = cmd.gsub(/INFILER/, infiler)
if opts.output
inpathf = opts.output
inpathr = opts.output
infilef = File.basename(infilef)
infiler = File.basename(infiler)
else
inpathf = File.dirname(infilef)
infilef = File.basename(infilef)
inpathr = File.dirname(infiler)
infiler = File.basename(infiler)
end
cmd = cmd.gsub(/OUTFILEF/, "#{inpathf}/#{TRIMPREFIX}#{infilef}")
cmd = cmd.gsub(/OUTFILEFU/, "#{inpathf}/#{TRIMPREFIX}#{UNPAIREDPREFIX}#{infilef}")
cmd = cmd.gsub(/OUTFILER/, "#{inpathr}/#{TRIMPREFIX}#{infiler}")
cmd = cmd.gsub(/OUTFILERU/, "#{inpathr}/#{TRIMPREFIX}#{UNPAIREDPREFIX}#{infiler}")
puts "trimming #{infilef} and #{infiler}"
ret = `#{cmd} 2>&1` if !opts.test
puts cmd if opts.verbose
if !opts.test
ret.split('\n').each do |line|
next unless line =~ /^Input/
data = /Input Read Pairs: (?<input_reads>\d+) Both Surviving: (?<both_kept>\d+) \((?<both_kept_pc>[^\)]+)\) Forward Only Surviving: (?<fwd_kept>\d+) \((?<fwd_kept_pc>[^\)]+)\) Reverse Only Surviving: (?<rev_kept>\d+) \((?<rev_kept_pc>[^\)]+)\) Dropped: (?<dropped>\d+) \((?<dropped_pc>[^\)]+)\)/.match(line)
logline = Hash[data.names.zip(data.captures)]
logline['file'] = infilef
paired_trimlog << logline
end
end
if opts.cleanup
File.delete infilef
File.delete infiler
end
end
singlelist.each do |infile|
cmd = singlecmd
cmd = cmd.gsub(/INFILE/, infile)
inpath = File.dirname(infile)
infile = File.basename(infile)
cmd = cmd.gsub(/OUTFILE/, "#{inpath}/#{TRIMPREFIX}#{infile}")
puts "trimming #{infile}"
ret = `#{cmd} 2>&1`
p ret
ret.split(/\n/).each do |line|
p line
next unless line =~ /^Input/
data = /Input Reads: (?<input_reads>\d+) Surviving: (?<kept>\d+) \((?<kept_pc>[^\)]+)\) Dropped: (?<dropped>\d+) \((?<dropped_pc>[^\)]+)\)/.match(line)
logline = Hash[data.names.zip(data.captures)]
logline['file'] = infile
unpaired_trimlog << logline
end
# File.delete infile if opts.cleanup
end
datestr = Time.now.strftime('%d_%m_%Y_%H_%M_%S')
protfile = "trim.#{datestr}.protocol"
puts "Saving protocol to #{protfile}"
File.open(protfile, 'w') {|io| io.puts opts }
logsuffix = "#{datestr}.trim.csv"
def writelog(logarr, logfile)
return if logarr.length == 0
header = logarr.first.keys
CSV.open(logfile, 'w') do |log|
log << header
logarr.each do |line|
log << header.map { |h| line[h] }
end
end
end
writelog(paired_trimlog, "paired.#{logsuffix}")
writelog(unpaired_trimlog, "unpaired.#{logsuffix}")
puts "Done! Trimmed #{pairedlist.length + singlelist.length} files in #{Time.now - t0} seconds"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment