Created
April 27, 2010 20:40
-
-
Save drio/381296 to your computer and use it in GitHub Desktop.
dolphin solexa slx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
#
# Generates a fasta file that is more manageable for
# the illumina squasher.
#
# The input fasta is rewritten in a single streaming pass:
#   * every original contig header (a line starting with '>') is replaced
#     by two lines of N padding, so reads cannot map to sequence that
#     spans two contigs;
#   * every sequence line is right-padded with N up to a fixed width, in
#     case we have lanes that have fewer bases than that width;
#   * the resulting lines are grouped into chunks and each chunk gets a
#     fresh ">fakeN" header.
#
# Everything is done in Ruby (no cat/split/rm subprocesses), so no
# temporary "x*" files are created and unrelated files in the working
# directory are never touched.

# Right-pads one sequence line with a filler character.
#
# @param line [String] input line, with or without a trailing newline
# @param width [Integer] minimum length of the returned line
# @param pad_char [String] filler character
# @return [String] the line without its newline, padded up to +width+;
#   lines already at least +width+ long are returned unpadded
def pad_line(line, width, pad_char = "N")
  bases = line.chomp
  # Clamp at zero: String#* raises ArgumentError for a negative count.
  bases + pad_char * [width - bases.size, 0].max
end

# Maps one input line to the output line(s) that replace it.
#
# @param line [String] input line
# @param width [Integer] padding width
# @return [Array<String>] two all-N lines for a contig header, otherwise
#   the single padded sequence line
def squash_line(line, width)
  return ["N" * width] * 2 if line.start_with?(">")

  [pad_line(line, width)]
end

# Streams the squashed fasta into +output+.
#
# @param lines [#each] yields the input lines (e.g. File.foreach(path))
# @param output [#<<] receives each output line, newline included
# @param chunk_size [Integer] number of lines placed under each
#   ">fakeN" header
# @param width [Integer] padding width
# @return [Integer] number of ">fakeN" chunks written
# @raise [ArgumentError] if +chunk_size+ is smaller than 1
def squash_fasta(lines, output, chunk_size, width = 50)
  raise ArgumentError, "chunk_size must be >= 1" if chunk_size < 1

  emitted = 0
  lines.each do |line|
    squash_line(line, width).each do |out_line|
      # Open a new fake contig every chunk_size output lines.
      output << ">fake#{emitted / chunk_size + 1}\n" if (emitted % chunk_size).zero?
      output << "#{out_line}\n"
      emitted += 1
    end
  end
  (emitted + chunk_size - 1) / chunk_size
end

fake_fname = "./fake_input.txt"
final_fname = "./final.fa"
n_splits = 3

if File.file?(fake_fname)
  puts "Finding number of lines of input file"
  n_lines = File.foreach(fake_fname).count

  # The chunk size is derived from the *input* line count. Because each
  # header expands to two padding lines, the output can hold slightly
  # more than n_splits chunks. Never let the size drop to zero for
  # inputs with fewer lines than n_splits.
  chunk_size = [n_lines / n_splits, 1].max

  puts "Removing '>', padding and splitting ..."
  n_chunks = File.open(final_fname, "w") do |final|
    squash_fasta(File.foreach(fake_fname), final, chunk_size)
  end
  puts "Wrote #{n_chunks} fake contigs to #{final_fname}"
else
  warn "Input file #{fake_fname} not found; nothing to do."
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment