Skip to content

Instantly share code, notes, and snippets.

@afrendeiro
Created August 5, 2013 10:33
Show Gist options
  • Save afrendeiro/6154959 to your computer and use it in GitHub Desktop.
Save afrendeiro/6154959 to your computer and use it in GitHub Desktop.
Blastx xml output parser
#!/usr/bin/awk -f
# Author: Laurent Manchon ([email protected])
# Split a big BLAST output in XML format into several files.
# Run split_xml_blast without parameters to see usage.
# Initialisation: validate the command line and prepare the XML header and
# footer that wrap every output chunk.
#   ARGV[1]  number of <Iteration> records per output file (positive integer)
#   ARGV[2]  BLAST XML input file
# On a bad command line the usage banner is printed and awk exits with status 1.
BEGIN {
    # Require a strictly positive whole number. Values such as "0", "-" or
    # "5.5" would make the "cpt == max + 1" test in the </Iteration> rule
    # unreachable, so the input would never be split.
    if (ARGC == 3 && ARGV[1] ~ /^[0-9]+$/ && ARGV[1] + 0 > 0) {
        # max is number of sequences per output file
        max = ARGV[1] + 0
        # Blank the slot so awk does not treat the count as an input file.
        ARGV[1] = ""
    } else {
        # END still runs after exit; this flag makes it bail out immediately.
        assert_exit = 1
        usage()
    }

    # cpt: 1-based position of the next record inside the current chunk.
    # nb:  1-based number of the current output file.
    cpt = nb = 1
    suffix = ".xml"

    # Footer appended to every chunk that is closed after reaching max records.
    end = "</BlastOutput_iterations>\n</BlastOutput>"

    # Fixed header written at the top of every chunk. Lines of the input that
    # precede the first <Iteration> are never copied, so this hard-coded text
    # replaces the input file's own header.
    begin = "<?xml version=\"1.0\"?>\n<!DOCTYPE BlastOutput PUBLIC \"-//NCBI//NCBI BlastOutput/EN\" \"http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd\">\n"
    begin = begin "<BlastOutput>\n<BlastOutput_program>blastx</BlastOutput_program>\n<BlastOutput_version>blastx 2.2.18 [Mar-02-2008]</BlastOutput_version>\n"
    begin = begin "<BlastOutput_reference></BlastOutput_reference>\n<BlastOutput_db>/home/data/blastdb/nr</BlastOutput_db>\n<BlastOutput_query-ID>lcl|1_0</BlastOutput_query-ID>\n"
    begin = begin "<BlastOutput_query-def></BlastOutput_query-def>\n<BlastOutput_query-len></BlastOutput_query-len>\n<BlastOutput_param>\n<Parameters>\n<Parameters_matrix>BLOSUM62</Parameters_matrix>\n"
    begin = begin "<Parameters_expect>0.1</Parameters_expect>\n<Parameters_gap-open>11</Parameters_gap-open>\n<Parameters_gap-extend>1</Parameters_gap-extend>\n<Parameters_filter>F</Parameters_filter>\n"
    begin = begin "</Parameters>\n</BlastOutput_param>\n<BlastOutput_iterations>"
}
# usage -- write the help banner to standard output, then terminate the
# program with exit status 1. Takes no arguments and does not return.
function usage()
{
    # One print statement; the explicit "\n" separators plus the final output
    # record separator produce the same fifteen lines as individual prints.
    print \
        "###################################################################################" "\n" \
        "# split_xml_blast -- split big blast output in xml format into severals files. #" "\n" \
        "# Performed in Awk v3.1 A.V. Aho, P.J. Weinberger, and B.W. Kernighan #" "\n" \
        "# OS supported: *nix, Windows9x/NT #" "\n" \
        "###################################################################################" "\n" \
        "# Author: Laurent Manchon #" "\n" \
        "# If you have comments or questions, send to the author at: #" "\n" \
        "# [email protected] #" "\n" \
        "###################################################################################" "\n" \
        "# #" "\n" \
        "# This program takes a file containing blast result in XML format and split #" "\n" \
        "# it into severals small files, as: split_xml_blast <nb> <input_filename> #" "\n" \
        "# with <nb>: Number of sequences per output file #" "\n" \
        "# #" "\n" \
        "###################################################################################"
    exit 1
}
# Opening tag of a record: work out which chunk file it belongs to, start the
# chunk with the fixed header if this is its first record, and copy the line.
/<Iteration>/ {
    # Build the output prefix from the input name by removing only the final
    # extension of the last path component. Splitting on the first "." instead
    # would turn "./blast.xml" into an empty prefix and "run.1/blast.xml"
    # into "run".
    file = FILENAME
    sub("[.][^./]*$", "", file)
    file = file "_"

    # Chunk files are named <prefix>_<nb>.xml
    output_file = file nb suffix

    # i == 1 tells the copy rule that lines now belong to an open chunk.
    i = 1

    # cpt == 1 means this is the first record of the chunk: emit the header.
    if (cpt == 1) {
        print begin >> output_file
    }
    print $0 >> output_file

    # The line is already written; skip the generic copy rule.
    next
}
# While a chunk is open (i is 1), append the current input line to it.
i == 1 {
    print >> output_file
}
# Closing tag of a record: count it and, once the chunk holds max records,
# finish the chunk and arm the counters for the next one.
/<\/Iteration>/ {
    # Advance the in-chunk counter; nothing more to do until the chunk is full.
    if (++cpt != max + 1)
        next

    # Chunk is full: write the footer and release the file.
    print end >> output_file
    close(output_file)

    # Move on to the next chunk file and stop copying until the next
    # <Iteration> line re-enables it.
    nb += 1
    cpt = 1
    i = 0
    next
}
# Final report: print how many chunk files were produced and list them.
END {
    # usage() already reported a bad command line; just propagate the status.
    if (assert_exit) exit 1

    # nb is advanced as soon as a chunk is closed, so it names a file that has
    # not been created yet unless a chunk is still open (i == 1). Without this
    # correction the count is one too high whenever the number of records is
    # an exact multiple of max, or when the input had no records at all.
    if (i == 1) {
        # Release the last, partially filled chunk before listing the files.
        close(output_file)
        written = nb
    } else {
        written = nb - 1
    }

    print "\nYour input file",FILENAME,"has just been splitted into",written,"files with",max,"sequences per file:\n"

    # Only list files when at least one exists; with an empty prefix the glob
    # would otherwise match every .xml file in the current directory.
    if (written > 0) {
        # Single-quote the prefix for the shell so names containing spaces or
        # metacharacters are passed to ls literally; the glob stays unquoted.
        quoted = file
        gsub(/'/, "'\\''", quoted)
        system("ls -1 '" quoted "'*.xml")
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment