Skip to content

Instantly share code, notes, and snippets.

@IdoBar
Last active May 16, 2017 01:26
Show Gist options
  • Save IdoBar/bad7056e6bc3e9b06ac2f982439372ca to your computer and use it in GitHub Desktop.
Save IdoBar/bad7056e6bc3e9b06ac2f982439372ca to your computer and use it in GitHub Desktop.
An updated version of the original split_blast_with_GNU-Parallel.bash script, accepting command line argumens, to allow using it from within scripts and automate in pipelines
#!/bin/bash
_usage() {
cat <<EOF
$*
Usage: $0 <[options]>
Options:
-i --in input fasta file. If not specified will defaults to stdin [-]
-o --out output combined results file. If not specified will defaults to stdout [-]
-p --parts how many parts to break to input fasta into [default:5000]
-j --jobs how many parallel jobs to run
-c --cmd the original command to run, REQUIRED AND MUST BE QUOTED!! (escape internal quotes with \" if needed)
-k --keep switch to keep the temporary folder used to split and process the input file [default:false]
-v --verbose verbose level: 0 - none, 1 - print messages, 2 - print messages and commands (not suitable in scripts when the output needs to be piped) [default:1]
-h --help print this help message
EOF
}
if [ $# = 0 ]; then _usage " >>>>>>>> no options given "; fi
# [ $# -ge 1 -a -f "$1" ] && input="$1" || input="-"
# Define these according to the run
# For HMMER
# NOTE: This requires GNU getopt. On Mac OS X and FreeBSD, you have to install this
# separately; see below.
if ! TEMP=`getopt -o i:o:p:j:c:kv:h --long in:,out:,parts:,jobs:,cmd:,keep,verbose:,help -n 'parallel_split_blast' -- "$@"`
then
# something went wrong, getopt will put out an error message for us
_usage >&2
exit 1
fi
# if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi
# Note the quotes around `$TEMP': they are essential!
eval set -- "$TEMP"
VERBOSE=1
JOBS=32
PARTS=5000
WORKDIR=`pwd`
# Use wildcards or specify an exact input fasta filename below (should be in your current working directory)
INPUT="-"
OUTFILE="-"
DATE=`date +%d_%m_%Y`
KEEP=false
# CMD='blastp -db nr -outfmt \"6 std stitle staxids sscinames sskingdom\" -max_target_seqs 1 -max_hsps 1 -evalue 1e-10'
#CMD=`printf "hmmscan --cpu %s --domtblout \%s/\%s.pfam.domtblout ~/.hmmer-3.1/Pfam/Pfam-A.hmm \%s > \%s/\%s_pfam.log",$NCPUS`
# update PREFIX to your liking (a nice, short descriptive name of your split analysis)
TMPLOC=$( mktemp -d --tmpdir=./ ) # to create a temporary folder in the current path
PREFIX="parallel_split_"$DATE
# For blast, set
SUFFIX="outfmt6"
CMD=false
while true; do
case "$1" in
-i | --in ) INPUT="$2" ; shift 2 ;;
-o | --out ) OUTFILE="$2" ; shift 2 ;;
-p | --parts ) PARTS="$2" ; shift 2 ;;
-j | --jobs ) JOBS="$2" ; shift 2 ;;
-c | --cmd ) CMD="$2" ; shift 2 ;;
-k | --keep ) KEEP=true ; shift ;;
-v | --verbose ) VERBOSE="$2" ; shift 2 ;;
-h | --help ) _usage ; shift ;;
-- ) shift; break ;;
* ) break ;;
esac
done
# Convert verbose to numeric
VERBOSE=$(( VERBOSE + 1 - 1 ))
# Check valid input (file or stdin):
if ! [[ -e "$INPUT" || "$INPUT" = "-" ]]; then
_usage " >>>>>>>> no valid input file given ($INPUT) "
exit 1
fi
if [[ "$INPUT" = "-" ]]; then
INPUT_FASTA="$PREFIX.fasta"
cat "$INPUT" > "$INPUT_FASTA"
else
INPUT_FASTA="$INPUT"
fi
# Check mandatory parameters:
if [[ "$CMD" = false ]]; then
_usage " >>>>>>>> no valid command given "
exit 1
fi
# verbose - print commands
if [[ $VERBOSE = 2 ]]; then set -v; fi
#SUFFIX="pfam.domtblout"
# mkdir $PREFIX 2>/dev/null
cd $TMPLOC # $PREFIX
cat ../$INPUT_FASTA | parallel --gnu -kN$PARTS --no-run-if-empty --recstart '>' --pipe "cat > $INPUT_FASTA.part{#}"
# remove empty files:
# find . -size 0 -name "$INPUT_FASTA.part*" | xargs rm
# Add a trailing '0' to parts 1-9 (to be able to sort the files):
rename 's/part([0-9])/part0$1/' $INPUT_FASTA.part?
# Create the parallel commands (your blast command goes in the printf("...") part below.
# %s are placements for strings that are kept in the variables at the end of the function($1 - file to process from awk, SUBDIR and FILENAME for output folder and file respectively)
find `pwd` -maxdepth 1 -name "*part*" | awk -v CMD="$CMD" -v PRE="$PREFIX" -v SUF="$SUFFIX" '{n=split($1,a,"/"); SUBDIR=PRE"_results" ;FILENAME=a[n]; "mkdir "SUBDIR" 2>&-" | getline ; printf("%s -query %s > %s/%s.%s\n", CMD, $1, SUBDIR, FILENAME, SUF)}' > $PREFIX.cmds
CMDNUM=$( wc -l < ./$PREFIX.cmds )
# Finally, for running the blastp commands in parallel:
sort $PREFIX.cmds | parallel --gnu --progress -j$JOBS --joblog ./$PREFIX.parallel.log
# check log file and produce to err file:
awk -F"\t" 'NR>1{if ($7>0){print $9}}' ./$PREFIX.parallel.log > ./$PREFIX.failed_cmds
awk -F"\t" 'NR>1{if ($7==0){print $9}}' ./$PREFIX.parallel.log > ./$PREFIX.successful_cmds
FAILED=$( wc -l < ./$PREFIX.failed_cmds )
SUCCEED=$( wc -l < ./$PREFIX.successful_cmds )
# verbose - print commands
if [[ "$FAILED" > 0 || "$SUCCEED" < "$CMDNUM" ]] ; then
if [[ "$VERBOSE" > 0 ]] ; then
set +v
>&2 printf -v int "%i commands failed:\n", $FAILED
>&2 cat ./$PREFIX.failed_cmds
if [[ "$VERBOSE" = 2 ]]; then set -v; fi # verbose - do not echo print commands
fi
read -e -p "Retry running the failed commands? (press enter for Yes, or type N/n): " -i "Yes" RETRY
if [[ "$RETRY" = "Yes" ]]; then
parallel --gnu --progress --retry-failed -j$JOBS --joblog ./$PREFIX.parallel.log
awk -F"\t" 'NR>1{if ($7==0){print $9}}' ./$PREFIX.parallel.log > ./$PREFIX.successful_cmds_retry
RETRY_OK=$( wc -l ./$PREFIX.successful_cmds_retry )
if [[ "$RETRY_OK" = "$CMDNUM" ]] ; then
if [[ "$VERBOSE" > 0 ]]; then set +v; >&2 echo "All command completed successfuly in second attempt, combining output file." ; fi
if [[ "$VERBOSE" = 2 ]]; then set -v; fi # verbose - do not echo print commands
fi
else
>&2 printf "Some failed commands remains, please check log file (%s.parallel.log)\n", $PREFIX
>&2 printf "Combining successful commands into file (%s)\n", "$INPUT_FASTA".$SUFFIX
touch "$INPUT_FASTA".$SUFFIX
find $PREFIX"_results" -type f -name "$INPUT_FASTA.part*.$SUFFIX" | xargs -I '{}' cat '{}' >> "$INPUT_FASTA".$SUFFIX
exit 1
fi
else
if [[ "$VERBOSE" > 0 ]]; then set +v; >&2 echo "All commands completed successfuly, combining output file."; fi
if [[ "$VERBOSE" = 2 ]]; then set -v; fi # verbose - echo commands
# Combine the output files:
TMPFILE="$INPUT_FASTA.$SUFFIX"
touch "$TMPFILE"
find $PREFIX"_results" -type f -name "$INPUT_FASTA.part*.$SUFFIX" | xargs -I '{}' cat '{}' >> "$TMPFILE"
# check that the combined output file exists, and that all commands finished successfuly only then:
if [[ -f "$TMPFILE" ]]; then
if [[ "$OUTFILE" = "-" ]]; then
cat "$TMPFILE"
# exit 0
else
cp "$TMPFILE" "$WORKDIR/$OUTFILE"
if [[ "$VERBOSE" > 0 ]] ; then set +v; >&2 printf "Combined output file can be found at %s\n", "$WORKDIR/$OUTFILE"; fi
if [[ "$VERBOSE" = 2 ]] ; then set -v; fi
fi
if [[ "$KEEP" = false ]]; then
if [[ "$VERBOSE" > 0 ]] ; then set +v; >&2 printf "Returning to working directory and removing temporary files and folders\n"; fi
if [[ "$VERBOSE" = 2 ]] ; then set -v; fi
cd "$WORKDIR"
rm -r "$PREFIX"* $TMPLOC
else
cd "$WORKDIR"
if [[ "$VERBOSE" > 0 ]] ; then set +v; >&2 printf "Type <rm -r %s> to remove the folder containing the temporary files\n", "$TMPLOC"; fi
if [[ "$VERBOSE" = 2 ]] ; then set -v; fi
fi
else
if [[ "$VERBOSE" > 0 ]] ; then set +v; >&2 printf "Could not find final output, check in temporary folder\n", "$TMPLOC"; fi
if [[ "$VERBOSE" = 2 ]] ; then set -v; fi
exit 1
fi
fi
exit 0
@IdoBar
Copy link
Author

IdoBar commented May 16, 2017

An updated version with documentation is available at https://github.com/IdoBar/wrap-a-blast

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment