Last active
October 6, 2017 10:34
-
-
Save IdoBar/06b04ab75aac01ad76c06e22812ab9d3 to your computer and use it in GitHub Desktop.
A bash script using GNU-Parallel to split fasta file, process it in parallel (Blast, HMMscan, etc.) and combine the output parts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash -v | |
# Define these according to the run | |
N=5000 | |
NCPUS=4 | |
# Use wildcards or specify an exact input fasta filename below (should be in your current working directory) | |
INPUT_FASTA="input.fasta" | |
DATE=`date +%d_%m_%Y` | |
REFDB="nt" | |
#CMD=`printf "hmmscan --cpu %s --domtblout \%s/\%s.pfam.domtblout ~/.hmmer-3.1/Pfam/Pfam-A.hmm \%s > \%s/\%s_pfam.log",$NCPUS` | |
# update PREFIX to your liking (a nice, short descriptive name of your split analysis) | |
PREFIX="split_longest_ORF_"$DATE | |
# For blast, set | |
SUFFIX="outfmt6" | |
# For HMMER | |
#SUFFIX="pfam.domtblout" | |
mkdir $PREFIX 2>/dev/null | |
cd $PREFIX | |
cat ../$INPUT_FASTA | parallel --gnu -kN$N --no-run-if-empty --recstart '>' --pipe "cat > $INPUT_FASTA.part{#}" | |
# remove empty files: | |
# find . -size 0 -name "$INPUT_FASTA.part*" | xargs rm | |
# Add a trailing '0' to parts 1-9 (to be able to sort the files): | |
rename -v 's/part([0-9])/part0$1/' $INPUT_FASTA.part? | |
# Create the parallel commands (your blast command goes in the printf("...") part below. | |
# %s are placements for strings that are kept in the variables at the end of the function($1 - file to process from awk, SUBDIR and FILENAME for output folder and file respectively) | |
#find `pwd` -maxdepth 1 -name "*part*" | awk -v PRE="$PREFIX" -v CPUS="$NCPUS" -v SUF="$SUFFIX" '{n=split($1,a,"/"); SUBDIR=PRE"_results" ;FILENAME=a[n]; "mkdir "SUBDIR" 2>&-" | getline ; printf "hmmscan --cpu %s --domtblout %s/%s.%s ~/.hmmer-3.1/Pfam/Pfam-A.hmm %s > %s_%s_pfam.log\n",CPUS, SUBDIR, FILENAME, SUF, $1, PRE, FILENAME}' > $PREFIX.cmds | |
# For blast, use this line: | |
find `pwd` -maxdepth 1 -name "*part*" | awk -v PRE="$PREFIX" -v CPUS="$NCPUS" -v SUF="$SUFFIX" '{n=split($1,a,"/"); SUBDIR=PRE"_results" ;FILENAME=a[n]; "mkdir "SUBDIR" 2>&-" | getline ; printf("blastn -db nt -outfmt \"6 std stitle staxids sscinames sskingdom\" -max_target_seqs 1 -max_hsps 1 -evalue 1e-5 -query %s -num_threads %s > %s/%s.%s\n", $1, CPUS, SUBDIR, FILENAME, SUF)}' > $PREFIX.cmds | |
# Finally, for running the blastp commands: | |
sort $PREFIX.cmds | parallel --gnu --progress --retries 4 --resume-failed -j50% --joblog ./$PREFIX.parallel.log | |
# -j switch will determine how many of the server CPUs to use in total (try -j25% if the system is overloaded). | |
# check log file and produce to err file: | |
awk -F"\t" 'NR>1{if ($7>0){print $9}}' ./$PREFIX.parallel.log > ./$PREFIX.failed_cmds | |
awk -F"\t" 'NR>1{if ($7==0){print $9}}' ./$PREFIX.parallel.log > ./$PREFIX.successful_cmds | |
FAILED=$( cat ./$PREFIX.failed_cmds | wc -l ) | |
set +v | |
if [ $FAILED = 0 ]; then | |
echo "All command completed successfuly, combining output file and removing temp files" | |
set -v | |
# Combine the output files: | |
cd $PREFIX"_results" | |
OUTFILE=$INPUT_FASTA.$SUFFIX | |
touch ../$OUTFILE | |
# cp $INPUT_FASTA.part01.$SUFFIX | |
# rename -v 's/part01\.//' $INPUT_FASTA.part*.$SUFFIX | |
# OUTFILE=`ls -1 -I "*part*"` | |
find . $INPUT_FASTA.part*.$SUFFIX | xargs -I '{}' cat '{}' >> ../$OUTFILE | |
# check that the combined output file exists, only then: | |
# cp $OUTFILE ../ && | |
cd .. | |
# check that the combined output file exists, and that all commands finished successfuly only then: | |
if [ -f "$OUTFILE" ]; then | |
set +v | |
printf "Combined output file can be found at %s/%s\n", `pwd` $OUTFILE | |
printf "type \"rm -r %s_results\" to remove the folder containing the completed parts\n", $PREFIX | |
fi | |
# cp $OUTFILE ../ && cd .. | |
else | |
printf "%i commands failed:\n", $FAILED | |
cat ./$PREFIX.failed_cmds | |
printf "Try to repeat the failed commands by running \"bash %s.failed_cmds\"\n", $PREFIX | |
fi | |
# Add a flag to determine whether to remove split fasta (remove the entire folder or to keep it). | |
#cd .. && rm -r $PREFIX | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment