Last active
December 12, 2015 07:39
-
-
Save avrilcoghlan/4738208 to your computer and use it in GitHub Desktop.
Perl script to run CNVnator on the Sanger compute farm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
=head1 NAME | |
run_cnvnator_on_farm_wrapper.pl | |
=head1 SYNOPSIS | |
run_cnvnator_on_farm_wrapper.pl input_fasta input_bam cnvnator window_size output_gff outputdir | |
where input_fasta is the input fasta file for the assembly, | |
input_bam is the input bam file, | |
cnvnator is the path to the CNVnator installation, | |
window_size is the window size (bin size) to use for CNVnator, | |
output_gff is the output gff file, | |
outputdir is the output directory for writing output files. | |
=head1 DESCRIPTION | |
This script takes an input fasta file for an assembly (<input_fasta>), and an | |
input bam file for that assembly (<input_bam>) and runs CNVnator with a certain | |
window size (<window_size). It then converts the CNVnator output into an output | |
gff file (<output_gff>). | |
=head1 VERSION | |
Perl script last edited 6-Feb-2013. | |
=head1 CONTACT | |
[email protected] (Avril Coghlan) | |
=cut | |
# | |
# Perl script run_cnvnator_on_farm_wrapper.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 6-Feb-13. | |
# Last edited 6-Feb-2013. | |
# SCRIPT SYNOPSIS: run_cnvnator_on_farm_wrapper.pl: wrapper script to run CNVnator on the farm, for an input fasta file and input bam file. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
use strict; | |
use warnings; | |
my $num_args = $#ARGV + 1; | |
if ($num_args != 6) | |
{ | |
print "Usage of run_cnvnator_on_farm_wrapper.pl\n\n"; | |
print "perl run_cnvnator_on_farm_wrapper.pl <input_fasta> <input_bam> <cnvnator> <window_size> <output_gff> <outputdir>\n"; | |
print "where <input_fasta> is the input fasta file,\n"; | |
print " <input_bam> is the input bam file,\n"; | |
print " <cnvnator> is the path to the CNVnator installation,\n"; | |
print " <window_size> is the window size (bin size) to use for CNVnator,\n"; | |
print " <output_gff> is the output gff file,\n"; | |
print " <outputdir> is the output directory for writing output files\n"; | |
print "For example, >perl run_cnvnator_on_farm_wrapper.pl /lustre/scratch108/parasites/alt/SVEN/alc_mapping_temp/Cons.fa\n"; | |
print "/lustre/scratch108/parasites/alt/SVEN/alc_mapping_temp/Output/out.sorted.bam /nfs/users/nfs_b/bf3/bin/CNVnator_v0.2.7/src/\n"; | |
print "100 cnvnator_100bp.gff /lustre/scratch108/parasites/alc/StrongyloidesCNVnator/\n"; | |
exit; | |
} | |
# FIND THE PATH TO THE INPUT FASTA FILE: | |
my $input_fasta = $ARGV[0]; | |
# FIND THE PATH TO THE INPUT BAM FILE: | |
my $input_bam = $ARGV[1]; | |
# FIND THE PATH TO THE CNVNATOR INSTALLATION: | |
my $cnvnator = $ARGV[2]; | |
# FIND THE WINDOW SIZE TO USE: | |
my $windowsize = $ARGV[3]; | |
# FIND THE PATH TO THE OUTPUT GFF FILE: | |
my $output_gff = $ARGV[4]; | |
# FIND THE DIRECTORY TO USE FOR OUTPUT FILES: | |
my $outputdir = $ARGV[5]; | |
#------------------------------------------------------------------# | |
# TEST SUBROUTINES: | |
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING. | |
&test_print_error; | |
&test_make_shellscript($outputdir); | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
&run_main_program($outputdir,$input_fasta,$input_bam,$cnvnator,$windowsize,$output_gff); | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
sub run_main_program | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN. | |
my $input_fasta = $_[1]; # THE INPUT FASTA FILE | |
my $input_bam = $_[2]; # INPUT BAM FILE | |
my $cnvnator = $_[3]; # PATH TO CNVNATOR INSTALLATION | |
my $windowsize = $_[4]; # WINDOW SIZE TO USE | |
my $output_gff = $_[5]; # THE OUTPUT GFF FILE | |
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR. | |
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR. | |
my $tempdir; # TEMPORARY DIRECTORY TO STORE THE DATA IN | |
my $pattern; # PATTERN OF TEMPORARY FASTA FILES' FILENAMES | |
my $fasta; # A TEMPORARY FASTA FILE | |
my $letters; # LETTERS IN THE NAME OF $fasta | |
my @temp; # | |
my $shellscript; # SHELL SCRIPT FOR SUBMITTING A JOB TO THE FARM | |
my $out; # FILE WITH STANDARD OUTPUT FROM FARM JOB | |
my $err; # FILE WITH STANDARD ERROR FROM FARM JOB | |
my $cmd; # COMMAND TO RUN | |
my $jobgroup; # JOB GROUP TO USE | |
my $random_number; # RANDOM NUMBER TO USE IN JOB GROUP NAME | |
my $running; # VARIABLE THAT SAYS WHETHER THE JOBS ARE STILL RUNNING | |
my $jobs; # DETAILS OF THE JOBS THAT ARE STILL RUNNING | |
my $line; # | |
my $tempout; # TEMPORARY OUTPUT FILE | |
my @shellscripts; # ARRAY OF SHELLSCRIPT NAMES | |
my $i; # | |
my $memory; # MEMORY TO REQUEST FROM THE FARM | |
my $memorytimes1000; # MEMORY TO REQUEST FROM THE FARM | |
# MAKE A TEMPORARY DIRECTORY TO STORE THE DATA IN: | |
($tempdir,$errorcode,$errormsg) = &make_dirname($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
system "mkdir $tempdir"; | |
# DIVIDE UP THE INPUT FASTA FILE INTO SMALLER FILES WITH 5 SEQUENCES EACH: | |
$cmd = "split_up_fasta.pl $input_fasta 5 fasta $tempdir"; | |
system "$cmd"; | |
# SPECIFY A JOB GROUP: | |
$random_number = rand(); | |
$jobgroup = "MyCNVnatorJobGroup".$random_number; | |
$cmd = "bgadd -L 100 \/$jobgroup"; # DEFINE A JOB GROUP WITH ONLY 100 JOBS RUN AT ONCE | |
system "$cmd"; | |
# SUBMIT THE JOBS FOR EACH OF THE SMALLER FILES: | |
chdir( $tempdir ) or die "ERROR: run_main_program: cannot chdir to $tempdir $!"; | |
$pattern = $tempdir."/fasta*"; | |
my $cnt = 0; | |
open(TMP,"ls $pattern |"); | |
while(<TMP>) | |
{ | |
$cnt++; | |
$fasta = $_; | |
chomp $fasta; | |
@temp = split(/fasta/,$fasta); | |
$letters = $temp[$#temp]; | |
# MAKE SHELL SCRIPT TO SUBMIT TO THE FARM AS A JOB: | |
($shellscript,$errorcode,$errormsg) = &make_shellscript($letters,$fasta,$outputdir,$input_bam,$tempdir,$cnvnator,$windowsize); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@shellscripts = (@shellscripts,$shellscript); | |
$out = $shellscript.".o"; | |
$err = $shellscript.".e"; | |
print STDERR "Submitting job $letters (fasta $fasta)...\n"; | |
$memory = 3500; # THIS SEEMS TO WORK OK FOR MOST JOBS | |
$memorytimes1000 = $memory*1000; | |
$cmd = "bsub -g \/$jobgroup -o $tempdir/$out -e $tempdir/$err -R \"select[mem > $memory] rusage[mem=$memory]\" -M$memorytimes1000 $outputdir/$shellscript"; | |
system "$cmd"; | |
} | |
close(TMP); | |
# CHECK IF THE JOBS IN THE JOB GROUP ARE FINISHED: | |
sleep(30); # IT TAKES A FEW SECONDS FOR THE JOBS TO APPEAR IN 'bjobs' | |
$running = 1; | |
while($running == 1) | |
{ | |
$jobs = ""; | |
$cmd = "bjobs -g \/$jobgroup"; | |
open(TMP,"$cmd |"); | |
while(<TMP>) | |
{ | |
$line = $_; | |
$jobs = $jobs.$line; | |
} | |
close(TMP); | |
# xxx print STDERR "Jobs running: $jobs\n"; | |
if (!($jobs =~ /[a-z]/) || $jobs =~ /No unfinished job found/) { $running = 0;} | |
sleep(30); # WAIT ANOTHER 30 SECONDS BEFORE LOOKING AT THE QUEUE AGAIN | |
} | |
# CHECK IF THERE ARE ERRORS IN THE STANDARD OUTPUT ERROR OR STANDARD OUTPUT FILES: | |
chdir( $outputdir ) or die "ERROR: run_main_program: cannot chdir to $outputdir $!"; | |
for ($i = 0; $i <= $#shellscripts; $i++) | |
{ | |
$shellscript = $shellscripts[$i]; | |
$out = $tempdir."/".$shellscript.".o"; | |
$err = $tempdir."/".$shellscript.".e"; | |
open(TMP,"grep Exited $out |"); | |
while(<TMP>) | |
{ | |
$line = $_; | |
if ($line =~ /Exited/) | |
{ | |
$errormsg = "ERROR: run_main_program: found error message in standard output file $out\n"; | |
$errorcode = 9; # ERRORCODE=9 (THIS SHOULDN'T HAPPEN, SO CAN'T TEST FOR) | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
} | |
} | |
open(TMP,"grep -i error $out |"); | |
while(<TMP>) | |
{ | |
$line = $_; | |
$line =~ tr/[A-Z]/[a-z]/; | |
if ($line =~ /error/) | |
{ | |
$errormsg = "ERROR: run_main_program: found error message in standard error file $err\n"; | |
$errorcode = 10; # ERRORCODE=10 (THIS SHOULDN'T HAPPEN, SO CAN'T TEST) | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
} | |
} | |
} | |
# DELETE THE JOB GROUP: | |
$cmd = "bgdel \/$jobgroup"; | |
system "$cmd"; | |
# PUT ALL THE OUTPUT FILES INTO A TEMPORARY FILE: | |
($tempout,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
$cmd = "cat $tempdir/*out > $tempout"; | |
system "$cmd"; | |
# MAKE A GFF FILE FILE: | |
$cmd = "make_cnvnator_gff.pl $tempout $output_gff $outputdir"; | |
system "$cmd"; | |
# DELETE TEMPORARY FILES: | |
system "rm -f $tempout"; | |
# xxx don't delete as can be deleted before they are run | |
# for ($i = 0; $i <= $#shellscripts; $i++) | |
# { | |
# $shellscript = $shellscripts[$i]; | |
# $shellscript = $outputdir."/".$shellscript; | |
# system "rm -f $shellscript"; | |
# } | |
# system "rm -rf $tempdir"; | |
} | |
#------------------------------------------------------------------# | |
# TEST &make_shellscript | |
sub test_make_shellscript | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO | |
my $shellscript; # SHELL SCRIPT FILE | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR | |
my $expected_shellscript; # FILE WITH EXPECTED CONTENTS OF $shellscript | |
my $differences; # DIFFERENCES BETWEEN $shellscript AND $expected_shellscript | |
my $length_differences; # LENGTH OF $differences | |
my $line; # | |
($shellscript,$errorcode,$errormsg) = &make_shellscript('aa','~alc/fasta',$outputdir,'~alc/bam','~alc/tempdir','~alc/cnvnator',30); | |
if ($errorcode != 0) { print STDERR "ERROR: test_make_shellscript: failed test1\n"; exit;} | |
($expected_shellscript,$errorcode,$errormsg) = &make_dirname($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(EXPECTED,">$expected_shellscript") || die "ERROR: test_make_shellscript: failed test1\n"; | |
print EXPECTED "#!/usr/bin/env tcsh\n"; | |
print EXPECTED "setenv ROOTSYS \"/nfs/users/nfs_b/bf3/bin/root/\"\n"; | |
print EXPECTED "setenv LD_LIBRARY_PATH \${LD_LIBRARY_PATH}:\${ROOTSYS}/lib\n"; | |
print EXPECTED "run_cnvnator_on_assembly.pl ~alc/fasta ~alc/bam myscriptaa.out ~alc/tempdir ~alc/cnvnator 30\n"; | |
close(EXPECTED); | |
$differences = ""; | |
$shellscript = $outputdir."/".$shellscript; | |
open(TEMP,"diff $shellscript $expected_shellscript |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_make_shellscript: failed test1 (shellscript $shellscript expected_shellscript $expected_shellscript)\n"; exit;} | |
system "rm -f $shellscript"; | |
system "rm -f $expected_shellscript"; | |
} | |
#------------------------------------------------------------------# | |
# MAKE SHELL SCRIPT TO SUBMIT TO THE FARM AS A JOB: | |
# SUBROUTINE SYNOPSIS: make_shellscript(): make a shellscript to submit to the farm | |
sub make_shellscript | |
{ | |
my $letters = $_[0]; # LETTERS TO USE TO IDENTIFY SHELL SCRIPT, eg. aa | |
my $fasta = $_[1]; # FASTA FILE | |
my $outputdir = $_[2]; # DIRECTORY TO WRITE OUTPUT FILES INTO | |
my $input_bam = $_[3]; # INPUT BAM FILE | |
my $tempdir = $_[4]; # TEMPORARY DIRECTORY THAT THE OUTPUT FILE WILL APPEAR IN | |
my $cnvnator = $_[5]; # PATH TO THE CNVNATOR INSTALLATION | |
my $windowsize = $_[6]; # THE WINDOW SIZE TO USE FOR CNVNATOR | |
my $shellscript; # SHELL SCRIPT | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $output; # OUTPUT FILE | |
my @temp; # | |
$output = "myscript".$letters.".out"; | |
$shellscript = $outputdir."/myscript".$letters; | |
open(SCRIPT,">$shellscript") || die "ERROR: make_shellscript: cannot open shellscript $shellscript\n"; | |
print SCRIPT "#!/usr/bin/env tcsh\n"; | |
print SCRIPT "setenv ROOTSYS \"/nfs/users/nfs_b/bf3/bin/root/\"\n"; | |
print SCRIPT "setenv LD_LIBRARY_PATH \${LD_LIBRARY_PATH}:\${ROOTSYS}/lib\n"; | |
print SCRIPT "run_cnvnator_on_assembly.pl $fasta $input_bam $output $tempdir $cnvnator $windowsize\n"; | |
close(SCRIPT); | |
system "chmod a+x $shellscript"; | |
@temp = split(/\//,$shellscript); | |
$shellscript = $temp[$#temp]; | |
return($shellscript,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO MAKE A NAME FOR A TEMPORARY DIRECTORY: | |
# SUBROUTINE SYNOPSIS: make_dirname(): make a directory name for a temporary directory | |
sub make_dirname | |
{ | |
my $outputdir = $_[0]; # OUTPUT DIRECTORY TO WRITE TEMPORARY DIRECTORY TO | |
my $found_name = 0; # SAYS WHETHER WE HAVE FOUND A DIRECTORY NAME YET | |
my $dirname = "none";# NEW DIRECTORY NAME TO USE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $poss_dirname; # POSSIBLE DIRECTORY NAME TO USE | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY DIRECTORY NAME | |
while($found_name == 0) | |
{ | |
$random_number = rand(); | |
$poss_dirname = $outputdir."/tmp".$random_number; | |
if (!(-e $poss_dirname)) | |
{ | |
$dirname = $poss_dirname; | |
$found_name = 1; | |
} | |
} | |
if ($found_name == 0 || $dirname eq 'none') | |
{ | |
$errormsg = "ERROR: make_dirame: found_name $found_name dirname $dirname\n"; | |
$errorcode = 7; # ERRORCODE=7 (SHOULDN'T HAPPEN, SO CAN'T TEST) | |
return($dirname,$errorcode,$errormsg); | |
} | |
return($dirname,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO MAKE A FILE NAME FOR A TEMPORARY FILE: | |
sub make_filename | |
{ | |
my $outputdir = $_[0]; # OUTPUT DIRECTORY TO WRITE TEMPORARY FILE NAME TO | |
my $found_name = 0; # SAYS WHETHER WE HAVE FOUND A FILE NAME YET | |
my $filename = "none";# NEW FILE NAME TO USE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $poss_filename; # POSSIBLE FILE NAME TO USE | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME | |
while($found_name == 0) | |
{ | |
$random_number = rand(); | |
$poss_filename = $outputdir."/tmp".$random_number; | |
if (!(-e $poss_filename)) | |
{ | |
$filename = $poss_filename; | |
$found_name = 1; | |
} | |
} | |
if ($found_name == 0 || $filename eq 'none') | |
{ | |
$errormsg = "ERROR: make_filename: found_name $found_name filename $filename\n"; | |
$errorcode = 6; # ERRORCODE=6 (SHOULDN'T HAPPEN, SO CAN'T TEST) | |
return($filename,$errorcode,$errormsg); | |
} | |
return($filename,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &print_error | |
sub test_print_error | |
{ | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR | |
($errormsg,$errorcode) = &print_error(45,45,1); | |
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message','My error message',1); | |
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;} | |
($errormsg,$errorcode) = &print_error('none',45,1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message', 0, 1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# PRINT OUT AN ERROR MESSAGE AND EXIT. | |
sub print_error | |
{ | |
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED. | |
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED. | |
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT | |
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; | |
exit; | |
} | |
} | |
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/)) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; | |
exit; | |
} | |
} | |
if ($errormsg eq 'none' || $errorcode == 0) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; | |
exit; | |
} | |
} | |
else | |
{ | |
print STDERR "$errormsg"; | |
exit; | |
} | |
return($errormsg,$errorcode); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment