Created
February 28, 2013 13:46
-
-
Save avrilcoghlan/5056826 to your computer and use it in GitHub Desktop.
Perl script that reads in a cigar-format alignment for a TreeFam familiy, and makes a HMM for the family
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
=head1 NAME | |
translate_treefam_cigars_to_hmms.pl | |
=head1 SYNOPSIS | |
translate_treefam_cigars_to_hmm.pl treefam_version hmms_output outputdir cigars alntype family hmmer_bin alns_output map_output | |
=head1 DESCRIPTION | |
For a TreeFam family of interest (<family>), this script reads in the cigar-format | |
alignment from a file with cigar-format TreeFam alignments (<cigars>), for | |
a particular version of the TreeFam database, and translates the cigar alignment | |
for the family of interest into a fasta format alignment, and then builds a HMM for | |
the alignment using HMMER. The argument <alntype> says whether "full" or "seed" | |
alignments should be used. The HMM is written in the output file <hmms_output>. | |
The multiple alignment is written in the output file <alns_output>. The hmmbuild | |
map output is written to file <map_output>. <hmmer_bin> is the directory containing | |
the hmmer executables. | |
=head1 VERSION | |
Perl script last edited 23-Oct-2012. | |
=head1 CONTACT | |
[email protected] (Avril Coghlan) | |
=cut | |
# | |
# Perl script translate_treefam_cigars_to_hmms.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 23-Oct-12. | |
# Last edited 23-Oct-2012. | |
# SCRIPT SYNOPSIS: translate_treefam_cigars_to_hmms.pl: reads in a cigar-format alignment for a TreeFam familiy, and makes a HMM for the family | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
use strict; | |
use warnings; | |
use DBI; | |
use Devel::Size qw(size total_size); | |
my $num_args = $#ARGV + 1; | |
if ($num_args != 9) | |
{ | |
print "Usage of translate_treefam_cigars_to_hmms.pl\n\n"; | |
print "perl translate_treefam_cigars_to_hmms.pl <treefam_version> <hmms_output> <outputdir> <cigars> <alntype> <family> <hmmer_bin> <alns_output> <map_output>\n"; | |
print "where <treefam_version> is the version of TreeFam to use (eg. 7),\n"; | |
print " <hmms_output> is the output file of HMMs,\n"; | |
print " <outputdir> is the directory for writing output files,\n"; | |
print " <cigars> is the file with cigar-format alignments,\n"; | |
print " <family> is the family of interest,\n"; | |
print " <hmmer_bin> is the directory containing the hmmer executable\n"; | |
print "For example, >perl translate_treefam_cigars_to_hmms.pl 7 myhmms\n"; | |
print "/nfs/users/nfs_a/alc/Documents/StrongyloidesTreeFam alignments full TF101000\n"; | |
print "/software/hmmer/bin/ myalns mymap\n"; | |
exit; | |
} | |
# FIND THE TREEFAM VERSION TO USE: | |
my $treefam_version = $ARGV[0]; | |
# FIND THE OUTPUT FILE NAME OF HMMs: | |
my $hmms_output = $ARGV[1]; | |
# FIND THE DIRECTORY TO WRITE OUTPUT FILES TO: | |
my $outputdir = $ARGV[2]; | |
# FIND THE FILE WITH CIGAR-FORMAT ALIGNMENTS: | |
my $cigars = $ARGV[3]; | |
# FIND THE TYPE OF ALIGNMENT TO USE: | |
my $alntype = $ARGV[4]; | |
# FIND THE FAMILY OF INTEREST: | |
my $family = $ARGV[5]; | |
# FIND THE DIRECTORY WITH THE HMMER EXECUTABLES: | |
my $hmmer_bin = $ARGV[6]; | |
# FIND THE OUTPUT MULTIPLE ALIGNMENT FILE: | |
my $alns_output = $ARGV[7]; | |
# FIND THE hmmbuild MAP OUTPUT FILE: | |
my $map_output = $ARGV[8]; | |
# TEST SUBROUTINES: | |
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING. | |
my $PRINT_WARNINGS = 0; # SAYS WHETHER TO PRINT WARNINGS ABOUT MISSING DATA. | |
&test_print_error; | |
&test_get_fasta_aln_for_seq; | |
&test_make_hmm($outputdir); | |
&test_add_sequences_to_cigarfile($outputdir); | |
&test_remove_nonunique_seqs_from_cigarfile($outputdir); | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
&run_main_program($treefam_version,$hmms_output,$outputdir,$cigars,$alntype,$family,$hmmer_bin,$alns_output,$map_output); | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
sub run_main_program | |
{ | |
my $version = $_[0]; ## THE TREEFAM VERSION TO USE. | |
my $output = $_[1]; ## THE OUTPUT FILE. | |
my $outputdir = $_[2]; ## DIRECTORY TO WRITE OUTPUT FILES TO. | |
my $cigarfile = $_[3]; ## THE FILE WITH TREEFAM CIGAR-FORMAT ALIGNMENT. | |
my $alntype = $_[4]; ## FIND THE TYPE OF ALIGNMENT TO USE | |
my $family = $_[5]; ## THE FAMILY OF INTEREST | |
my $hmmer_build = $_[6]; ## DIRECTORY CONTAINING THE HMMER EXECUTABLES | |
my $alns_output = $_[7]; ## OUTPUT MULTIPLE ALIGNMENT FILE | |
my $map_output = $_[8]; ## hmmbuild MAP OUTPUT FILE | |
my $errorcode; ## RETURNED AS 0 IF THERE IS NO ERROR. | |
my $errormsg; ## RETURNED AS 'none' IF THERE IS NO ERROR. | |
my $new_cigarfile; ## CIGAR FILE WITH SEQUENCES WITH NON-UNIQUE DISPLAY_IDs REMOVED | |
my $new_cigarfile2; ## CIGAR FILE WITH DISPLAY_IDs REPLACED BY SEQUENCE IDs | |
my $new_cigarfile3; ## CIGAR FILE WITH THE SEQUENCES ADDED | |
# REPLACE THE DISPLAY_IDs IN THE CIGAR-FORMAT ALIGNMENT FILE WITH THE SEQUENCE IDENTIFIERS: | |
($new_cigarfile,$errorcode,$errormsg) = &replace_display_ids_with_seq_ids($cigarfile,$version,$alntype,$family,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# WRITE THE SEQUENCES IN THE CIGAR-FORMAT ALIGNMENT FILE: | |
($new_cigarfile2,$errorcode,$errormsg) = &add_sequences_to_cigarfile($new_cigarfile,$version,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
system "rm -f $new_cigarfile"; | |
# JUST TAKE SEQUENCES WITH UNIQUE DISPLAY_IDs IN THE CIGAR-FORMAT ALIGNMENT FILE: | |
($new_cigarfile3,$errorcode,$errormsg) = &remove_nonunique_seqs_from_cigarfile($new_cigarfile2); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
system "rm -f $new_cigarfile2"; | |
# CONVERT THE CIGAR-FORMAT ALIGNMENT FILE TO A HMM FILE: | |
($errorcode,$errormsg) = &make_hmm_from_cigar_alignment($new_cigarfile3,$hmmer_bin,$output,$family,$alns_output,$map_output); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
system "rm -f $new_cigarfile3"; | |
} | |
#------------------------------------------------------------------# | |
# TEST &add_sequences_to_cigarfile | |
sub test_add_sequences_to_cigarfile | |
{ | |
my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES TO | |
my $random_number; # NUMBER TO USE IN TEMPORARY FILE NAMES | |
my $cigarfile; # THE CIGAR FILE | |
my $new_cigarfile; # NAME OF THE NEW CIGAR FILE | |
my $expected_cigarfile; # FILE CONTAINING EXPECTED CONTENTS OF $new_cigarfile | |
my $differences; # DIFFERENCES BETWEEN $new_cigarfile AND $expected_cigarfile | |
my $length_differences; # LENGTH OF $differences | |
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR | |
my $line; # | |
# OPEN A NEW FILE FOR WRITING THE NEW CIGAR-FORMAT ALIGNMENT: | |
$random_number = rand(); | |
$cigarfile = $outputdir."/tmp".$random_number; | |
open(CIGARFILE,">$cigarfile") || die "ERROR: test_add_sequences_to_cigarfile: cannot open $cigarfile\n"; | |
print CIGARFILE "BGIOSIFCE007851.1_species BGIOSIFCE007851.1 50D21M1D1M1D8M1D7M3D2M31D4M3D5M38D6M68D11M15D16M2D27M5D16M4D1M1D21M6D4M6D4M10D46M20D75M1D35M3D24M1D27M23D29M158D\n"; | |
print CIGARFILE "Bm1_12035A_species Bm1_12035A 12D78M3D89M66D16M11D17M2D27M4D17M4D1M1D21M7D3M4D6M10D17M2D11M1D2M1D12M20D28M2D2M1D23M3D51M4D3M1D20M1D25M28D26M76D28M3D11M40D\n"; | |
print CIGARFILE "ZK507.6.1_species ZK507.6.1 26D47M1D21M31D12M24D20M67D12M14D17M2D24M8D16M4D23M4D16M10D17M2D11M1D2M1D12M20D28M2D2M1D23M3D51M4D3M1D20M1D25M26D28M76D28M3D27M24D\n"; | |
close(CIGARFILE); | |
# OPEN A FILE CONTAINING THE EXPECTED CONTENTS OF $new_cigarfile: | |
$random_number = rand(); | |
$expected_cigarfile = $outputdir."/tmp".$random_number; | |
open(EXPECTED,">$expected_cigarfile") || die "ERROR: test_add_sequences_to_cigarfile: cannot open $expected_cigarfile\n"; | |
print EXPECTED "BGIOSIFCE007851.1_species BGIOSIFCE007851.1 50D21M1D1M1D8M1D7M3D2M31D4M3D5M38D6M68D11M15D16M2D27M5D16M4D1M1D21M6D4M6D4M10D46M20D75M1D35M3D24M1D27M23D29M158D MDSIMEPYVADLLADDITASMVELLSGDGGAAQMDVGVLDAYLRAIGALPAHPAAPGADLAAAAEVESMASNDDTNGNWDTKVDAKVPSAFLPPPPGFPPLPVPALANEPVYAAPVDEGDAIRAFMQQLEWSEQYNGDDDAPAPDDSMASRPQLCAPYDDDIDANLRAMEKDAAERPSPDYLDTVHNGQISAASRASLVAWMGRLTHRYELAAGTLHRAVSYFDRFLSARALPSYTEHQLSLVGATAVYTAAKYEDQGTVFKLDAREIASYGEFASAQEVLAMEREMMAALGYRLGGPNAETFVEHFTRYSKGKEELRVQRLARHIADRSLESYGCLGYLPSVVAAAVISIARWTLNPPGALPWSSELHELTGYSSQHISSCVLTVLNTQ\n"; | |
print EXPECTED "Bm1_12035A_species Bm1_12035A 12D78M3D89M66D16M11D17M2D27M4D17M4D1M1D21M7D3M4D6M10D17M2D11M1D2M1D12M20D28M2D2M1D23M3D51M4D3M1D20M1D25M28D26M76D28M3D11M40D MNGASDENAENRFGEENEENEQFLRRKGQRKSRPHALAYRNTNISGSVSKKYEEKGVYISIIEKSLLLDKKRRRESFQVLFIFLCQRNQEPPTRSRWLPTGISAFTVFCDEDNGQEVGSSKDPEKNDQQQGEGSKRSGEMSTQYLQIPRKIRRPLATLSPNRDDSDSESNRPLARNLSPNHEESDLESSTPSDANLQTDEGEDSYHSRNSTTSIDSSDISANELRPDDEVVFVVDSVSNNVRRAFKSVPDISSQAATTSSSIIYTTEEHEDRVRTDPVYDADIYLYMRYRELKLCPSGDFLEFQEEICGEVRYLLVEWICDSAREFNLSTESLHMAVSIVDRVLNTLQCPREKLQLLGAAALLLAAKFEEIYPPEVKEFARITAYTFHETEIIRMERIVFARVEYQLLAPTSWWFASRLVRMAHVPRIIYCMMRYLLELALLDHTFLDFRPSVIGAASFFLSNVIFKSDYLLLTTETDIGVNELRSPARKMLELFHEVPNKDYCSVFEKYATEKYEQVSCLLLPDNFSELDVTR\n"; | |
print EXPECTED "ZK507.6.1_species ZK507.6.1 26D47M1D21M31D12M24D20M67D12M14D17M2D24M8D16M4D23M4D16M10D17M2D11M1D2M1D12M20D28M2D2M1D23M3D51M4D3M1D20M1D25M26D28M76D28M3D27M24D MRSALSLKPSNGNAAKSQAVNNKNVIKNAPLGGKLTRQIGTSNLLQQALPSKKIDESPIIKIDAKDSFKVFEDQEPEKENSSENVDATEKDSNVIPAEDNNMIHELERKMEEKSRAEKLKFKFMQTRDNSDITSRFSEPPSEFSVLCDDDDCDKVSVASSTFTTSVRATFSSFHFDENQRKKEFGKEEAVKKIQKKAAKEARDDSMFSSEEFFPDIIKYMLHRQTKNRASHECFDIQSQVNEEMRTILIDWFSDVVKEYNFQKETFHLAVSLVDRALSMFNIDKMRFQLVGTTSMMIAVKYEEIFPPEIEDFALITDNTYRVPDILLMERFLLGKFDFVVAMPTSSWFGTCFAKRMNFTKKMRNTVHYLLELSLIDVHFLRYRPSDIAAAACCFANLQADVESWPQKMVDDTGISTEDFVDVLRDLHRMYLNASTADFKSIFYNYSETAQMEVALLPAPTDKLRSMFPSIFVTAPKSSNDSSSPQ\n"; | |
close(EXPECTED); | |
# RUN &add_sequences_to_cigarfile: | |
($new_cigarfile,$errorcode,$errormsg) = &add_sequences_to_cigarfile($cigarfile,"7",$outputdir); | |
$differences = ""; | |
if ($new_cigarfile eq '') { print STDERR "ERROR: test_add_sequences_to_cigarfile: failed test1 (new_cigarfile $new_cigarfile)\n"; exit;} | |
open(TEMP,"diff $new_cigarfile $expected_cigarfile |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0 || $errorcode != 0) { print STDERR "ERROR: test_add_sequences_to_cigarfile: failed test1\n"; exit;} | |
# DELETE TEMPORARY FILES: | |
system "rm -f $new_cigarfile"; | |
system "rm -f $expected_cigarfile"; | |
system "rm -f $cigarfile"; | |
} | |
#------------------------------------------------------------------# | |
# WRITE THE SEQUENCES IN THE CIGAR-FORMAT ALIGNMENT FILE: | |
sub add_sequences_to_cigarfile | |
{ | |
my $cigarfile = $_[0]; ## THE CIGAR FILE | |
my $version = $_[1]; ## VERSION OF THE TREEFAM DATABASE TO USE | |
my $outputdir = $_[2]; ## THE DIRECTORY TO WRITE OUTPUT FILES TO | |
my $new_cigarfile; ## NEW CIGAR FILE WITH THE SEQUENCES OF INTEREST | |
my $id; ## SEQUENCE IDENTIFIER FOR THE DISPLAY ID | |
my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR | |
my @temp; ## | |
my $line; ## | |
my $st; ## | |
my $sth; ## | |
my $rv; ## | |
my @array; ## | |
my $database; ## THE TREEFAM DATABASE TO USE | |
my $dbh; ## | |
my $sequence; ## THE SEQUENCE FOR THE IDENTIFIER | |
my $cigar; ## THE CIGAR FOR THE IDENTIFIER | |
my $table_w; ## THE DATABASE TABLE TO USE | |
my $random_number; ## RANDOM NUMBER TO USE IN TEMPORARY FILE NAME | |
my $display_id; ## THE DISPLAY_ID | |
# OPEN A NEW FILE FOR WRITING THE NEW CIGAR-FORMAT ALIGNMENT: | |
$random_number = rand(); | |
$new_cigarfile = $outputdir."/tmp".$random_number; | |
open(NEW_CIGARFILE,">$new_cigarfile") || die "ERROR: add_sequences_to_cigarfile: cannot open $new_cigarfile\n"; | |
# CONNECT TO THE DATABASE AND GET ALL FAMILIES OF THIS TYPE: | |
$database = "treefam_".$version; | |
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return; | |
# GET THE ALIGNMENT FOR EACH FAMILY FROM THE DATABASE: | |
$table_w = "aa_seq"; | |
open(CIGARFILE,"$cigarfile") || die "ERROR: cannot open $cigarfile\n"; | |
while(<CIGARFILE>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
$display_id = $temp[0]; | |
$id = $temp[1]; | |
$cigar = $temp[2]; | |
$st = "SELECT SEQ from $table_w WHERE ID=\'$id\'"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
$sequence = "none"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$sequence = $array[0]; | |
} | |
} | |
print NEW_CIGARFILE "$display_id $id $cigar $sequence\n"; | |
} | |
close(CIGARFILE); | |
close(NEW_CIGARFILE); | |
$dbh->disconnect(); | |
return($new_cigarfile,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# REPLACE THE DISPLAY_IDs IN THE CIGAR-FORMAT ALIGNMENT FILE WITH THE SEQUENCE IDENTIFIERS: | |
# NOTE: THIS IS HARD TO TEST AS IT EXPECTS THAT $new_idfile AND $new_cigarfile WILL HAVE THE SAME EXACT IDENTIFIERS, | |
# AND I WOULD NEED A VERY SMALL FAMILY TO USE AS A TEST CASE. | |
# SUBROUTINE SYNOPSIS: replace_display_ids_with_seq_ids(): replaces display IDs in a cigar-format alignment for a TreeFam family with sequence IDs | |
sub replace_display_ids_with_seq_ids | |
{ | |
my $cigarfile = $_[0]; ## THE CIGAR FILE | |
my $version = $_[1]; ## VERSION OF THE TREEFAM DATABASE TO USE | |
my $aln_type = $_[2]; ## TYPE OF ALIGNMENT TO USE | |
my $family = $_[3]; ## TREEFAM FAMILY | |
my $outputdir = $_[4]; ## DIRECTORY TO WRITE OUTPUT FILES TO | |
my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR | |
my $new_cigarfile = "none";## THE NEW CIGAR FILE, SORTED BY DISPLAY_ID | |
my $new_cigarfile2 = "none";## THE NEW CIGAR FILE, WITH ID INSTEAD OF DISPLAY_ID | |
my $random_number; ## RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES | |
my $idfile; ## TEMPORARY FILE FOR PUTTING THE IDENTIFIERS FOR THE FAMILY | |
my $new_idfile; ## A SORTED VERSION OF $idfile | |
my $new_idfile_lineno; ## NUMBER OF LINES IN $new_idfile | |
my $new_cigarfile_lineno; ## NUMBER OF LINES IN $new_cigarfile | |
my $cmd; ## COMMAND TO RUN | |
# SORT THE CIGAR FILE BY THE DISPLAY_ID: | |
$random_number = rand(); | |
$new_cigarfile = $outputdir."/tmp".$random_number; | |
system "sort -k1,1 $cigarfile > $new_cigarfile"; | |
# GET THE IDENTIFIERS FOR THE FAMILY IN A TEMPORARY FILE: | |
($idfile,$errorcode,$errormsg) = &get_ids_for_family($family,$version,$aln_type,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# SORT THE IDENTIFIERS FILE BY DISPLAY_ID: | |
$random_number = rand(); | |
$new_idfile = $outputdir."/tmp".$random_number; | |
system "sort -k1,1 $idfile > $new_idfile"; | |
# WE CAN NOW DELETE $idfile: | |
system "rm -f $idfile"; | |
# CHECK THAT $new_cigarfile AND $new_idfile HAVE THE SAME NUMBER OF LINES: | |
open(TMP,"wc -l $new_idfile | cut -d\" \" -f1 |"); | |
while(<TMP>) | |
{ | |
$new_idfile_lineno = $_; | |
chomp $new_idfile_lineno; | |
} | |
close(TMP); | |
open(TMP,"wc -l $new_cigarfile | cut -d\" \" -f1 |"); | |
while(<TMP>) | |
{ | |
$new_cigarfile_lineno = $_; | |
chomp $new_cigarfile_lineno; | |
} | |
close(TMP); | |
if ($new_idfile_lineno != $new_cigarfile_lineno) | |
{ | |
$errormsg = "ERROR: replace_display_ids_with_seq_ids: new_idfile_lineno $new_idfile_lineno ($new_idfile) new_cigarfile_lineno $new_cigarfile_lineno ($new_cigarfile)\n"; | |
$errorcode = 2; # ERRORCODE=2 | |
return($new_cigarfile2,$errorcode,$errormsg); | |
} | |
# MAKE A NEW CIGAR FILE WITH THE DISPLAY_ID, ID COLUMN FROM $new_idfile_lineno AND THE CIGAR COLUMN FROM $new_cigarfile_lineno: | |
$random_number = rand(); | |
$new_cigarfile2 = $outputdir."/tmp".$random_number; | |
$cmd = "paste $new_idfile $new_cigarfile | awk '{print \$1,\$2,\$4}' > $new_cigarfile2"; | |
system "$cmd"; | |
# WE CAN NOW DELETE $new_cigarfile: | |
system "rm -f $new_cigarfile"; | |
system "rm -f $new_idfile"; | |
return($new_cigarfile2,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &remove_nonunique_seqs_from_cigarfile | |
sub test_remove_nonunique_seqs_from_cigarfile | |
{ | |
my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES TO | |
my $random_number; # NUMBER TO USE IN TEMPORARY FILE NAMES | |
my $cigarfile; # THE CIGAR FILE | |
my $new_cigarfile; # NAME OF THE NEW CIGAR FILE | |
my $expected_cigarfile; # FILE CONTAINING EXPECTED CONTENTS OF $new_cigarfile | |
my $differences; # DIFFERENCES BETWEEN $new_cigarfile AND $expected_cigarfile | |
my $length_differences; # LENGTH OF $differences | |
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR | |
my $line; # | |
# OPEN A NEW FILE FOR WRITING THE NEW CIGAR-FORMAT ALIGNMENT: | |
$random_number = rand(); | |
$cigarfile = $outputdir."/tmp".$random_number; | |
open(CIGARFILE,">$cigarfile") || die "ERROR: test_add_sequences_to_cigarfile: cannot open $cigarfile\n"; | |
print CIGARFILE "BGIOSIFCE007851.1_species BGIOSIFCE007851.1 50D21M1D1M1D8M1D7M3D2M31D4M3D5M38D6M68D11M15D16M2D27M5D16M4D1M1D21M6D4M6D4M10D46M20D75M1D35M3D24M1D27M23D29M158D AAA\n"; | |
print CIGARFILE "Bm1_12035A_species Bm1_12035A 12D78M3D89M66D16M11D17M2D27M4D17M4D1M1D21M7D3M4D6M10D17M2D11M1D2M1D12M20D28M2D2M1D23M3D51M4D3M1D20M1D25M28D26M76D28M3D11M40D GGG\n"; | |
print CIGARFILE "ZK507.6.1_species ZK507.6.1 26D47M1D21M31D12M24D20M67D12M14D17M2D24M8D16M4D23M4D16M10D17M2D11M1D2M1D12M20D28M2D2M1D23M3D51M4D3M1D20M1D25M26D28M76D28M3D27M24D CCC\n"; | |
print CIGARFILE "Bm1_12035A_species Bm1_12035A 12D78M3D89M66D16M11D17M2D27M4D17M4D1M1D21M7D3M4D6M10D17M2D11M1D2M1D12M20D28M2D2M1D23M3D51M4D3M1D20M1D25M28D26M76D28M3D11M40D TTT\n"; | |
close(CIGARFILE); | |
# OPEN A FILE CONTAINING THE EXPECTED CONTENTS OF $new_cigarfile: | |
$random_number = rand(); | |
$expected_cigarfile = $outputdir."/tmp".$random_number; | |
open(EXPECTED,">$expected_cigarfile") || die "ERROR: test_add_sequences_to_cigarfile: cannot open $expected_cigarfile\n"; | |
print EXPECTED "BGIOSIFCE007851.1_species BGIOSIFCE007851.1 50D21M1D1M1D8M1D7M3D2M31D4M3D5M38D6M68D11M15D16M2D27M5D16M4D1M1D21M6D4M6D4M10D46M20D75M1D35M3D24M1D27M23D29M158D AAA\n"; | |
print EXPECTED "ZK507.6.1_species ZK507.6.1 26D47M1D21M31D12M24D20M67D12M14D17M2D24M8D16M4D23M4D16M10D17M2D11M1D2M1D12M20D28M2D2M1D23M3D51M4D3M1D20M1D25M26D28M76D28M3D27M24D CCC\n"; | |
close(EXPECTED); | |
# RUN &remove_nonunique_seqs_from_cigarfile: | |
($new_cigarfile,$errorcode,$errormsg) = &remove_nonunique_seqs_from_cigarfile($cigarfile); | |
$differences = ""; | |
open(TEMP,"diff $new_cigarfile $expected_cigarfile |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0 || $errorcode != 0) { print STDERR "ERROR: test_remove_nonunique_seqs_from_cigarfile: failed test1\n"; exit;} | |
# DELETE TEMPORARY FILES: | |
system "rm -f $new_cigarfile"; | |
system "rm -f $expected_cigarfile"; | |
system "rm -f $cigarfile"; | |
# RUN &remove_nonunique_seqs_from_cigarfile: | |
($new_cigarfile,$errorcode,$errormsg) = &remove_nonunique_seqs_from_cigarfile(""); | |
if ($errorcode != 3) { print STDERR "ERROR: test_remove_nonunique_seqs_from_cigarfile: failed test2\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# JUST TAKE SEQUENCES WITH UNIQUE DISPLAY_IDs IN THE CIGAR-FORMAT ALIGNMENT FILE: | |
sub remove_nonunique_seqs_from_cigarfile | |
{ | |
my $cigarfile = $_[0]; ## THE FILE WITH THE CIGAR-FORMAT ALIGNMENT | |
my %CNT = (); ## COUNT OF TIMES A SEQUENCE APPEARS IN THE CIGAR-FORMAT FILE | |
my $display_id; ## DISPLAY_ID FOR A SEQUENCE IN THE CIGAR-FORMAT FILE | |
my $line; ## | |
my @temp; ## | |
my $new_cigarfile = "none";## THE NEW CIGAR FILE | |
my $random_number; ## RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES | |
my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR | |
my $id; ## IDENTIFIER FOR A SEQUENCE | |
# FIND OUT HOW MANY TIMES EACH DISPLAY_ID APPEARS IN $cigarfile: | |
if ($cigarfile eq '') | |
{ | |
$errormsg = "ERROR: remove_nonunique_seqs_from_cigarfile: cigarfile $cigarfile\n"; | |
$errorcode = 3; # ERRORCODE=3 | |
return($new_cigarfile,$errorcode,$errormsg); | |
} | |
open(CIGARFILE,"$cigarfile") || die "ERROR: cannot open $cigarfile\n"; | |
while(<CIGARFILE>) | |
{ | |
$line = $_; | |
@temp = split(/\s+/,$line); | |
if ($#temp == 3) | |
{ | |
$display_id = $temp[0]; | |
if (!($CNT{$display_id})) { $CNT{$display_id} = 1;} | |
else { $CNT{$display_id}++; } | |
} | |
} | |
close(CIGARFILE); | |
# WRITE A NEW FILE WITH JUST THOSE DISPLAY_IDs THAT APPEARED ONCE IN $cigarfile: | |
$random_number = rand(); | |
$new_cigarfile = $outputdir."/tmp".$random_number; | |
open(NEW_CIGARFILE,">$new_cigarfile") || die "ERROR: remove_nonunique_seqs_from_cigarfile: cannot open $new_cigarfile\n"; | |
open(CIGARFILE,"$cigarfile") || die "ERROR: cannot open $cigarfile\n"; | |
while(<CIGARFILE>) | |
{ | |
$line = $_; | |
@temp = split(/\s+/,$line); | |
if ($#temp == 3) | |
{ | |
$display_id = $temp[0]; | |
if (!($CNT{$display_id})) | |
{ | |
$errormsg = "ERROR: remove_nonunique_seqs_from_cigarfile: do not now count for $display_id\n"; | |
$errorcode = 1; # ERRORCODE=1 | |
return($new_cigarfile,$errorcode,$errormsg); | |
} | |
if ($CNT{$display_id} == 1) | |
{ | |
print NEW_CIGARFILE "$line"; | |
} | |
} | |
# IGNORE THE FIRST LINE OF $cigarfile (FAMILY NAME) AND LAST LINE (JUST SAYS '#END') | |
} | |
close(CIGARFILE); | |
close(NEW_CIGARFILE); | |
return($new_cigarfile,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE CIGAR-FORMAT ALIGNMENTS FOR THE FAMILIES: | |
sub make_hmm_from_cigar_alignment | |
{ | |
my $cigarfile = $_[0]; ## THE FILE WITH THE CIGAR-FORMAT ALIGNMENTS | |
my $hmmer_bin = $_[1]; ## DIRECTORY CONTAINING THE HMMER EXECUTABLES | |
my $output = $_[2]; ## THE OUTPUT FILE | |
my $family = $_[3]; ## THE FAMILY | |
my $mfafile = $_[4]; ## MULTIPLE SEQUENCE ALIGNMENT FILE | |
my $map_output = $_[5]; ## hmmbuild MAP OUTPUT FILE | |
my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR | |
my $line; ## | |
my @temp; ## | |
my $cigar; ## CIGAR STRING FOR A SEQUENCE | |
my $mfa_line_length = "none";## LENGTH OF THE ENTRIES IN THE MULTIPLE ALIGNMENT | |
my $sequence; ## SEQUENCE FOR A IDENTIFIER | |
my $id; ## IDENTIFIER | |
my $fasta; ## FASTA FORMAT ALIGNMENT FOR SEQUENCE | |
my $random_number; ## RANDOM NUMBER FOR TEMPORARY FILE NAMES | |
my $display_id; ## DISPLAY_ID FOR A SEQUENCE | |
my $core_species_genes = 0; ## SAYS WHETHER GENES ARE TAKEN FROM THE CORE SPECIES | |
# OPEN A FILE FOR STORING THE FASTA-FORMAT MULTIPLE ALIGNMENT: | |
open(MFAFILE,">$mfafile") || die "ERROR: make_hmm_from_cigar_alignment: cannot open $mfafile\n"; | |
# READ IN THE CIGAR-FORMAT ALIGNMENTS: | |
open(CIGARS,"$cigarfile") || die "ERROR: read_cigar_alignments: cannot open $cigarfile\n"; | |
while(<CIGARS>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
if ($#temp != 3) | |
{ | |
$errormsg = "ERROR: make_hmm_from_cigar_alignment: temp $#temp - line $line in $cigarfile\n"; | |
$errorcode = 4; | |
return($errorcode,$errormsg); | |
} | |
$display_id = $temp[0]; | |
$id = $temp[1]; | |
$cigar = $temp[2]; | |
$sequence = $temp[3]; | |
# CHECK IF THIS IS ONE OF THE TREEFAM 'CLEAN' SPECIES: | |
if ($display_id =~ /_CAEEL/ || $display_id =~ /_CAEBR/ || $display_id =~ /_BRUMA/ || | |
$display_id =~ /_DROME/ || $display_id =~ /_DROPE/ || $display_id =~ /_ANOGA/ || $display_id =~ /_AEDAE/ || | |
$display_id =~ /_SCHMA/ || $display_id =~ /_NEMVE/ || $display_id =~ /_CIOIN/ || | |
$display_id =~ /_HUMAN/ || $display_id =~ /_PANTR/ || $display_id =~ /_MACMU/ || | |
$display_id =~ /_RAT/ || $display_id =~ /_MOUSE/ || $display_id =~ /_BOVIN/ || $display_id =~ /_CANFA/ || | |
$display_id =~ /_MONDO/ || $display_id =~ /_CHICK/ || $display_id =~ /_BRARE/ || $display_id =~ /_XENTR/ || | |
$display_id =~ /_TETNG/ || $display_id =~ /_GASAC/ || $display_id =~ /_YEAST/ || $display_id =~ /_SCHPO/ || | |
$display_id =~ /_ARATH/ || $display_id =~ /_STRPU/ || $display_id =~ /_DICDI/) | |
{ | |
$core_species_genes = 1; | |
# CONVERT THE CIGAR FOR $seq TO A FASTA FORMAT ALIGNMENT: | |
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq($sequence,$cigar); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
if ($fasta eq 'none') | |
{ | |
print STDERR "WARNING: $family failed (cigar does not match sequence)\n"; | |
return($errorcode,$errormsg); | |
} | |
if ($mfa_line_length eq 'none') { $mfa_line_length = length($fasta);} | |
else | |
{ | |
if ($mfa_line_length != length($fasta)) | |
{ | |
$errormsg = "ERROR: read_cigar_alignments: length of $fasta is not expected length for family $family\n"; | |
$errorcode = 15; # ERRORCODE=15 | |
return($errorcode,$errormsg); | |
} | |
} | |
print MFAFILE ">$id\n"; | |
print MFAFILE "$fasta\n"; | |
} | |
} | |
close(CIGARS); | |
close(MFAFILE); | |
if ($core_species_genes == 0) | |
{ | |
print STDERR "WARNING: $family failed: no core species genes for multiple alignment\n"; | |
} | |
else | |
{ | |
# MAKE A HMM FOR THIS FAMILY: | |
($errorcode,$errormsg) = &make_hmm($mfafile,$hmmer_bin,$output,$map_output); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
} | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &make_hmm | |
sub test_make_hmm | |
{ | |
my $outputdir = $_[0]; ## DIRECTORY FOR WRITING OUTPUT FILES TO | |
my $errorcode; ## RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; ## RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $random_number; ## RANDOM NUMBER FOR TEMPORARY FILE NAME | |
my $mfafile; ## FILE FOR WRITING MULTIPLE ALIGNMENT TO | |
my $hmmfile; ## FILE FOR WRITING THE HMM TO | |
$random_number = rand(); | |
$mfafile = $outputdir."/tmp".$random_number; | |
$random_number = rand(); | |
$hmmfile = $outputdir."/tmp".$random_number; | |
open(MFAFILE,">$mfafile") || die "ERROR: test_make_hmm: cannot open $mfafile\n"; | |
print MFAFILE ">seq1\nACGTACC\n"; | |
print MFAFILE ">seq2\n-CCTACC\n"; | |
print MFAFILE ">seq3\nAC-TACC\n"; | |
close(MFAFILE); | |
($errorcode,$errormsg) = &make_hmm($mfafile,"/software/hmmer/bin/",$hmmfile); | |
if ($errorcode != 0) { print STDERR "ERROR: test_make_hmm: failed test1\n"; exit;} | |
system "rm -f $mfafile"; | |
system "rm -f $hmmfile"; | |
} | |
#------------------------------------------------------------------# | |
# MAKE A HMM FOR THIS FAMILY: | |
sub make_hmm | |
{ | |
my $mfafile = $_[0]; ## FILE CONTAINING THE MULTIPLE ALIGNMENT FOR THE FAMILY | |
my $hmmer_bin = $_[1]; ## DIRECTORY CONTAINING THE HMMER EXECUTABLES | |
my $output = $_[2]; ## NAME OF THE OUTPUT HMM FILE | |
my $map_output = $_[3]; ## NAME OF THE hmmbuild MAP OUTPUT FILE | |
my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR | |
my $cmd; ## COMMAND TO RUN | |
# RUN HMMER2 TO MAKE THE HMM: | |
$cmd = $hmmer_bin."/hmmbuild -o $map_output $output $mfafile >& /dev/null"; | |
system "$cmd"; | |
$cmd = $hmmer_bin."/hmmcalibrate $output >& /dev/null"; | |
system "$cmd"; | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &get_fasta_aln_for_seq | |
sub test_get_fasta_aln_for_seq | |
{ | |
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $fasta; # FASTA FORMAT ALIGNMENT FOR A SEQUENCE | |
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","3D6M"); | |
if ($fasta ne '---ABCDEF' || $errorcode != 0) { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test1\n";} | |
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","1D6M1D"); | |
if ($fasta ne '-ABCDEF-' || $errorcode != 0) { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test2\n";} | |
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","3M3D3M"); | |
if ($fasta ne 'ABC---DEF' || $errorcode != 0) { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test3\n";} | |
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","3M3I3M"); | |
if ($errorcode != 14) { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test4\n";} | |
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","3M"); | |
if ($errorcode != 0 || $fasta ne 'none') { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test5\n";} | |
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","20M"); | |
if ($errorcode != 0 || $fasta ne 'none') { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test6\n";} | |
} | |
#------------------------------------------------------------------# | |
# CONVERT THE CIGAR FOR A SEQUENCE TO A FASTA FORMAT ALIGNMENT: | |
# SUBROUTINE SYNOPSIS: get_fasta_aln_for_seq(): converts a cigar-format alignment for a TreeFam family into fasta-format | |
sub get_fasta_aln_for_seq | |
{ | |
my $sequence = $_[0]; # PROTEIN SEQUENCE FOR A TREEFAM GENE | |
my $cigar = $_[1]; # CIGAR FORMAT ALIGNMENT FOR THE SEQUENCE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS "none" IF THERE IS NO ERROR | |
my $fasta = ""; # FASTA FORMAT ALIGNMENT FOR THE SEQUENCE | |
my @tmp; # | |
my @tmp2; # | |
my $i; # | |
my $cigar_type; # TYPE OF ELEMENT IN THE CIGAR STRING | |
my $cigar_len; # LENGTH OF ELEMENT IN THE CIGAR STRING | |
my $j; # | |
my $sequence_index; # INDEX IN THE SEQUENCE $sequence | |
my $aa; # AN AMINO ACID IN SEQUENCE $sequence | |
my $sequence_length; # LENGTH OF $sequence | |
my $last_letter; # LAST LETTER OF $sequence | |
# REMOVE * FROM THE END OF $sequence: | |
if ($sequence eq '') | |
{ | |
$errormsg = "ERROR: get_fasta_aln_for_seq: sequence $sequence\n"; | |
$errorcode = 3; | |
return($fasta,$errorcode,$errormsg); | |
} | |
$last_letter = substr($sequence,length($sequence)-1,1); | |
if ($last_letter eq '*') { chop($sequence);} | |
# REMOVE . FROM THE END OF $sequence (OCCURS FOR SOME SPECIES): | |
$last_letter = substr($sequence,length($sequence)-1,1); | |
if ($last_letter eq '.') { chop($sequence);} | |
@tmp = split(/\d+/,$cigar); | |
@tmp2 = split(/\D+/,$cigar); | |
# LOOK AT EACH ELEMENT IN THE CIGAR STRING | |
$sequence_index = -1; | |
$sequence_length = length($sequence); | |
for ($i = 0; $i <= $#tmp2; $i++) | |
{ | |
$cigar_type = $tmp[($i+1)]; | |
$cigar_len = $tmp2[$i]; | |
if ($cigar_type eq 'D') | |
{ | |
for ($j = 1; $j <= $cigar_len; $j++) | |
{ | |
$fasta = $fasta."-"; | |
} | |
} | |
elsif ($cigar_type eq 'M') | |
{ | |
for ($j = 1; $j <= $cigar_len; $j++) | |
{ | |
$aa = "*"; | |
while($aa eq '*' || $aa eq '.') # IGNORE *s OR .s IN THE SEQUENCE (SOME SEQUENCES HAVE INTERNAL '*' OR '.'): | |
{ | |
$sequence_index++; | |
if ($sequence_index > $sequence_length - 1) | |
{ | |
$fasta = "none"; | |
if ($PRINT_WARNINGS == 1) { print STDERR "WARNING: get_fasta_aln_for_seq: sequence_length $sequence_length but sequence_index $sequence_index\n";} | |
return($fasta,$errorcode,$errormsg); | |
} | |
$aa = substr($sequence,$sequence_index,1); | |
} | |
$fasta = $fasta.$aa; | |
} | |
} | |
else | |
{ | |
$errormsg = "ERROR: get_fasta_aln_for_seq: cigar_type $cigar_type\n"; | |
$errorcode = 14; # ERRORCODE=14 | |
return($fasta,$errorcode,$errormsg); | |
} | |
} | |
if ($sequence_index != $sequence_length - 1) # THE CIGAR ALIGNMENT AND SEQUENCE ARE NOT COMPATIBLE: | |
{ | |
$fasta = 'none'; | |
if ($PRINT_WARNINGS == 1) { print STDERR "WARNING: get_fasta_aln_for_seq: sequence_length $sequence_length but sequence_index $sequence_index\n"; } | |
return($fasta,$errorcode,$errormsg); | |
} | |
return($fasta,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# GET THE IDENTIFIERS FOR THE SEQUENCES IN THE FAMILY: | |
# SUBROUTINE SYNOPSIS: get_ids_for_family(): get the identifiers for the sequences in a TreeFam family | |
sub get_ids_for_family | |
{ | |
my $family = $_[0]; ## TREEFAM FAMILY IDENTIFIER | |
my $version = $_[1]; ## VERSION OF THE TREEFAM DATABASE TO USE | |
my $alntype = $_[2]; ## TYPE OF ALIGNMENTS TO USE (SEED/FULL) | |
my $outputdir = $_[3]; ## DIRECTORY FOR WRITING OUTPUT FILES TO | |
my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR | |
my $database; ## DATABASE TO CONNECT TO | |
my $dbh; ## | |
my $table_w; ## DATABASE TABLE THAT WE WANT TO LOOK AT | |
my $st; ## | |
my $sth; ## | |
my $rv; ## | |
my @array; ## | |
my $id; ## TAXONOMY ID. | |
my $disp_id; ## DISPLAY ID. | |
my $idfile; ## TEMPORARY FILE WITH THE IDENTIFIERS FOR A FAMILY | |
my $random_number; ## RANDOM NUMBER TO USE IN TEMPORARY OUTPUT FILE | |
# SORT THE CIGAR FILE BY THE DISPLAY_ID: | |
$random_number = rand(); | |
$idfile = $outputdir."/tmp".$random_number; | |
open(IDFILE,">$idfile") || die "ERROR: get_ids_for_family: cannot open $idfile\n"; | |
# CONNECT TO THE DATABASE: | |
$database = "treefam_".$version; | |
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return; | |
# CHECK THAT WE ARE USING EITHER SEED OR FULL ALIGNMENTS: | |
if ($alntype ne 'seed' && $alntype ne 'full') | |
{ | |
$errormsg = "ERROR: get_ids_for_family: alntype $alntype\n"; | |
$errorcode = 16; # ERRORCODE=16 | |
$dbh->disconnect(); | |
return($idfile,$errorcode,$errormsg); | |
} | |
# GET THE SEQUENCES FOR THEFAMILY FROM THE DATABASE: | |
if ($alntype eq 'full') | |
{ | |
$table_w = "aa_full_align"; | |
} | |
else | |
{ | |
$table_w = "aa_seed_align"; | |
} | |
$st = "SELECT ID, DISP_ID from $table_w WHERE AC=\'$family\'"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$id = $array[0]; | |
$disp_id = $array[1]; | |
print IDFILE "$disp_id $id\n"; | |
} | |
} | |
else | |
{ | |
print STDERR "WARNING: $family failed (no sequences found)\n"; | |
if ($PRINT_WARNINGS == 1) { print STDERR "WARNING: get_ids_for_family: no sequences found for family $family\n";} | |
$dbh->disconnect(); | |
return($idfile,$errorcode,$errormsg); | |
} | |
$dbh->disconnect(); | |
return($idfile,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &print_error | |
sub test_print_error | |
{ | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR | |
($errormsg,$errorcode) = &print_error(45,45,1); | |
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message','My error message',1); | |
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;} | |
($errormsg,$errorcode) = &print_error('none',45,1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message', 0, 1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# PRINT OUT AN ERROR MESSAGE AND EXIT. | |
sub print_error | |
{ | |
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED. | |
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED. | |
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT | |
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; | |
exit; | |
} | |
} | |
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/)) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; | |
exit; | |
} | |
} | |
if ($errormsg eq 'none' || $errorcode == 0) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; | |
exit; | |
} | |
} | |
else | |
{ | |
print STDERR "$errormsg"; | |
exit; | |
} | |
return($errormsg,$errorcode); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment