sahrendt0 · October 9, 2012 22:01
diff --git a/ps2a.pl b/ps2a.pl
 #!/usr/bin/perl -w
 # Script: ps2a.pl
 # Description: GEN220 homework set 2, questions 1-4
 # Author: Steven Ahrendt
 # email: [email protected]
 # Due: 10.9.12
 ##############################
 # [x]: Q1 (length of sequence & num of codons)
 # [x]: Q2 (first start codon & length of 5' UTR)
 # [x]: Q3 (first stop codon)
 # [x]: Q4 (length of: CDS, predicted protein, & 3' UTR)
 ################################

 use strict;

 my $infile = shift;

 ## Read in and process sequence
 open(FAS,"<$infile") or die "Can't open file \"$infile\"...\n";
 my $seqlen = 0;
 my $sequence = "";
 foreach my $line (<FAS>)
 {
  next if ($line =~ m/^>/);
  chomp $line;
  $sequence = join("",$sequence,$line);
 }
 close(FAS);
 $seqlen = length($sequence);
 print "This sequence is of length $seqlen bp and there are ".($seqlen/3)." codons.\n"; # Answer to Q1

 ## Q2/3/4: CDS and UTRs
 my $ind = index($sequence, "ATG");
 my $start = $ind; # index of start codon
 my $stop = -1;    # index of stop codon (-1 for now)
 while($ind < length($sequence))
 {
  my $codon = substr($sequence,$ind,3);
  if(($codon eq "TAA") || ($codon eq "TAG") || ($codon eq "TGA"))
  {
    $stop = ($ind+1); # +1 offset because sequences start at 1, not 0
    last;
  }
  $ind = $ind+3;
 }
 print "The first start codon is at basepair ".($start+1).", and the in-frame stop codon is at basepair $stop.\n";
 my $UTR5 = $start; # In this case, index of start codon also gives length of 5' UTR
 my $UTR3 = $seqlen-($stop+3)+1; # +3 offset because 3' UTR does not contain stop codon
 print "The 5'UTR is $UTR5 bp long and the 3' UTR is $UTR3 bp long.\n";
 my $CDS = $seqlen-$UTR5-$UTR3;
 print "The CDS length is $CDS bp and the predicted protein length is ".(($CDS-3)/3).".\n"; # -3 offset because protein does not contain stop codon
diff --git a/ps2b.pl b/ps2b.pl
 #!/usr/bin/perl -w
 # Script: ps2b.pl
 # Description: GEN220 homework set 2, questions  5-7
 # Author: Steven Ahrendt
 # email: [email protected]
 # Due: 10.9.12
 ##############################
 # [x]: Q5 (alignment hash: name & length of seqs)
 # [x]: Q6 (num of gapped columns in alignment)
 # [x]: Q7 (identical residues b/w two seqs)
 ################################

 use strict;

 ## Q5/6/7: Alignment
 my %alignment;
 $alignment{AAC35278} =   "LLIAITYYNEDKVLTARTLHGVMQNPAWQKIVVCLVFDGIDPVLATIGV-VMKKDVDGKE";
 $alignment{AnCSMA} =     "AMCLVTCYSEGEEGIRTTLDSIALTPN-SHKSIVVICDGIIKVLRMMRD-TGSKRHNMAK";
 $alignment{AfCHSF} =     "ALCLVTCYSEGEEGIRTTLDSIAMTPN-SHKTIIVICDGIIKVLRMMRD-TGSKRHNMAK";
 $alignment{AAF19527} =   "AILLVTAYSEGELGIRTTLDSIATTPN-SHKTILVICDGIIKVLGMMKD-RGSKRHNMAK";
 $alignment{"P30573-1"} = "TINLVTCYSEDEEGIRITLDSIATTPN-SHKLILVICDGIIKVLDMMSDAQGSKRHNMAK";

 ## Loop through the sequences in the hash
 #   For each sequence, get the number of residues and the number of gaps
 my $num_gaps = 0; # Total number of gapped columns
 my @gap_columns;  # Store the indices of the columns with gaps
 foreach my $key (keys %alignment)
 {
  my @cols = split("",$alignment{$key});
  my $gaps = 0; # Number of gaps in this sequence
  for(my $i =0; $i < scalar @cols; $i++)
  {
    if($cols[$i] eq "-")
    {
      $gaps++;
      my $present = 0;
      foreach my $index (@gap_columns)
      {
        if($index == $i){$present = 1;} # If the index exists in the array
      }
      if(!$present){push(@gap_columns,$i);}
    }
  }
  print $key," ",length($alignment{$key})-$gaps,"\n"; # Answer to Q5
 }
 print "The number of gapped columns in this alignment is ".scalar(@gap_columns).".\n"; # Answer to Q6

 ## Nested loops through the hash
 #   Compare each sequence to the one after it (pairwise)
 my @a_keys = keys %alignment;
 for(my $fc = 0; $fc < scalar(@a_keys); $fc++) # fc = first counter
 {
  for(my $sc = $fc+1; $sc < scalar(@a_keys); $sc++) # sc = second counter
  {
    my @first = split("",$alignment{$a_keys[$fc]});  # First sequence 
    my @second = split("",$alignment{$a_keys[$sc]}); # Second sequence
    my $pos = 0;                                     # Position in the sequence
    my $ident = 0;                                   # Num of identical residues
    while ($pos < scalar @first)
    {
      if($first[$pos] eq $second[$pos]){$ident++;}
      $pos++;
    }
    print "Identical residues between $a_keys[$fc] and $a_keys[$sc] is $ident.\n"; # Answer to Q7
  }
 }
	#!/usr/bin/perl -w
	# Script: ps2a.pl
	# Description: GEN220 homework set 2, questions 1-4
	# Author: Steven Ahrendt
	# email: [email protected]
	# Due: 10.9.12
	##############################
	# [x]: Q1 (length of sequence & num of codons)
	# [x]: Q2 (first start codon & length of 5' UTR)
	# [x]: Q3 (first stop codon)
	# [x]: Q4 (length of: CDS, predicted protein, & 3' UTR)
	################################

	use strict;

	my $infile = shift;

	## Read in and process sequence
	open(FAS,"<$infile") or die "Can't open file \"$infile\"...\n";
	my $seqlen = 0;
	my $sequence = "";
	foreach my $line (<FAS>)
	{
	next if ($line =~ m/^>/);
	chomp $line;
	$sequence = join("",$sequence,$line);
	}
	close(FAS);
	$seqlen = length($sequence);
	print "This sequence is of length $seqlen bp and there are ".($seqlen/3)." codons.\n"; # Answer to Q1

	## Q2/3/4: CDS and UTRs
	my $ind = index($sequence, "ATG");
	my $start = $ind; # index of start codon
	my $stop = -1; # index of stop codon (-1 for now)
	while($ind < length($sequence))
	{
	my $codon = substr($sequence,$ind,3);
	if(($codon eq "TAA") \|\| ($codon eq "TAG") \|\| ($codon eq "TGA"))
	{
	$stop = ($ind+1); # +1 offset because sequences start at 1, not 0
	last;
	}
	$ind = $ind+3;
	}
	print "The first start codon is at basepair ".($start+1).", and the in-frame stop codon is at basepair $stop.\n";
	my $UTR5 = $start; # In this case, index of start codon also gives length of 5' UTR
	my $UTR3 = $seqlen-($stop+3)+1; # +3 offset because 3' UTR does not contain stop codon
	print "The 5'UTR is $UTR5 bp long and the 3' UTR is $UTR3 bp long.\n";
	my $CDS = $seqlen-$UTR5-$UTR3;
	print "The CDS length is $CDS bp and the predicted protein length is ".(($CDS-3)/3).".\n"; # -3 offset because protein does not contain stop codon
	#!/usr/bin/perl -w
	# Script: ps2b.pl
	# Description: GEN220 homework set 2, questions 5-7
	# Author: Steven Ahrendt
	# email: [email protected]
	# Due: 10.9.12
	##############################
	# [x]: Q5 (alignment hash: name & length of seqs)
	# [x]: Q6 (num of gapped columns in alignment)
	# [x]: Q7 (identical residues b/w two seqs)
	################################

	use strict;

	## Q5/6/7: Alignment
	my %alignment;
	$alignment{AAC35278} = "LLIAITYYNEDKVLTARTLHGVMQNPAWQKIVVCLVFDGIDPVLATIGV-VMKKDVDGKE";
	$alignment{AnCSMA} = "AMCLVTCYSEGEEGIRTTLDSIALTPN-SHKSIVVICDGIIKVLRMMRD-TGSKRHNMAK";
	$alignment{AfCHSF} = "ALCLVTCYSEGEEGIRTTLDSIAMTPN-SHKTIIVICDGIIKVLRMMRD-TGSKRHNMAK";
	$alignment{AAF19527} = "AILLVTAYSEGELGIRTTLDSIATTPN-SHKTILVICDGIIKVLGMMKD-RGSKRHNMAK";
	$alignment{"P30573-1"} = "TINLVTCYSEDEEGIRITLDSIATTPN-SHKLILVICDGIIKVLDMMSDAQGSKRHNMAK";

	## Loop through the sequences in the hash
	# For each sequence, get the number of residues and the number of gaps
	my $num_gaps = 0; # Total number of gapped columns
	my @gap_columns; # Store the indices of the columns with gaps
	foreach my $key (keys %alignment)
	{
	my @cols = split("",$alignment{$key});
	my $gaps = 0; # Number of gaps in this sequence
	for(my $i =0; $i < scalar @cols; $i++)
	{
	if($cols[$i] eq "-")
	{
	$gaps++;
	my $present = 0;
	foreach my $index (@gap_columns)
	{
	if($index == $i){$present = 1;} # If the index exists in the array
	}
	if(!$present){push(@gap_columns,$i);}
	}
	}
	print $key," ",length($alignment{$key})-$gaps,"\n"; # Answer to Q5
	}
	print "The number of gapped columns in this alignment is ".scalar(@gap_columns).".\n"; # Answer to Q6

	## Nested loops through the hash
	# Compare each sequence to the one after it (pairwise)
	my @a_keys = keys %alignment;
	for(my $fc = 0; $fc < scalar(@a_keys); $fc++) # fc = first counter
	{
	for(my $sc = $fc+1; $sc < scalar(@a_keys); $sc++) # sc = second counter
	{
	my @first = split("",$alignment{$a_keys[$fc]}); # First sequence
	my @second = split("",$alignment{$a_keys[$sc]}); # Second sequence
	my $pos = 0; # Position in the sequence
	my $ident = 0; # Num of identical residues
	while ($pos < scalar @first)
	{
	if($first[$pos] eq $second[$pos]){$ident++;}
	$pos++;
	}
	print "Identical residues between $a_keys[$fc] and $a_keys[$sc] is $ident.\n"; # Answer to Q7
	}
	}