Created
October 9, 2012 22:01
-
-
Save sahrendt0/3861709 to your computer and use it in GitHub Desktop.
GEN 220 Problem Set 2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
# Script: ps2a.pl | |
# Description: GEN220 homework set 2, questions 1-4 | |
# Author: Steven Ahrendt | |
# email: [email protected] | |
# Due: 10.9.12 | |
############################## | |
# [x]: Q1 (length of sequence & num of codons) | |
# [x]: Q2 (first start codon & length of 5' UTR) | |
# [x]: Q3 (first stop codon) | |
# [x]: Q4 (length of: CDS, predicted protein, & 3' UTR) | |
################################ | |
use strict; | |
my $infile = shift; | |
## Read in and process sequence | |
open(FAS,"<$infile") or die "Can't open file \"$infile\"...\n"; | |
my $seqlen = 0; | |
my $sequence = ""; | |
foreach my $line (<FAS>) | |
{ | |
next if ($line =~ m/^>/); | |
chomp $line; | |
$sequence = join("",$sequence,$line); | |
} | |
close(FAS); | |
$seqlen = length($sequence); | |
print "This sequence is of length $seqlen bp and there are ".($seqlen/3)." codons.\n"; # Answer to Q1 | |
## Q2/3/4: CDS and UTRs | |
my $ind = index($sequence, "ATG"); | |
my $start = $ind; # index of start codon | |
my $stop = -1; # index of stop codon (-1 for now) | |
while($ind < length($sequence)) | |
{ | |
my $codon = substr($sequence,$ind,3); | |
if(($codon eq "TAA") || ($codon eq "TAG") || ($codon eq "TGA")) | |
{ | |
$stop = ($ind+1); # +1 offset because sequences start at 1, not 0 | |
last; | |
} | |
$ind = $ind+3; | |
} | |
print "The first start codon is at basepair ".($start+1).", and the in-frame stop codon is at basepair $stop.\n"; | |
my $UTR5 = $start; # In this case, index of start codon also gives length of 5' UTR | |
my $UTR3 = $seqlen-($stop+3)+1; # +3 offset because 3' UTR does not contain stop codon | |
print "The 5'UTR is $UTR5 bp long and the 3' UTR is $UTR3 bp long.\n"; | |
my $CDS = $seqlen-$UTR5-$UTR3; | |
print "The CDS length is $CDS bp and the predicted protein length is ".(($CDS-3)/3).".\n"; # -3 offset because protein does not contain stop codon |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
# Script: ps2b.pl | |
# Description: GEN220 homework set 2, questions 5-7 | |
# Author: Steven Ahrendt | |
# email: [email protected] | |
# Due: 10.9.12 | |
############################## | |
# [x]: Q5 (alignment hash: name & length of seqs) | |
# [x]: Q6 (num of gapped columns in alignment) | |
# [x]: Q7 (identical residues b/w two seqs) | |
################################ | |
use strict; | |
## Q5/6/7: Alignment | |
my %alignment; | |
$alignment{AAC35278} = "LLIAITYYNEDKVLTARTLHGVMQNPAWQKIVVCLVFDGIDPVLATIGV-VMKKDVDGKE"; | |
$alignment{AnCSMA} = "AMCLVTCYSEGEEGIRTTLDSIALTPN-SHKSIVVICDGIIKVLRMMRD-TGSKRHNMAK"; | |
$alignment{AfCHSF} = "ALCLVTCYSEGEEGIRTTLDSIAMTPN-SHKTIIVICDGIIKVLRMMRD-TGSKRHNMAK"; | |
$alignment{AAF19527} = "AILLVTAYSEGELGIRTTLDSIATTPN-SHKTILVICDGIIKVLGMMKD-RGSKRHNMAK"; | |
$alignment{"P30573-1"} = "TINLVTCYSEDEEGIRITLDSIATTPN-SHKLILVICDGIIKVLDMMSDAQGSKRHNMAK"; | |
## Loop through the sequences in the hash | |
# For each sequence, get the number of residues and the number of gaps | |
my $num_gaps = 0; # Total number of gapped columns | |
my @gap_columns; # Store the indices of the columns with gaps | |
foreach my $key (keys %alignment) | |
{ | |
my @cols = split("",$alignment{$key}); | |
my $gaps = 0; # Number of gaps in this sequence | |
for(my $i =0; $i < scalar @cols; $i++) | |
{ | |
if($cols[$i] eq "-") | |
{ | |
$gaps++; | |
my $present = 0; | |
foreach my $index (@gap_columns) | |
{ | |
if($index == $i){$present = 1;} # If the index exists in the array | |
} | |
if(!$present){push(@gap_columns,$i);} | |
} | |
} | |
print $key," ",length($alignment{$key})-$gaps,"\n"; # Answer to Q5 | |
} | |
print "The number of gapped columns in this alignment is ".scalar(@gap_columns).".\n"; # Answer to Q6 | |
## Nested loops through the hash | |
# Compare each sequence to the one after it (pairwise) | |
my @a_keys = keys %alignment; | |
for(my $fc = 0; $fc < scalar(@a_keys); $fc++) # fc = first counter | |
{ | |
for(my $sc = $fc+1; $sc < scalar(@a_keys); $sc++) # sc = second counter | |
{ | |
my @first = split("",$alignment{$a_keys[$fc]}); # First sequence | |
my @second = split("",$alignment{$a_keys[$sc]}); # Second sequence | |
my $pos = 0; # Position in the sequence | |
my $ident = 0; # Num of identical residues | |
while ($pos < scalar @first) | |
{ | |
if($first[$pos] eq $second[$pos]){$ident++;} | |
$pos++; | |
} | |
print "Identical residues between $a_keys[$fc] and $a_keys[$sc] is $ident.\n"; # Answer to Q7 | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment