lukem512 · July 12, 2016 10:25
diff --git a/midtermproj.m b/midtermproj.m
 %% Luke Mitchell, April 2016

 %% Question 3

 p.('a') = 1/2;
 p.('c') = 1/6;
 p.('g') = 1/6;
 p.('t') = 1/6;

 seq = 'acgtacgtacgt';

 product = 1;
 for i = 1:length(seq)
    product = product * p.(seq(i));
 end
 product

 %% Question 6

 % Aardvark mitochondrial genome
 mt = getgenbank('Y18475');
 geneticcode = 'Vertebrate Mitochondrial';

 % Count the bases, plot as pie chart
 figure
 basecount(mt, 'chart', 'pie');
 title('Distribution of NT Bases for Aardvark Mitochondrial Genome');

 % Count the dimers, plot as bar chart
 figure
 dimercount(mt, 'chart', 'bar');
 title('Dimer Count for Aardvark Mitochondrial Genome');

 % Plot the nucleotide density
 ntdensity(mt)

 %% Question 7

 % Create a random permutation of the sequence
 randmt = randseq(length(mt.Sequence), 'FROMSTRUCTURE', basecount(mt));

 % Find all ORFs from both random and original sequence
 frames = [1, 2, 3, -1, -2, -3,];
 randorfs = seqshoworfs(randmt,  'GENETICCODE', geneticcode, 'MINIMUMLENGTH', 1, 'FRAMES', frames);
 orfs = seqshoworfs(mt, 'GENETICCODE', geneticcode, 'MINIMUMLENGTH', 1, 'FRAMES', frames);

 % Find the lengths of all ORFs
 ORFLength = [];
 randORFLength = [];
 for i = 1:length(frames)
    % Original sequence
    start = orfs(i).Start;
    stop = orfs(i).Stop;
    for j = 1:length(start)
        ORFLength = [ORFLength; stop(j) - start(j)];
    end
    
    % Random sequence
    start = randorfs(i).Start;
    stop = randorfs(i).Stop;
    for j = 1:length(start)
        randORFLength = [randORFLength; stop(j) - start(j)];
    end
 end

 % Compare random and original ORFs
 max_threshold = max(randORFLength)
 n_max = length(find(ORFLength >= max_threshold))

 empirical_threshold = ceil(prctile(randORFLength, 95))
 n_empirical = length(find(ORFLength >= empirical_threshold))

 disp(['Using minimum length of ' num2str(empirical_threshold)])

 % Find the open reading frames
 orfs = seqshoworfs(mt, 'GENETICCODE', geneticcode, 'MINIMUMLENGTH', empirical_threshold, 'FRAMES', frames)

 % Extract the frame data
 for i = 1:length(frames)
    disp(['Frame ' num2str(frames(i))]);
    start = orfs(i).Start;
    stop = orfs(i).Stop;
    for j = 1:length(start)
        disp([num2str(start(j)) ' to ' num2str(stop(j)) ': ' mt.Sequence(start(j):stop(j))])
    end
    disp(' ')
 end

 % Show annotations from genbank
 figure
 [h,l] = featuresmap(mt,{'CDS','tRNA','rRNA','D_loop'},[2 1 2 2 2],'Fontsize',9);
 legend(h,l,'interpreter','none');
 title('Annotated Aardvark Mitochondrial Genome');

 %% Question 8

 % A protein, found in Question 7
 protseq = 'atgtcagtagccctattcgtaacatgatcaattatagagttttcactatgatatatacactcagaccctaacattaaccgcttcttcaaatatttactcttattcctaatcaccataataattctagtaacagctaacaacctacttcagctgtttattgggtgagaaggagtaggaatcatatcatttatactcattggctgatgatcaggacgaactgatgccaatacagctgctctacaggcagttctctataaccgaatcggggacattggctttgttttatcaatagcctgatttatagtccactctaactcatgagaacttcaacaaatttttattacatttccaatagaaaattcactaccccttctaggcctaattctagctgcaacaggtaaatcggcccaatttggacttcatccatgactgccagcagcaatagaaggtcctacaccagtctcagccctactacactccagtactatagttgtagcagggatcttcctacttattcgattccaccctatcctagaaaacaacagcactgccctaacatctatcctatgtatcggcgcaattaccaccctatttacagccctatgtgctcttacgcaaaacgacattaaaaaaatcgtagcattctcaacatcaagtcaattaggcctaataatagtaactattggaatcaatcaaccctacctggccttcatacatatttgcacgcatgccttcttcaaagcaatattatttctatgctcaggctctatcattcatagcctaaatgacgagcaagacatccgaaaaatgggcggcttgtacaaatcccttccatttacaacaacctcactaattgtaggaagcttagcactaaccggaataccattcctaacaggattttattcaaaagacctaattatcgaatcagcaacttcatcctacaccaacgcctgagccctaaccattaccctaatcgctacatccataacagcagtatatagcacccgaattatcttcttcaccctactagggcaaccacgatcaccttcacccatcataattaacgagaataacccagcccttattaactcaatcaaacgtcttaccattggaagtatctttggcggctacctaatctccataaacctacttcctacttcaattccacctataactataccaccatacattaaatatacagcactctcaattacattaattggttttatactagcaatagaactaaactcccttacctacaaccttaaatttaagtaccctgcccaccaatttaaattctctaacatactaggatattacccctctattatacaccgcatccccccattatccttattaactattggtcaaaaactagcaactaccatccttgatctcatctgacttgaaaaagctattccaaaaaacattacatcaataaacctcatattatctatcaaaacctccaaccaaaaaggatcaatcaaactatattttctctctttctttatctccctaatcataggacttttaatctccatt';
 frame = 1;

 % What's the probability that it occured by chance?
 % Using a random sequence model
 pvalue = (61/64)^length(protseq)

 % Using details of codon distribution
 [codons, carray] = codoncount(mt, 'GENETICCODE', geneticcode, 'FRAME', frame);

 count = 0;
 for i = 1:4
    for j = 1:4
        for k = 1:4
            count = count + carray(i,j,k);
        end
    end
 end

 pstop = (codons.TAA / count) + (codons.TAG / count) + (codons.TGA / count)
 pvalue = (1-pstop)^length(protseq)

 %% Question 9

 % Amino acid string
 aa = nt2aa(protseq, 'GENETICCODE', geneticcode, 'FRAME', frame);

 % Retrieve top 5 hits from the BLAST
 ourprot = getgenpept('NP_008585');
 prot1 = getgenpept('YP_220703');
 prot2 = getgenpept('AAL79372');
 prot3 = getgenpept('ACZ29036');
 prot4 = getgenpept('AJA05049');
 prot5 = getgenpept('YP_007626040');

 % Make a multiple alignment with the proteins
 ma  = multialign([ourprot, prot1, prot2, prot3, prot4, prot5]);
 seqalignviewer(ma)

 %% Question 10

 % Homo sapiens mitochondrion, complete genome
 humanmt = getgenbank('NC_012920');

 % Local alignment using Smith-Waterman
 [score, alignment] = swalign(protseq, humanmt, 'ALPHABET', 'NT', 'SCORINGMATRIX', 'BLOSUM62');
 showalignment(alignment);

 % Create a random sequence for comparison
 rands = randseq(length(protseq), 'FROMSTRUCTURE', basecount(protseq));
 [score, alignment] = swalign(rands, humanmt, 'ALPHABET', 'NT', 'SCORINGMATRIX', 'BLOSUM62');
 showalignment(alignment);

 %% Question 11

 % For all coding regions in Human mtDNA
 for i = 1:length(humanmt.CDS)
    idx = humanmt.CDS(i).indices;
    humanreg = humanmt.Sequence(idx(1):idx(2));

    % Score against all ORFs in Aardvark mtDNA
    if (length(humanreg) > 0)
        for j = 1:length(frames)
            start = orfs(j).Start;
            stop = orfs(j).Stop;
            for k = 1:length(start)
                mtreg = mt.Sequence(start(k):stop(k));

                % Score randomly permuted regions against one another
                scores = [];
                numrand = 100;
                for i = 1:numrand
                    randhuman = randseq(length(humanreg), 'FROMSTRUCTURE', basecount(humanreg));
                    randmt = randseq(length(mtreg), 'FROMSTRUCTURE', basecount(mtreg));
                    [score, ~] = swalign(randmt, randhuman, 'ALPHABET', 'AA', 'SCORINGMATRIX', 'BLOSUM62');
                    scores = [scores; score];
                end
                max_random_score = max(scores);
                threshold_score = prctile(scores, 99);
                
                % Score the original sequences
                [score, alignment] = swalign(mtreg, humanreg, 'ALPHABET', 'AA', 'SCORINGMATRIX', 'BLOSUM62');
                
                % figure;
                % histogram(scores);
                % title('Alignment Scores of Randomly Permuted Coding Regions');
                
                if (score > threshold_score)
                    disp(' ');
                    disp(alignment);
                    pval = length(find(scores >= score)) / numrand;
                    disp(['Frame: ' num2str(frames(j)) ', Human: ' num2str(idx(1)) ' to ' num2str(idx(2)) ', Aardvark: ' num2str(start(k)) ' to ' num2str(stop(k)) ', p-value: ' num2str(pval)]);
                end
            end
        end
    end
 end

 %% Question 12

 aa = nt2aa(mt.Sequence,'GENETICCODE', geneticcode);
 humanaa = nt2aa(humanmt.Sequence, 'GENETICCODE', geneticcode, 'ACGTOnly', false);

 % Global alignment using Needleman-Wunsch
 [score, alignment] = nwalign(aa, humanaa, 'ALPHABET', 'AA', 'SCORINGMATRIX', 'BLOSUM62');
 showalignment(alignment);
	%% Luke Mitchell, April 2016

	%% Question 3

	p.('a') = 1/2;
	p.('c') = 1/6;
	p.('g') = 1/6;
	p.('t') = 1/6;

	seq = 'acgtacgtacgt';

	product = 1;
	for i = 1:length(seq)
	product = product * p.(seq(i));
	end
	product

	%% Question 6

	% Aardvark mitochondrial genome
	mt = getgenbank('Y18475');
	geneticcode = 'Vertebrate Mitochondrial';

	% Count the bases, plot as pie chart
	figure
	basecount(mt, 'chart', 'pie');
	title('Distribution of NT Bases for Aardvark Mitochondrial Genome');

	% Count the dimers, plot as bar chart
	figure
	dimercount(mt, 'chart', 'bar');
	title('Dimer Count for Aardvark Mitochondrial Genome');

	% Plot the nucleotide density
	ntdensity(mt)

	%% Question 7

	% Create a random permutation of the sequence
	randmt = randseq(length(mt.Sequence), 'FROMSTRUCTURE', basecount(mt));

	% Find all ORFs from both random and original sequence
	frames = [1, 2, 3, -1, -2, -3,];
	randorfs = seqshoworfs(randmt, 'GENETICCODE', geneticcode, 'MINIMUMLENGTH', 1, 'FRAMES', frames);
	orfs = seqshoworfs(mt, 'GENETICCODE', geneticcode, 'MINIMUMLENGTH', 1, 'FRAMES', frames);

	% Find the lengths of all ORFs
	ORFLength = [];
	randORFLength = [];
	for i = 1:length(frames)
	% Original sequence
	start = orfs(i).Start;
	stop = orfs(i).Stop;
	for j = 1:length(start)
	ORFLength = [ORFLength; stop(j) - start(j)];
	end

	% Random sequence
	start = randorfs(i).Start;
	stop = randorfs(i).Stop;
	for j = 1:length(start)
	randORFLength = [randORFLength; stop(j) - start(j)];
	end
	end

	% Compare random and original ORFs
	max_threshold = max(randORFLength)
	n_max = length(find(ORFLength >= max_threshold))

	empirical_threshold = ceil(prctile(randORFLength, 95))
	n_empirical = length(find(ORFLength >= empirical_threshold))

	disp(['Using minimum length of ' num2str(empirical_threshold)])

	% Find the open reading frames
	orfs = seqshoworfs(mt, 'GENETICCODE', geneticcode, 'MINIMUMLENGTH', empirical_threshold, 'FRAMES', frames)

	% Extract the frame data
	for i = 1:length(frames)
	disp(['Frame ' num2str(frames(i))]);
	start = orfs(i).Start;
	stop = orfs(i).Stop;
	for j = 1:length(start)
	disp([num2str(start(j)) ' to ' num2str(stop(j)) ': ' mt.Sequence(start(j):stop(j))])
	end
	disp(' ')
	end

	% Show annotations from genbank
	figure
	[h,l] = featuresmap(mt,{'CDS','tRNA','rRNA','D_loop'},[2 1 2 2 2],'Fontsize',9);
	legend(h,l,'interpreter','none');
	title('Annotated Aardvark Mitochondrial Genome');

	%% Question 8

	% A protein, found in Question 7
	protseq = 'atgtcagtagccctattcgtaacatgatcaattatagagttttcactatgatatatacactcagaccctaacattaaccgcttcttcaaatatttactcttattcctaatcaccataataattctagtaacagctaacaacctacttcagctgtttattgggtgagaaggagtaggaatcatatcatttatactcattggctgatgatcaggacgaactgatgccaatacagctgctctacaggcagttctctataaccgaatcggggacattggctttgttttatcaatagcctgatttatagtccactctaactcatgagaacttcaacaaatttttattacatttccaatagaaaattcactaccccttctaggcctaattctagctgcaacaggtaaatcggcccaatttggacttcatccatgactgccagcagcaatagaaggtcctacaccagtctcagccctactacactccagtactatagttgtagcagggatcttcctacttattcgattccaccctatcctagaaaacaacagcactgccctaacatctatcctatgtatcggcgcaattaccaccctatttacagccctatgtgctcttacgcaaaacgacattaaaaaaatcgtagcattctcaacatcaagtcaattaggcctaataatagtaactattggaatcaatcaaccctacctggccttcatacatatttgcacgcatgccttcttcaaagcaatattatttctatgctcaggctctatcattcatagcctaaatgacgagcaagacatccgaaaaatgggcggcttgtacaaatcccttccatttacaacaacctcactaattgtaggaagcttagcactaaccggaataccattcctaacaggattttattcaaaagacctaattatcgaatcagcaacttcatcctacaccaacgcctgagccctaaccattaccctaatcgctacatccataacagcagtatatagcacccgaattatcttcttcaccctactagggcaaccacgatcaccttcacccatcataattaacgagaataacccagcccttattaactcaatcaaacgtcttaccattggaagtatctttggcggctacctaatctccataaacctacttcctacttcaattccacctataactataccaccatacattaaatatacagcactctcaattacattaattggttttatactagcaatagaactaaactcccttacctacaaccttaaatttaagtaccctgcccaccaatttaaattctctaacatactaggatattacccctctattatacaccgcatccccccattatccttattaactattggtcaaaaactagcaactaccatccttgatctcatctgacttgaaaaagctattccaaaaaacattacatcaataaacctcatattatctatcaaaacctccaaccaaaaaggatcaatcaaactatattttctctctttctttatctccctaatcataggacttttaatctccatt';
	frame = 1;

	% What's the probability that it occured by chance?
	% Using a random sequence model
	pvalue = (61/64)^length(protseq)

	% Using details of codon distribution
	[codons, carray] = codoncount(mt, 'GENETICCODE', geneticcode, 'FRAME', frame);

	count = 0;
	for i = 1:4
	for j = 1:4
	for k = 1:4
	count = count + carray(i,j,k);
	end
	end
	end

	pstop = (codons.TAA / count) + (codons.TAG / count) + (codons.TGA / count)
	pvalue = (1-pstop)^length(protseq)

	%% Question 9

	% Amino acid string
	aa = nt2aa(protseq, 'GENETICCODE', geneticcode, 'FRAME', frame);

	% Retrieve top 5 hits from the BLAST
	ourprot = getgenpept('NP_008585');
	prot1 = getgenpept('YP_220703');
	prot2 = getgenpept('AAL79372');
	prot3 = getgenpept('ACZ29036');
	prot4 = getgenpept('AJA05049');
	prot5 = getgenpept('YP_007626040');

	% Make a multiple alignment with the proteins
	ma = multialign([ourprot, prot1, prot2, prot3, prot4, prot5]);
	seqalignviewer(ma)

	%% Question 10

	% Homo sapiens mitochondrion, complete genome
	humanmt = getgenbank('NC_012920');

	% Local alignment using Smith-Waterman
	[score, alignment] = swalign(protseq, humanmt, 'ALPHABET', 'NT', 'SCORINGMATRIX', 'BLOSUM62');
	showalignment(alignment);

	% Create a random sequence for comparison
	rands = randseq(length(protseq), 'FROMSTRUCTURE', basecount(protseq));
	[score, alignment] = swalign(rands, humanmt, 'ALPHABET', 'NT', 'SCORINGMATRIX', 'BLOSUM62');
	showalignment(alignment);

	%% Question 11

	% For all coding regions in Human mtDNA
	for i = 1:length(humanmt.CDS)
	idx = humanmt.CDS(i).indices;
	humanreg = humanmt.Sequence(idx(1):idx(2));

	% Score against all ORFs in Aardvark mtDNA
	if (length(humanreg) > 0)
	for j = 1:length(frames)
	start = orfs(j).Start;
	stop = orfs(j).Stop;
	for k = 1:length(start)
	mtreg = mt.Sequence(start(k):stop(k));

	% Score randomly permuted regions against one another
	scores = [];
	numrand = 100;
	for i = 1:numrand
	randhuman = randseq(length(humanreg), 'FROMSTRUCTURE', basecount(humanreg));
	randmt = randseq(length(mtreg), 'FROMSTRUCTURE', basecount(mtreg));
	[score, ~] = swalign(randmt, randhuman, 'ALPHABET', 'AA', 'SCORINGMATRIX', 'BLOSUM62');
	scores = [scores; score];
	end
	max_random_score = max(scores);
	threshold_score = prctile(scores, 99);

	% Score the original sequences
	[score, alignment] = swalign(mtreg, humanreg, 'ALPHABET', 'AA', 'SCORINGMATRIX', 'BLOSUM62');

	% figure;
	% histogram(scores);
	% title('Alignment Scores of Randomly Permuted Coding Regions');

	if (score > threshold_score)
	disp(' ');
	disp(alignment);
	pval = length(find(scores >= score)) / numrand;
	disp(['Frame: ' num2str(frames(j)) ', Human: ' num2str(idx(1)) ' to ' num2str(idx(2)) ', Aardvark: ' num2str(start(k)) ' to ' num2str(stop(k)) ', p-value: ' num2str(pval)]);
	end
	end
	end
	end
	end

	%% Question 12

	aa = nt2aa(mt.Sequence,'GENETICCODE', geneticcode);
	humanaa = nt2aa(humanmt.Sequence, 'GENETICCODE', geneticcode, 'ACGTOnly', false);

	% Global alignment using Needleman-Wunsch
	[score, alignment] = nwalign(aa, humanaa, 'ALPHABET', 'AA', 'SCORINGMATRIX', 'BLOSUM62');
	showalignment(alignment);