Last active
July 12, 2016 10:25
-
-
Save lukem512/4081a524519d5571966f324c33dc857f to your computer and use it in GitHub Desktop.
Genetic Algorithms: Mid-term Project
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%% Luke Mitchell, April 2016 | |
%% Question 3 | |
p.('a') = 1/2; | |
p.('c') = 1/6; | |
p.('g') = 1/6; | |
p.('t') = 1/6; | |
seq = 'acgtacgtacgt'; | |
product = 1; | |
for i = 1:length(seq) | |
product = product * p.(seq(i)); | |
end | |
product | |
%% Question 6 | |
% Aardvark mitochondrial genome | |
mt = getgenbank('Y18475'); | |
geneticcode = 'Vertebrate Mitochondrial'; | |
% Count the bases, plot as pie chart | |
figure | |
basecount(mt, 'chart', 'pie'); | |
title('Distribution of NT Bases for Aardvark Mitochondrial Genome'); | |
% Count the dimers, plot as bar chart | |
figure | |
dimercount(mt, 'chart', 'bar'); | |
title('Dimer Count for Aardvark Mitochondrial Genome'); | |
% Plot the nucleotide density | |
ntdensity(mt) | |
%% Question 7 | |
% Create a random permutation of the sequence | |
randmt = randseq(length(mt.Sequence), 'FROMSTRUCTURE', basecount(mt)); | |
% Find all ORFs from both random and original sequence | |
frames = [1, 2, 3, -1, -2, -3,]; | |
randorfs = seqshoworfs(randmt, 'GENETICCODE', geneticcode, 'MINIMUMLENGTH', 1, 'FRAMES', frames); | |
orfs = seqshoworfs(mt, 'GENETICCODE', geneticcode, 'MINIMUMLENGTH', 1, 'FRAMES', frames); | |
% Find the lengths of all ORFs | |
ORFLength = []; | |
randORFLength = []; | |
for i = 1:length(frames) | |
% Original sequence | |
start = orfs(i).Start; | |
stop = orfs(i).Stop; | |
for j = 1:length(start) | |
ORFLength = [ORFLength; stop(j) - start(j)]; | |
end | |
% Random sequence | |
start = randorfs(i).Start; | |
stop = randorfs(i).Stop; | |
for j = 1:length(start) | |
randORFLength = [randORFLength; stop(j) - start(j)]; | |
end | |
end | |
% Compare random and original ORFs | |
max_threshold = max(randORFLength) | |
n_max = length(find(ORFLength >= max_threshold)) | |
empirical_threshold = ceil(prctile(randORFLength, 95)) | |
n_empirical = length(find(ORFLength >= empirical_threshold)) | |
disp(['Using minimum length of ' num2str(empirical_threshold)]) | |
% Find the open reading frames | |
orfs = seqshoworfs(mt, 'GENETICCODE', geneticcode, 'MINIMUMLENGTH', empirical_threshold, 'FRAMES', frames) | |
% Extract the frame data | |
for i = 1:length(frames) | |
disp(['Frame ' num2str(frames(i))]); | |
start = orfs(i).Start; | |
stop = orfs(i).Stop; | |
for j = 1:length(start) | |
disp([num2str(start(j)) ' to ' num2str(stop(j)) ': ' mt.Sequence(start(j):stop(j))]) | |
end | |
disp(' ') | |
end | |
% Show annotations from genbank | |
figure | |
[h,l] = featuresmap(mt,{'CDS','tRNA','rRNA','D_loop'},[2 1 2 2 2],'Fontsize',9); | |
legend(h,l,'interpreter','none'); | |
title('Annotated Aardvark Mitochondrial Genome'); | |
%% Question 8 | |
% A protein, found in Question 7 | |
protseq = 'atgtcagtagccctattcgtaacatgatcaattatagagttttcactatgatatatacactcagaccctaacattaaccgcttcttcaaatatttactcttattcctaatcaccataataattctagtaacagctaacaacctacttcagctgtttattgggtgagaaggagtaggaatcatatcatttatactcattggctgatgatcaggacgaactgatgccaatacagctgctctacaggcagttctctataaccgaatcggggacattggctttgttttatcaatagcctgatttatagtccactctaactcatgagaacttcaacaaatttttattacatttccaatagaaaattcactaccccttctaggcctaattctagctgcaacaggtaaatcggcccaatttggacttcatccatgactgccagcagcaatagaaggtcctacaccagtctcagccctactacactccagtactatagttgtagcagggatcttcctacttattcgattccaccctatcctagaaaacaacagcactgccctaacatctatcctatgtatcggcgcaattaccaccctatttacagccctatgtgctcttacgcaaaacgacattaaaaaaatcgtagcattctcaacatcaagtcaattaggcctaataatagtaactattggaatcaatcaaccctacctggccttcatacatatttgcacgcatgccttcttcaaagcaatattatttctatgctcaggctctatcattcatagcctaaatgacgagcaagacatccgaaaaatgggcggcttgtacaaatcccttccatttacaacaacctcactaattgtaggaagcttagcactaaccggaataccattcctaacaggattttattcaaaagacctaattatcgaatcagcaacttcatcctacaccaacgcctgagccctaaccattaccctaatcgctacatccataacagcagtatatagcacccgaattatcttcttcaccctactagggcaaccacgatcaccttcacccatcataattaacgagaataacccagcccttattaactcaatcaaacgtcttaccattggaagtatctttggcggctacctaatctccataaacctacttcctacttcaattccacctataactataccaccatacattaaatatacagcactctcaattacattaattggttttatactagcaatagaactaaactcccttacctacaaccttaaatttaagtaccctgcccaccaatttaaattctctaacatactaggatattacccctctattatacaccgcatccccccattatccttattaactattggtcaaaaactagcaactaccatccttgatctcatctgacttgaaaaagctattccaaaaaacattacatcaataaacctcatattatctatcaaaacctccaaccaaaaaggatcaatcaaactatattttctctctttctttatctccctaatcataggacttttaatctccatt'; | |
frame = 1; | |
% What's the probability that it occured by chance? | |
% Using a random sequence model | |
pvalue = (61/64)^length(protseq) | |
% Using details of codon distribution | |
[codons, carray] = codoncount(mt, 'GENETICCODE', geneticcode, 'FRAME', frame); | |
count = 0; | |
for i = 1:4 | |
for j = 1:4 | |
for k = 1:4 | |
count = count + carray(i,j,k); | |
end | |
end | |
end | |
pstop = (codons.TAA / count) + (codons.TAG / count) + (codons.TGA / count) | |
pvalue = (1-pstop)^length(protseq) | |
%% Question 9 | |
% Amino acid string | |
aa = nt2aa(protseq, 'GENETICCODE', geneticcode, 'FRAME', frame); | |
% Retrieve top 5 hits from the BLAST | |
ourprot = getgenpept('NP_008585'); | |
prot1 = getgenpept('YP_220703'); | |
prot2 = getgenpept('AAL79372'); | |
prot3 = getgenpept('ACZ29036'); | |
prot4 = getgenpept('AJA05049'); | |
prot5 = getgenpept('YP_007626040'); | |
% Make a multiple alignment with the proteins | |
ma = multialign([ourprot, prot1, prot2, prot3, prot4, prot5]); | |
seqalignviewer(ma) | |
%% Question 10 | |
% Homo sapiens mitochondrion, complete genome | |
humanmt = getgenbank('NC_012920'); | |
% Local alignment using Smith-Waterman | |
[score, alignment] = swalign(protseq, humanmt, 'ALPHABET', 'NT', 'SCORINGMATRIX', 'BLOSUM62'); | |
showalignment(alignment); | |
% Create a random sequence for comparison | |
rands = randseq(length(protseq), 'FROMSTRUCTURE', basecount(protseq)); | |
[score, alignment] = swalign(rands, humanmt, 'ALPHABET', 'NT', 'SCORINGMATRIX', 'BLOSUM62'); | |
showalignment(alignment); | |
%% Question 11 | |
% For all coding regions in Human mtDNA | |
for i = 1:length(humanmt.CDS) | |
idx = humanmt.CDS(i).indices; | |
humanreg = humanmt.Sequence(idx(1):idx(2)); | |
% Score against all ORFs in Aardvark mtDNA | |
if (length(humanreg) > 0) | |
for j = 1:length(frames) | |
start = orfs(j).Start; | |
stop = orfs(j).Stop; | |
for k = 1:length(start) | |
mtreg = mt.Sequence(start(k):stop(k)); | |
% Score randomly permuted regions against one another | |
scores = []; | |
numrand = 100; | |
for i = 1:numrand | |
randhuman = randseq(length(humanreg), 'FROMSTRUCTURE', basecount(humanreg)); | |
randmt = randseq(length(mtreg), 'FROMSTRUCTURE', basecount(mtreg)); | |
[score, ~] = swalign(randmt, randhuman, 'ALPHABET', 'AA', 'SCORINGMATRIX', 'BLOSUM62'); | |
scores = [scores; score]; | |
end | |
max_random_score = max(scores); | |
threshold_score = prctile(scores, 99); | |
% Score the original sequences | |
[score, alignment] = swalign(mtreg, humanreg, 'ALPHABET', 'AA', 'SCORINGMATRIX', 'BLOSUM62'); | |
% figure; | |
% histogram(scores); | |
% title('Alignment Scores of Randomly Permuted Coding Regions'); | |
if (score > threshold_score) | |
disp(' '); | |
disp(alignment); | |
pval = length(find(scores >= score)) / numrand; | |
disp(['Frame: ' num2str(frames(j)) ', Human: ' num2str(idx(1)) ' to ' num2str(idx(2)) ', Aardvark: ' num2str(start(k)) ' to ' num2str(stop(k)) ', p-value: ' num2str(pval)]); | |
end | |
end | |
end | |
end | |
end | |
%% Question 12 | |
aa = nt2aa(mt.Sequence,'GENETICCODE', geneticcode); | |
humanaa = nt2aa(humanmt.Sequence, 'GENETICCODE', geneticcode, 'ACGTOnly', false); | |
% Global alignment using Needleman-Wunsch | |
[score, alignment] = nwalign(aa, humanaa, 'ALPHABET', 'AA', 'SCORINGMATRIX', 'BLOSUM62'); | |
showalignment(alignment); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment