Skip to content

Instantly share code, notes, and snippets.

@lukem512
Last active July 12, 2016 10:25
Show Gist options
  • Save lukem512/4081a524519d5571966f324c33dc857f to your computer and use it in GitHub Desktop.
Save lukem512/4081a524519d5571966f324c33dc857f to your computer and use it in GitHub Desktop.
Genetic Algorithms: Mid-term Project
%% Luke Mitchell, April 2016
%% Question 3
p.('a') = 1/2;
p.('c') = 1/6;
p.('g') = 1/6;
p.('t') = 1/6;
seq = 'acgtacgtacgt';
product = 1;
for i = 1:length(seq)
product = product * p.(seq(i));
end
product
%% Question 6
% Aardvark mitochondrial genome
mt = getgenbank('Y18475');
geneticcode = 'Vertebrate Mitochondrial';
% Count the bases, plot as pie chart
figure
basecount(mt, 'chart', 'pie');
title('Distribution of NT Bases for Aardvark Mitochondrial Genome');
% Count the dimers, plot as bar chart
figure
dimercount(mt, 'chart', 'bar');
title('Dimer Count for Aardvark Mitochondrial Genome');
% Plot the nucleotide density
ntdensity(mt)
%% Question 7
% Create a random permutation of the sequence
randmt = randseq(length(mt.Sequence), 'FROMSTRUCTURE', basecount(mt));
% Find all ORFs from both random and original sequence
frames = [1, 2, 3, -1, -2, -3,];
randorfs = seqshoworfs(randmt, 'GENETICCODE', geneticcode, 'MINIMUMLENGTH', 1, 'FRAMES', frames);
orfs = seqshoworfs(mt, 'GENETICCODE', geneticcode, 'MINIMUMLENGTH', 1, 'FRAMES', frames);
% Find the lengths of all ORFs
ORFLength = [];
randORFLength = [];
for i = 1:length(frames)
% Original sequence
start = orfs(i).Start;
stop = orfs(i).Stop;
for j = 1:length(start)
ORFLength = [ORFLength; stop(j) - start(j)];
end
% Random sequence
start = randorfs(i).Start;
stop = randorfs(i).Stop;
for j = 1:length(start)
randORFLength = [randORFLength; stop(j) - start(j)];
end
end
% Compare random and original ORFs
max_threshold = max(randORFLength)
n_max = length(find(ORFLength >= max_threshold))
empirical_threshold = ceil(prctile(randORFLength, 95))
n_empirical = length(find(ORFLength >= empirical_threshold))
disp(['Using minimum length of ' num2str(empirical_threshold)])
% Find the open reading frames
orfs = seqshoworfs(mt, 'GENETICCODE', geneticcode, 'MINIMUMLENGTH', empirical_threshold, 'FRAMES', frames)
% Extract the frame data
for i = 1:length(frames)
disp(['Frame ' num2str(frames(i))]);
start = orfs(i).Start;
stop = orfs(i).Stop;
for j = 1:length(start)
disp([num2str(start(j)) ' to ' num2str(stop(j)) ': ' mt.Sequence(start(j):stop(j))])
end
disp(' ')
end
% Show annotations from genbank
figure
[h,l] = featuresmap(mt,{'CDS','tRNA','rRNA','D_loop'},[2 1 2 2 2],'Fontsize',9);
legend(h,l,'interpreter','none');
title('Annotated Aardvark Mitochondrial Genome');
%% Question 8
% A protein, found in Question 7
protseq = 'atgtcagtagccctattcgtaacatgatcaattatagagttttcactatgatatatacactcagaccctaacattaaccgcttcttcaaatatttactcttattcctaatcaccataataattctagtaacagctaacaacctacttcagctgtttattgggtgagaaggagtaggaatcatatcatttatactcattggctgatgatcaggacgaactgatgccaatacagctgctctacaggcagttctctataaccgaatcggggacattggctttgttttatcaatagcctgatttatagtccactctaactcatgagaacttcaacaaatttttattacatttccaatagaaaattcactaccccttctaggcctaattctagctgcaacaggtaaatcggcccaatttggacttcatccatgactgccagcagcaatagaaggtcctacaccagtctcagccctactacactccagtactatagttgtagcagggatcttcctacttattcgattccaccctatcctagaaaacaacagcactgccctaacatctatcctatgtatcggcgcaattaccaccctatttacagccctatgtgctcttacgcaaaacgacattaaaaaaatcgtagcattctcaacatcaagtcaattaggcctaataatagtaactattggaatcaatcaaccctacctggccttcatacatatttgcacgcatgccttcttcaaagcaatattatttctatgctcaggctctatcattcatagcctaaatgacgagcaagacatccgaaaaatgggcggcttgtacaaatcccttccatttacaacaacctcactaattgtaggaagcttagcactaaccggaataccattcctaacaggattttattcaaaagacctaattatcgaatcagcaacttcatcctacaccaacgcctgagccctaaccattaccctaatcgctacatccataacagcagtatatagcacccgaattatcttcttcaccctactagggcaaccacgatcaccttcacccatcataattaacgagaataacccagcccttattaactcaatcaaacgtcttaccattggaagtatctttggcggctacctaatctccataaacctacttcctacttcaattccacctataactataccaccatacattaaatatacagcactctcaattacattaattggttttatactagcaatagaactaaactcccttacctacaaccttaaatttaagtaccctgcccaccaatttaaattctctaacatactaggatattacccctctattatacaccgcatccccccattatccttattaactattggtcaaaaactagcaactaccatccttgatctcatctgacttgaaaaagctattccaaaaaacattacatcaataaacctcatattatctatcaaaacctccaaccaaaaaggatcaatcaaactatattttctctctttctttatctccctaatcataggacttttaatctccatt';
frame = 1;
% What's the probability that it occured by chance?
% Using a random sequence model
pvalue = (61/64)^length(protseq)
% Using details of codon distribution
[codons, carray] = codoncount(mt, 'GENETICCODE', geneticcode, 'FRAME', frame);
count = 0;
for i = 1:4
for j = 1:4
for k = 1:4
count = count + carray(i,j,k);
end
end
end
pstop = (codons.TAA / count) + (codons.TAG / count) + (codons.TGA / count)
pvalue = (1-pstop)^length(protseq)
%% Question 9
% Amino acid string
aa = nt2aa(protseq, 'GENETICCODE', geneticcode, 'FRAME', frame);
% Retrieve top 5 hits from the BLAST
ourprot = getgenpept('NP_008585');
prot1 = getgenpept('YP_220703');
prot2 = getgenpept('AAL79372');
prot3 = getgenpept('ACZ29036');
prot4 = getgenpept('AJA05049');
prot5 = getgenpept('YP_007626040');
% Make a multiple alignment with the proteins
ma = multialign([ourprot, prot1, prot2, prot3, prot4, prot5]);
seqalignviewer(ma)
%% Question 10
% Homo sapiens mitochondrion, complete genome
humanmt = getgenbank('NC_012920');
% Local alignment using Smith-Waterman
[score, alignment] = swalign(protseq, humanmt, 'ALPHABET', 'NT', 'SCORINGMATRIX', 'BLOSUM62');
showalignment(alignment);
% Create a random sequence for comparison
rands = randseq(length(protseq), 'FROMSTRUCTURE', basecount(protseq));
[score, alignment] = swalign(rands, humanmt, 'ALPHABET', 'NT', 'SCORINGMATRIX', 'BLOSUM62');
showalignment(alignment);
%% Question 11
% For all coding regions in Human mtDNA
for i = 1:length(humanmt.CDS)
idx = humanmt.CDS(i).indices;
humanreg = humanmt.Sequence(idx(1):idx(2));
% Score against all ORFs in Aardvark mtDNA
if (length(humanreg) > 0)
for j = 1:length(frames)
start = orfs(j).Start;
stop = orfs(j).Stop;
for k = 1:length(start)
mtreg = mt.Sequence(start(k):stop(k));
% Score randomly permuted regions against one another
scores = [];
numrand = 100;
for i = 1:numrand
randhuman = randseq(length(humanreg), 'FROMSTRUCTURE', basecount(humanreg));
randmt = randseq(length(mtreg), 'FROMSTRUCTURE', basecount(mtreg));
[score, ~] = swalign(randmt, randhuman, 'ALPHABET', 'AA', 'SCORINGMATRIX', 'BLOSUM62');
scores = [scores; score];
end
max_random_score = max(scores);
threshold_score = prctile(scores, 99);
% Score the original sequences
[score, alignment] = swalign(mtreg, humanreg, 'ALPHABET', 'AA', 'SCORINGMATRIX', 'BLOSUM62');
% figure;
% histogram(scores);
% title('Alignment Scores of Randomly Permuted Coding Regions');
if (score > threshold_score)
disp(' ');
disp(alignment);
pval = length(find(scores >= score)) / numrand;
disp(['Frame: ' num2str(frames(j)) ', Human: ' num2str(idx(1)) ' to ' num2str(idx(2)) ', Aardvark: ' num2str(start(k)) ' to ' num2str(stop(k)) ', p-value: ' num2str(pval)]);
end
end
end
end
end
%% Question 12
aa = nt2aa(mt.Sequence,'GENETICCODE', geneticcode);
humanaa = nt2aa(humanmt.Sequence, 'GENETICCODE', geneticcode, 'ACGTOnly', false);
% Global alignment using Needleman-Wunsch
[score, alignment] = nwalign(aa, humanaa, 'ALPHABET', 'AA', 'SCORINGMATRIX', 'BLOSUM62');
showalignment(alignment);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment