Last active
December 17, 2015 18:19
-
-
Save sld/5652006 to your computer and use it in GitHub Desktop.
ROSALIND 20 # Ruby
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'set' | |
require_relative 'graph_and_ant_colony' # https://gist.github.com/sld/2711080 | |
class String | |
def indexes letter | |
(0 .. self.length - 1).find_all { |i| self[i,1] == letter } | |
end | |
end | |
class Array | |
def swap ind1, ind2 | |
tmp = self[ind1] | |
self[ind1] = self[ind2] | |
self[ind2] = tmp | |
end | |
end | |
# Task names starts by " Task: TASK_NAME" | |
# Example: "# Task: Computing GC Content " | |
module Rosalind20 | |
def count_dna_nucliotides( dna_string ) | |
{'A' => dna_string.count('A'), 'C' => dna_string.count('C'), 'G' => dna_string.count('G'), 'T' => dna_string.count('T')} | |
end | |
def dna_to_rna( dna_string ) | |
dna_string.tr "T", "U" | |
end | |
def reverse_complement( dna_string ) | |
dna_string.reverse.tr("A","Z").tr("C","Q").tr("T","A").tr("G","C").tr("Q","G").tr("Z","T") | |
end | |
def parse_fasta_file(filename) | |
fasta_pairs = {} | |
fasta_id = nil | |
File.open(filename).each_line do |line| | |
if line[0] == '>' | |
fasta_id = line.chomp | |
fasta_id[0] = "" | |
fasta_pairs[fasta_id] = "" | |
else | |
fasta_pairs[fasta_id] += line.chomp | |
end | |
end | |
return fasta_pairs | |
end | |
def parse_adjacency_list_file filename | |
graph_data = {:nodes_count => nil, :adjacency_list => []} | |
File.open(filename).each_line do |line| | |
splitted_line = line.split | |
if splitted_line.count == 2 | |
graph_data[:adjacency_list] << [splitted_line[0].to_i, splitted_line[1].to_i] | |
else | |
graph_data[:nodes_count] = splitted_line.first.to_i | |
end | |
end | |
return graph_data | |
end | |
def parse_set_file filename | |
lines = File.open(filename).lines.to_a | |
universum_set = (1..lines[0].to_i).to_a | |
lines[1].delete! "{}" | |
lines[2].delete! "{}" | |
set1 = lines[1].split(",").map{|e| e.to_i} | |
set2 = lines[2].split(",").map{|e| e.to_i} | |
return [universum_set.to_set, set1.to_set, set2.to_set] | |
end | |
def calc_gc(dna_string) | |
100.0 * ( dna_string.count('G') + dna_string.count('C') ) / dna_string.length | |
end | |
# Task: Computing GC Content | |
def find_max_gc( fasta_pairs ) | |
max_fasta_pair = fasta_pairs.max_by{ |fasta_id, dna_string| calc_gc(dna_string) } | |
return [max_fasta_pair[0], calc_gc(max_fasta_pair[1])] | |
end | |
def get_substring_data dna_string, substring_length, substring_ind | |
start_ind = substring_ind | |
end_ind = start_ind + substring_length | |
{:string => dna_string[start_ind...end_ind], :start_ind => start_ind + 1} | |
end | |
# Task: Finding a Motif in DNA | |
def get_substring_start_indexes( dna_string, substring ) | |
indexes = [] | |
legal_substring_count = dna_string.length - substring.length | |
for i in 0...legal_substring_count | |
substring_data = get_substring_data(dna_string, substring.length, i) | |
substring_data[:string] == substring ? indexes << substring_data[:start_ind] : nil | |
end | |
return indexes | |
end | |
# Task: A Brief Introduction to Graph Theory | |
def make_directed_graph fasta_pairs, k | |
graph = [] | |
fasta_pairs.each do |top_fasta_id, top_dna_string| | |
fasta_pairs.each do |fasta_id, dna_string| | |
if top_dna_string[-k..-1] == dna_string[0...k] && top_fasta_id != fasta_id | |
graph << [top_fasta_id, fasta_id] | |
end | |
end | |
end | |
return graph | |
end | |
# ("AGACCTGCCG", "CCTGCCGGAA") => CCTGCCG | |
def find_start_substring( src_str, dst_str ) | |
start_substring = "" | |
indexes = dst_str.indexes(src_str[-1]).reverse | |
indexes.each do |ind| | |
if src_str[-1-ind..-1] == dst_str[0..ind] | |
start_substring = dst_str[0..ind] | |
break | |
end | |
end | |
return start_substring | |
end | |
def make_overlap_graph(fasta_pairs) | |
overlap_graph = [] | |
strings_arr = fasta_pairs.values | |
for i in 0...strings_arr.length | |
for j in i+1...strings_arr.length | |
if strings_arr[i][strings_arr[j]] || strings_arr[j][strings_arr[i]] | |
next | |
else | |
start_substring = find_start_substring( strings_arr[i], strings_arr[j] ) | |
end_substring = find_start_substring( strings_arr[j], strings_arr[i] ) | |
overlap_graph << {:src => strings_arr[i], :dst => strings_arr[j], :to => start_substring, :from => end_substring} | |
end | |
end | |
end | |
return overlap_graph | |
end | |
def get_edges_from_overlap_graph overlap_graph | |
nodes = Set.new | |
edges = Set.new | |
overlap_graph.each do |edge| | |
node_src = nodes.find{|e| e.name == edge[:src]} || Node.new(edge[:src]) | |
node_dst = nodes.find{|e| e.name == edge[:dst]} || Node.new(edge[:dst]) | |
nodes << node_dst | |
nodes << node_src | |
if edge[:to].length > 0 | |
new_edge = Edge.new(node_src, node_dst, -edge[:to].length) | |
edges << new_edge if !edges.find{|e| e == new_edge} | |
end | |
if edge[:from].length > 0 | |
new_edge = Edge.new(node_dst, node_src, -edge[:from].length) | |
edges << new_edge if !edges.find{|e| e == new_edge} | |
end | |
end | |
return edges | |
end | |
def get_superstring_from_route route | |
superstring = "" | |
route.edges.each_with_index do |edge, i| | |
src_str = edge.from.name | |
dst_str = edge.to.name | |
substr_weight = edge.weight | |
if i == 0 | |
superstring = src_str.clone | |
superstring[substr_weight..-1] = dst_str | |
else | |
superstring[substr_weight..-1] = dst_str | |
end | |
end | |
return superstring | |
end | |
def check_superstring superstring, test_strings | |
test_strings.each do|str| | |
return "Uncorrect superstring!" if !superstring[str] | |
end | |
return "Good superstring!" | |
end | |
# Task: Genome Assembly as Shortest Superstring | |
def get_best_route(overlap_graph, need_print = false) | |
good_edges = overlap_graph.get_more_than_half_weight_edges | |
good_graph = Graph.new good_edges, :bidirectional => false, :sort_edges => true | |
superstring = get_superstring_from_route(good_graph) | |
p superstring.length | |
p superstring if need_print | |
end | |
# Task: Counting Point Mutations | |
def hamming_distance str1, str2 | |
distance = 0 | |
for i in 0...str1.length | |
if str1[i] != str2[i] | |
distance += 1 | |
end | |
end | |
return distance | |
end | |
def get_corrects dna_strings | |
appears = {} | |
for i in 0...dna_strings.length | |
for j in i+1...dna_strings.length | |
appears[dna_strings[i]] ||= 0 | |
if dna_strings[i] == dna_strings[j] || reverse_complement(dna_strings[i]) == dna_strings[j] | |
appears[dna_strings[i]] += 1 | |
end | |
end | |
end | |
appears.find_all{|str, appears_count| appears_count > 0}.collect{|arr| arr[0]} | |
end | |
def get_corrections(incorrects, corrects) | |
corrections = [] | |
incorrects.each do |incorrect| | |
corrects.each do |correct| | |
ham_dist = hamming_distance(incorrect, correct) | |
if ham_dist == 1 | |
corrections << [incorrect, correct] | |
break | |
else | |
correct_complement = reverse_complement(correct) | |
compl_ham_dist = hamming_distance(incorrect, correct_complement) | |
if compl_ham_dist == 1 | |
corrections << [incorrect, correct_complement] | |
break | |
end | |
end | |
end | |
end | |
return corrections | |
end | |
# Task: Error Correction in Reads | |
def get_corrections_from_strings(dna_strings) | |
corrects = get_corrects(dna_strings) | |
incorrects = dna_strings - corrects | |
corrections = get_corrections(incorrects, corrects) | |
corrections.each{ |(incorrect, correction)| puts "#{incorrect}->#{correction}" } | |
end | |
def codon_table | |
codon_table_arr = "UUU F CUU L AUU I GUU V | |
UUC F CUC L AUC I GUC V | |
UUA L CUA L AUA I GUA V | |
UUG L CUG L AUG M GUG V | |
UCU S CCU P ACU T GCU A | |
UCC S CCC P ACC T GCC A | |
UCA S CCA P ACA T GCA A | |
UCG S CCG P ACG T GCG A | |
UAU Y CAU H AAU N GAU D | |
UAC Y CAC H AAC N GAC D | |
UAA Stop CAA Q AAA K GAA E | |
UAG Stop CAG Q AAG K GAG E | |
UGU C CGU R AGU S GGU G | |
UGC C CGC R AGC S GGC G | |
UGA Stop CGA R AGA R GGA G | |
UGG W CGG R AGG R GGG G".split.each_slice(2).to_a | |
codon_table_hash = Hash[codon_table_arr] | |
codon_table_hash["UAA"] = "" | |
codon_table_hash["UAG"] = "" | |
codon_table_hash["UGA"] = "" | |
return codon_table_hash | |
end | |
# Task: Translating RNA into Protein | |
def rna_into_protein rna_string | |
rna_string.chars.each_slice(3).map(&:join).map{|str3| codon_table[str3]}.join | |
end | |
def permutations(array, permutations_array, ind, permutation_ind) | |
if permutation_ind == array.length-1 | |
return permutations_array | |
else | |
for i in ind...array.length | |
new_arr = array.clone | |
new_arr.swap(ind, i) | |
permutations_array << new_arr | |
permutations(new_arr, permutations_array, ind+1, permutation_ind+1) | |
end | |
end | |
return permutations_array.uniq | |
end | |
# Task: RNA Splicing | |
def rna_without_introns dna_string, introns | |
introns.each{ |intron| dna_string[intron] = "" } | |
rna_into_protein( dna_to_rna( dna_string ) ) | |
end | |
# Task: Enumerating Gene Orders | |
def permutations_of_length length | |
array = (1..length).to_a | |
total_length = array.inject(1){|s,e| s*e} | |
permutations = permutations(array, [], 0, 0) | |
file = File.new 'perm', 'w' | |
file.puts "#{total_length}" | |
permutations.each{|perm| file.puts perm.join(" ")} | |
end | |
# Task: Enumerating k-mers Lexicographically | |
def k_mers(array, k) | |
array.repeated_permutation(k).each{|sub_arr| puts sub_arr.join} | |
end | |
# Task: k-Mer Composition | |
def composition_4mer( dna_string ) | |
cnt = 4 | |
permutations = Hash[['A', 'C', 'G', 'T'].repeated_permutation(cnt).map{|e| [e.join, 0]}] | |
for i in 0..(dna_string.length - cnt) | |
permutations[dna_string[i...i+cnt]] += 1 if permutations[dna_string[i...i+cnt]] | |
end | |
return permutations.values | |
end | |
# Task: Inferring mRNA from Protein | |
def protein_to_rna_possible_count( protein_string, mod = 1000000 ) | |
rna_freq_table = {} | |
codon_table.values.each do |elem| | |
rna_freq_table[elem] ||= 0 | |
rna_freq_table[elem] += 1 | |
end | |
possible_count = 1 | |
protein_string.each_char do |char| | |
possible_count = rna_freq_table[char] * possible_count | |
end | |
return ( ( possible_count * rna_freq_table[""] ) % mod ) | |
end | |
# Task: Mendel's First Law | |
# k - dominant homo; m - hetero; n - recessive homo | |
def dominant_probability( k, m, n) | |
population = k + m + n | |
(k / population) * ((k-1) / (population - 1) + m / (population - 1) + n / (population - 1)) + | |
(m / population) * (k / (population - 1) + 0.75*(m-1) / (population - 1) + 0.5*n / (population - 1)) + | |
(n / population) * (k / (population - 1) + 0.5*m / (population - 1)) | |
end | |
# Task: Completing a Tree | |
def min_edges_to_be_tree adjacency_list, nodes_count | |
nodes_groups = Array.new( nodes_count ) | |
(1..nodes_count).to_a.each{ |i| nodes_groups[i-1] = [i].to_set } | |
adjacency_list.each do |(node_from, node_to)| | |
has_group = false | |
for i in 0...nodes_groups.length | |
node_group = nodes_groups[i] | |
if node_group.include?(node_from) || node_group.include?(node_to) | |
has_group = true | |
node_group << node_from | |
node_group << node_to | |
end | |
end | |
nodes_groups << [node_from, node_to].to_set if !has_group | |
end | |
(1..nodes_count).to_a.each do |node| | |
to_merge_groups = [] | |
nodes_groups.each_with_index do |node_group, i| | |
to_merge_groups << i if node_group.include?(node) | |
end | |
merged_group = [].to_set | |
to_merge_groups.each{|ind| merged_group += nodes_groups[ind]; nodes_groups[ind] = nil} | |
nodes_groups.compact! | |
nodes_groups << merged_group | |
end | |
return (nodes_groups.count - 1) | |
end | |
# Task: Counting Subsets | |
def subsets_count n | |
2**n % 1000000 | |
end | |
# Task: Introduction to Set Operations | |
def set_operations set1, set2, universum_set | |
{:union => set1 + set2, :intersection => set1 & set2, :difference1 => set1 - set2, :difference2 => set2 - set1, | |
:complement1 => universum_set - set1, :complement2 => universum_set - set2} | |
end | |
# Task: Constructing a De Bruijn Graph | |
def make_de_bruijn dna_strings | |
for i in 0...dna_strings.length | |
dna_strings << reverse_complement(dna_strings[i]) | |
end | |
dna_strings.sort! | |
dna_strings = dna_strings.to_set | |
k = dna_strings.first.length - 1 | |
edges = Set.new | |
nodes = {} | |
dna_strings.each do |dna_str| | |
nodes[dna_str[0...k]] ||= Node.new dna_str[0...k] | |
nodes[dna_str[1...k+1]] ||= Node.new dna_str[1...k+1] | |
edges << Edge.new( nodes[dna_str[0...k]], nodes[dna_str[1...k+1]], 0) | |
end | |
return edges | |
end | |
end | |
include Rosalind20 | |
dna_strings = %w( | |
TGAT | |
CATG | |
TCAT | |
ATGC | |
CATC | |
CATC | |
) | |
edges = make_de_bruijn( dna_strings ) | |
edges.each {|edge| puts edge.to_s(:with_brackets => true)} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment