Created
May 1, 2021 21:22
-
-
Save chilampoon/ac3e7b473773708fc4a3053479e22fa1 to your computer and use it in GitHub Desktop.
2-state Hidden Markov Model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 2-state Hidden Markov Model\n", | |
"_Chi-Lam Poon_\n", | |
"\n", | |
"This program implements a 2-state HMM for detecting G+C-rich regions in the Vibrio cholerae IEC224 chromosome II sequence. Conceptually, state 1 in the HMM would correspond to the more frequent 'A+T rich' state, whereas state 2 would correspond to the less frequent G+C rich state.\n", | |
"\n", | |
"The starting parameter values should be as follows:\n", | |
"\n", | |
"- Transition probabilities a_11 = .999, a_12 = .001, a_21 = .01, a_22 =\n", | |
".99.\n", | |
"- Initiation probabilities for each state (i.e. the transition probabilities from the 'begin' state into state 1 or 2) should be .996 for state 1, and .004 for state 2; these should be held fixed throughout the Viterbi training\n", | |
"- Emission probabilities (which should also be held fixed) are \n", | |
" 1. e_A = e_T = .291, e_G = e_C = .209 for state1\n", | |
" 2. e_A = e_T = .169, e_G = e_C = .331 for state2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import random \n", | |
"\n", | |
"class Edge(object):\n", | |
" def __init__(self, start, end):\n", | |
" self.start = start\n", | |
" self.end = end\n", | |
" \n", | |
" def __str__(self):\n", | |
" return str(self.start) + \"->\" + str(self.end)\n", | |
"\n", | |
" \n", | |
"class Node(object):\n", | |
" '''equals to state; stateName,{emissions}'''\n", | |
" def __init__(self, name):\n", | |
" self.name = name\n", | |
" self.edges = []\n", | |
" \n", | |
" def AddEdge(self, edge):\n", | |
" print(\"Adding to \" + self.name)\n", | |
" self.edges.append(edge)\n", | |
" \n", | |
" def __eq__(self, other):\n", | |
" return self.name == other.name\n", | |
" \n", | |
" def __hash__(self):\n", | |
" return hash(self.name)\n", | |
"\n", | |
" \n", | |
"class Transition(Edge):\n", | |
" '''Connect between states, = transition'''\n", | |
" def __init__(self, start, end, prob):\n", | |
" Edge.__init__(self, start, end)\n", | |
" self.prob = prob\n", | |
"\n", | |
"\n", | |
"class State(Node):\n", | |
" def __init__(self, name, emissions, initial = False):\n", | |
" Node.__init__(self, name)\n", | |
" self.emissions = emissions\n", | |
" self.is_initial = initial\n", | |
" \n", | |
" def GetEmission(self):\n", | |
" r = random.random()\n", | |
" current_prob = 0.0\n", | |
" for state, prob in self.emissions.items():\n", | |
" current_prob += prob\n", | |
" if current_prob > r:\n", | |
" return state\n", | |
" return None\n", | |
" \n", | |
" def GetTransition(self):\n", | |
" r = random.random()\n", | |
" current_prob = 0.0\n", | |
" for edge in self.edges:\n", | |
" current_prob += edge.prob\n", | |
" if current_prob > r:\n", | |
" return edge.end\n", | |
" return None\n", | |
"\n", | |
" \n", | |
"class Graph(object):\n", | |
" def __init__(self):\n", | |
" self.nodes = {}\n", | |
" self.edges = {}\n", | |
" \n", | |
" def add_node(self, node):\n", | |
" self.nodes[node] = node\n", | |
" self.edges[node] = []\n", | |
" \n", | |
" def add_edge(self, edge):\n", | |
" self.nodes[edge.start].AddEdge(edge)\n", | |
" if edge.start not in self.edges.keys():\n", | |
" self.edges[edge.start] = []\n", | |
" self.edges[edge.start].append(edge) \n", | |
" \n", | |
" def get_edges(self, node):\n", | |
" return self.edges[node]\n", | |
" \n", | |
" def __str__(self):\n", | |
" return \"Nodes: \" + str(self.nodes) + \"\\n\" + \"Edges: \" + str(self.edges) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from math import log\n", | |
"\n", | |
" \n", | |
"class HMM(Graph):\n", | |
" def __init__(self):\n", | |
" Graph.__init__(self)\n", | |
" self.current_state = None\n", | |
" \n", | |
" def MakeStep(self):\n", | |
" self.MakeTransition()\n", | |
" self.MakeEmission()\n", | |
" \n", | |
" def MakeEmission(self):\n", | |
" print(self.current_state.GetEmission())\n", | |
" \n", | |
" def MakeTransition(self):\n", | |
" new_state = self.nodes[self.current_state.GetTransition()]\n", | |
" print(\"From \" + self.current_state.name + \" to \" + new_state.name)\n", | |
" self.current_state = new_state\n", | |
" \n", | |
" def RandomWalk(self, start_node, length):\n", | |
" self.current_state = self.nodes[start_node]\n", | |
" for i in range(length):\n", | |
" self.MakeStep()\n", | |
" \n", | |
" def Viterbi(self, genome):\n", | |
" matrix = {}\n", | |
" backtrack = {}\n", | |
" # initiation\n", | |
" for name, state in self.nodes.items():\n", | |
" if not state.is_initial:\n", | |
" matrix[name] = [float('-inf') for i in range(len(genome) + 1)]\n", | |
" backtrack[name] = [\"\" for i in range(len(genome) + 1)]\n", | |
" \n", | |
" \n", | |
" for name, state in self.nodes.items():\n", | |
" if state.is_initial:\n", | |
" for edge in state.edges:\n", | |
" matrix[edge.end][0] = log(edge.prob)\n", | |
" backtrack[edge.end][0] = \"initial\"\n", | |
" \n", | |
" \n", | |
" for s in range(1, len(genome)+1):\n", | |
" nt = genome[s-1]\n", | |
" final_state = ''\n", | |
" final_state_val = float('-inf')\n", | |
" for curr_name, curr_state in self.nodes.items():\n", | |
" if curr_state.is_initial: \n", | |
" continue \n", | |
" \n", | |
" if s == 1:\n", | |
" matrix[curr_name][s] = mlog(log(curr_state.emissions[nt]), matrix[curr_name][s-1])\n", | |
" backtrack[curr_name][s] = curr_name.name\n", | |
" continue\n", | |
" \n", | |
" for prev_name, prev_state in self.nodes.items():\n", | |
" if prev_state.is_initial:\n", | |
" continue\n", | |
" \n", | |
" # get the transition probability \n", | |
" tmp_a = [edge.prob for edge in prev_state.edges if edge.end == curr_name][0]\n", | |
" s_curr = mlog(log(curr_state.emissions[nt]), mlog(matrix[prev_name][s-1], log(tmp_a)))\n", | |
" \n", | |
" # get the max value\n", | |
" if s_curr > matrix[curr_name][s]:\n", | |
" matrix[curr_name][s] = s_curr\n", | |
" backtrack[curr_name][s] = prev_name.name\n", | |
" \n", | |
" if s == len(genome) and matrix[curr_name][s] > final_state_val:\n", | |
" final_state_val = matrix[curr_name][s]\n", | |
" final_state = curr_name.name\n", | |
"\n", | |
" \n", | |
" # termination\n", | |
" states_seq = [final_state]\n", | |
" \n", | |
" # backtracking\n", | |
" for rs in reversed(range(1, len(genome))):\n", | |
" prev_state = [backtrack[name][rs+1] for name in backtrack if name.name == states_seq[0]][0]\n", | |
" states_seq.insert(0, prev_state)\n", | |
" \n", | |
" self.numOfStates(states_seq)\n", | |
" segments, switch = self.numOfSegments(states_seq)\n", | |
" new_probs = self.newProbs(switch)\n", | |
" \n", | |
" return states_seq, segments, new_probs\n", | |
"\n", | |
"\n", | |
" def numOfStates(self, states_seq):\n", | |
" for name, state in self.nodes.items():\n", | |
" if not state.is_initial:\n", | |
" print('State ' + name.name + \"'s occurence = \" + str(states_seq.count(name.name)))\n", | |
" \n", | |
" def numOfSegments(self, states_seq):\n", | |
" segments = {}\n", | |
" seg_start = 0\n", | |
" switch = {}\n", | |
" \n", | |
" state = states_seq[0]\n", | |
" segments[state] = []\n", | |
" switch[(state, state)] = 0\n", | |
"\n", | |
" for i in range(1, len(states_seq)):\n", | |
" new_state = states_seq[i]\n", | |
"\n", | |
" if new_state != state:\n", | |
" # transition occurred\n", | |
" if new_state not in segments:\n", | |
" segments[new_state] = []\n", | |
" if (state, new_state) not in switch:\n", | |
" switch[(state, new_state)] = 0\n", | |
" if (new_state, new_state) not in switch:\n", | |
" switch[(new_state, new_state)] = 0\n", | |
" \n", | |
" segments[state].append((seg_start, i-1))\n", | |
" switch[(state, new_state)] += 1\n", | |
" seg_start = i\n", | |
" state = new_state\n", | |
" else:\n", | |
" switch[(state, new_state)] += 1\n", | |
" if i == len(states_seq) - 1:\n", | |
" # end at the end if nothing happened\n", | |
" segments[state].append((seg_start, i))\n", | |
" \n", | |
" # print the info\n", | |
" for state in segments:\n", | |
" print('Num. of segments in state ' + state + ' = ' + str(len(segments[state])))\n", | |
" \n", | |
" return segments,switch\n", | |
"\n", | |
" def newProbs(self, switch):\n", | |
" # switch is like {('1', '1'): 33, ('1', '2'): 1, ('2', '2'): 37, ('2', '1'): 1}\n", | |
" probs = {}\n", | |
" \n", | |
" for name, state in self.nodes.items():\n", | |
" if not state.is_initial:\n", | |
" probs[name.name] = {}\n", | |
" # count total switches for one node\n", | |
" node_total = 0\n", | |
" for next_name, next_state in self.nodes.items():\n", | |
" if not next_state.is_initial:\n", | |
" if (name.name, next_name.name) in switch:\n", | |
" node_total += switch[(name.name, next_name.name)]\n", | |
" else:\n", | |
" node_total += 0\n", | |
" # calculate\n", | |
" for next_name, next_state in self.nodes.items():\n", | |
" if not next_state.is_initial:\n", | |
" if (name.name, next_name.name) in switch: \n", | |
" prob = switch[(name.name, next_name.name)] / node_total\n", | |
" print(str((name.name, next_name.name)) + \"'s new prob = \" + str(round(prob, 4)))\n", | |
" probs[name.name][next_name.name] = prob\n", | |
" else:\n", | |
" probs[name.name][next_name.name] = 0\n", | |
" return probs\n", | |
"\n", | |
"def mlog(x, y):\n", | |
" return x + y" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Add parameters to the HMM model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Adding to initial\n", | |
"Adding to initial\n", | |
"Adding to 1\n", | |
"Adding to 1\n", | |
"Adding to 2\n", | |
"Adding to 2\n" | |
] | |
} | |
], | |
"source": [ | |
"g = HMM()\n", | |
"g.add_node(State(\"initial\", {}, True))\n", | |
"# emissions\n", | |
"g.add_node(State(\"1\", {\"A\" : 0.291, \"T\" : 0.291, \"G\" : 0.209, \"C\" : 0.209}))\n", | |
"g.add_node(State(\"2\", {\"A\" : 0.169, \"T\" : 0.169, \"G\" : 0.331, \"C\" : 0.331}))\n", | |
"# initial transitions\n", | |
"g.add_edge(Transition(State(\"initial\", {}), State(\"1\", {}), 0.996))\n", | |
"g.add_edge(Transition(State(\"initial\", {}), State(\"2\", {}), 0.004))\n", | |
"\n", | |
"g.add_edge(Transition(State(\"1\", {}), State(\"2\", {}), 0.001))\n", | |
"g.add_edge(Transition(State(\"1\", {}), State(\"1\", {}), 0.999))\n", | |
"g.add_edge(Transition(State(\"2\", {}), State(\"1\", {}), 0.01))\n", | |
"g.add_edge(Transition(State(\"2\", {}), State(\"2\", {}), 0.99))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"State 1's occurence = 35\n", | |
"State 2's occurence = 36\n", | |
"Num. of segments in state 1 = 2\n", | |
"Num. of segments in state 2 = 1\n", | |
"('1', '1')'s new prob = 0.9706\n", | |
"('1', '2')'s new prob = 0.0294\n", | |
"('2', '1')'s new prob = 0.0278\n", | |
"('2', '2')'s new prob = 0.9722\n" | |
] | |
} | |
], | |
"source": [ | |
"# have a try\n", | |
"seq = 'GGAAATTGTTAATATATATGGGGGCGCGCCCACCGGGGCGCGGCTGCGGCGCGCGTTATATATATTCTTTT'\n", | |
"states_seq, segments, new_probs = g.Viterbi(seq)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'1': {'1': 0.9705882352941176, '2': 0.029411764705882353},\n", | |
" '2': {'1': 0.027777777777777776, '2': 0.9722222222222222}}" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"new_probs" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Parse the fasta\n", | |
"\n", | |
"Downloaded from https://www.ncbi.nlm.nih.gov/nuccore/CP003331" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"File name: ./IEC224_chr2.fasta\n", | |
">CP003331.1 Vibrio cholerae IEC224 chromosome II, complete sequence\n" | |
] | |
} | |
], | |
"source": [ | |
"def parse_fasta(fasta_file):\n", | |
" file = open(fasta_file, 'r')\n", | |
" fasta = ''\n", | |
" \n", | |
" for line in file:\n", | |
" if line.startswith('>') == False:\n", | |
" fasta += line.replace('\\n','')\n", | |
" else:\n", | |
" print(line.replace('\\n',''))\n", | |
" return fasta\n", | |
"\n", | |
"\n", | |
"fasta_file = './IEC224_chr2.fasta'\n", | |
"print('File name: ' + fasta_file)\n", | |
"fa = parse_fasta(fasta_file)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Iteration 1...\n", | |
"State 1's occurence = 1061417\n", | |
"State 2's occurence = 10719\n", | |
"Num. of segments in state 1 = 48\n", | |
"Num. of segments in state 2 = 47\n", | |
"('1', '1')'s new prob = 1.0\n", | |
"('1', '2')'s new prob = 0.0\n", | |
"('2', '1')'s new prob = 0.0044\n", | |
"('2', '2')'s new prob = 0.9956\n", | |
"{'1': {'1': 0.9999557195293833, '2': 4.428047061661027e-05}, '2': {'1': 0.004384737382218491, '2': 0.9956152626177815}}\n", | |
"Adding to initial\n", | |
"Adding to initial\n", | |
"Adding to 1\n", | |
"Adding to 1\n", | |
"Adding to 2\n", | |
"Adding to 2\n", | |
"\n", | |
"\n", | |
"Iteration 2...\n", | |
"State 1's occurence = 1067281\n", | |
"State 2's occurence = 4855\n", | |
"Num. of segments in state 1 = 12\n", | |
"Num. of segments in state 2 = 11\n", | |
"('1', '1')'s new prob = 1.0\n", | |
"('1', '2')'s new prob = 0.0\n", | |
"('2', '1')'s new prob = 0.0023\n", | |
"('2', '2')'s new prob = 0.9977\n", | |
"{'1': {'1': 0.9999896934262799, '2': 1.0306573720110937e-05}, '2': {'1': 0.0022657054582904223, '2': 0.9977342945417096}}\n", | |
"Adding to initial\n", | |
"Adding to initial\n", | |
"Adding to 1\n", | |
"Adding to 1\n", | |
"Adding to 2\n", | |
"Adding to 2\n", | |
"\n", | |
"\n", | |
"Iteration 3...\n", | |
"State 1's occurence = 1068265\n", | |
"State 2's occurence = 3871\n", | |
"Num. of segments in state 1 = 8\n", | |
"Num. of segments in state 2 = 7\n", | |
"('1', '1')'s new prob = 1.0\n", | |
"('1', '2')'s new prob = 0.0\n", | |
"('2', '1')'s new prob = 0.0018\n", | |
"('2', '2')'s new prob = 0.9982\n", | |
"{'1': {'1': 0.9999934473126493, '2': 6.5526873506923385e-06}, '2': {'1': 0.0018083182640144665, '2': 0.9981916817359855}}\n", | |
"Adding to initial\n", | |
"Adding to initial\n", | |
"Adding to 1\n", | |
"Adding to 1\n", | |
"Adding to 2\n", | |
"Adding to 2\n", | |
"\n", | |
"\n", | |
"Iteration 4...\n", | |
"State 1's occurence = 1068256\n", | |
"State 2's occurence = 3880\n", | |
"Num. of segments in state 1 = 7\n", | |
"Num. of segments in state 2 = 6\n", | |
"('1', '1')'s new prob = 1.0\n", | |
"('1', '2')'s new prob = 0.0\n", | |
"('2', '1')'s new prob = 0.0015\n", | |
"('2', '2')'s new prob = 0.9985\n", | |
"{'1': {'1': 0.9999943833635228, '2': 5.616636477245601e-06}, '2': {'1': 0.0015463917525773195, '2': 0.9984536082474227}}\n", | |
"Adding to initial\n", | |
"Adding to initial\n", | |
"Adding to 1\n", | |
"Adding to 1\n", | |
"Adding to 2\n", | |
"Adding to 2\n", | |
"\n", | |
"\n", | |
"Iteration 5...\n", | |
"State 1's occurence = 1068256\n", | |
"State 2's occurence = 3880\n", | |
"Num. of segments in state 1 = 7\n", | |
"Num. of segments in state 2 = 6\n", | |
"('1', '1')'s new prob = 1.0\n", | |
"('1', '2')'s new prob = 0.0\n", | |
"('2', '1')'s new prob = 0.0015\n", | |
"('2', '2')'s new prob = 0.9985\n", | |
"{'1': {'1': 0.9999943833635228, '2': 5.616636477245601e-06}, '2': {'1': 0.0015463917525773195, '2': 0.9984536082474227}}\n", | |
"Adding to initial\n", | |
"Adding to initial\n", | |
"Adding to 1\n", | |
"Adding to 1\n", | |
"Adding to 2\n", | |
"Adding to 2\n", | |
"\n", | |
"\n", | |
"Iteration 6...\n", | |
"State 1's occurence = 1068256\n", | |
"State 2's occurence = 3880\n", | |
"Num. of segments in state 1 = 7\n", | |
"Num. of segments in state 2 = 6\n", | |
"('1', '1')'s new prob = 1.0\n", | |
"('1', '2')'s new prob = 0.0\n", | |
"('2', '1')'s new prob = 0.0015\n", | |
"('2', '2')'s new prob = 0.9985\n", | |
"{'1': {'1': 0.9999943833635228, '2': 5.616636477245601e-06}, '2': {'1': 0.0015463917525773195, '2': 0.9984536082474227}}\n", | |
"Adding to initial\n", | |
"Adding to initial\n", | |
"Adding to 1\n", | |
"Adding to 1\n", | |
"Adding to 2\n", | |
"Adding to 2\n", | |
"\n", | |
"\n", | |
"Iteration 7...\n", | |
"State 1's occurence = 1068256\n", | |
"State 2's occurence = 3880\n", | |
"Num. of segments in state 1 = 7\n", | |
"Num. of segments in state 2 = 6\n", | |
"('1', '1')'s new prob = 1.0\n", | |
"('1', '2')'s new prob = 0.0\n", | |
"('2', '1')'s new prob = 0.0015\n", | |
"('2', '2')'s new prob = 0.9985\n", | |
"{'1': {'1': 0.9999943833635228, '2': 5.616636477245601e-06}, '2': {'1': 0.0015463917525773195, '2': 0.9984536082474227}}\n", | |
"Adding to initial\n", | |
"Adding to initial\n", | |
"Adding to 1\n", | |
"Adding to 1\n", | |
"Adding to 2\n", | |
"Adding to 2\n", | |
"\n", | |
"\n", | |
"Iteration 8...\n", | |
"State 1's occurence = 1068256\n", | |
"State 2's occurence = 3880\n", | |
"Num. of segments in state 1 = 7\n", | |
"Num. of segments in state 2 = 6\n", | |
"('1', '1')'s new prob = 1.0\n", | |
"('1', '2')'s new prob = 0.0\n", | |
"('2', '1')'s new prob = 0.0015\n", | |
"('2', '2')'s new prob = 0.9985\n", | |
"{'1': {'1': 0.9999943833635228, '2': 5.616636477245601e-06}, '2': {'1': 0.0015463917525773195, '2': 0.9984536082474227}}\n", | |
"Adding to initial\n", | |
"Adding to initial\n", | |
"Adding to 1\n", | |
"Adding to 1\n", | |
"Adding to 2\n", | |
"Adding to 2\n", | |
"\n", | |
"\n", | |
"Iteration 9...\n", | |
"State 1's occurence = 1068256\n", | |
"State 2's occurence = 3880\n", | |
"Num. of segments in state 1 = 7\n", | |
"Num. of segments in state 2 = 6\n", | |
"('1', '1')'s new prob = 1.0\n", | |
"('1', '2')'s new prob = 0.0\n", | |
"('2', '1')'s new prob = 0.0015\n", | |
"('2', '2')'s new prob = 0.9985\n", | |
"{'1': {'1': 0.9999943833635228, '2': 5.616636477245601e-06}, '2': {'1': 0.0015463917525773195, '2': 0.9984536082474227}}\n", | |
"Adding to initial\n", | |
"Adding to initial\n", | |
"Adding to 1\n", | |
"Adding to 1\n", | |
"Adding to 2\n", | |
"Adding to 2\n", | |
"\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"for i in range(1, 10):\n", | |
" print('Iteration ' + str(i) + '...')\n", | |
" states_seq, segments, new_probs = g.Viterbi(fa)\n", | |
" print(new_probs)\n", | |
" \n", | |
" g = HMM()\n", | |
" g.add_node(State(\"initial\", {}, True))\n", | |
" # emissions\n", | |
" g.add_node(State(\"1\", {\"A\" : 0.291, \"T\" : 0.291, \"G\" : 0.209, \"C\" : 0.209}))\n", | |
" g.add_node(State(\"2\", {\"A\" : 0.169, \"T\" : 0.169, \"G\" : 0.331, \"C\" : 0.331}))\n", | |
" # initial transitions\n", | |
" g.add_edge(Transition(State(\"initial\", {}), State(\"1\", {}), 0.996))\n", | |
" g.add_edge(Transition(State(\"initial\", {}), State(\"2\", {}), 0.004))\n", | |
" \n", | |
" g.add_edge(Transition(State(\"1\", {}), State(\"2\", {}), new_probs[\"1\"][\"2\"]))\n", | |
" g.add_edge(Transition(State(\"1\", {}), State(\"1\", {}), new_probs[\"1\"][\"1\"]))\n", | |
" g.add_edge(Transition(State(\"2\", {}), State(\"1\", {}), new_probs[\"2\"][\"1\"]))\n", | |
" g.add_edge(Transition(State(\"2\", {}), State(\"2\", {}), new_probs[\"2\"][\"2\"]))\n", | |
" \n", | |
" print('\\n')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Iteration 10...\n", | |
"State 1's occurence = 1068256\n", | |
"State 2's occurence = 3880\n", | |
"Num. of segments in state 1 = 7\n", | |
"Num. of segments in state 2 = 6\n", | |
"('1', '1')'s new prob = 1.0\n", | |
"('1', '2')'s new prob = 0.0\n", | |
"('2', '1')'s new prob = 0.0015\n", | |
"('2', '2')'s new prob = 0.9985\n", | |
"{'1': {'1': 0.9999943833635228, '2': 5.616636477245601e-06}, '2': {'1': 0.0015463917525773195, '2': 0.9984536082474227}}\n" | |
] | |
} | |
], | |
"source": [ | |
"# one more time\n", | |
"for i in range(10, 11):\n", | |
" print('Iteration ' + str(i) + '...')\n", | |
" states_seq, segments, new_probs = g.Viterbi(fa)\n", | |
" print(new_probs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'1': [(0, 275666),\n", | |
" (276364, 666684),\n", | |
" (667662, 669450),\n", | |
" (670313, 746323),\n", | |
" (746563, 952633),\n", | |
" (953264, 1023943),\n", | |
" (1024419, 1072135)],\n", | |
" '2': [(275667, 276363),\n", | |
" (666685, 667661),\n", | |
" (669451, 670312),\n", | |
" (746324, 746562),\n", | |
" (952634, 953263),\n", | |
" (1023944, 1024418)]}" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"segments" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"GCCAACGAGCCAGTGACGATGCGCAGCCCGAAGGCGTGGCGGGGAAGCGGATAGTGAGATCGGCCGCGCTGGATTGTTCAATCCACGGTGTGCTGGAAGGGGCCGAAATCAGCAGCGCCATGCGCTGCTCGCCCAGCGTGAGCCAATAGACACCTGCGGCATACGGCGAGAGAAGATCGCGACTTTCCACTTCGGCGTAATCGGCAGGCGCAAGCTCTGCTGCTAACAAGGTTGCACCTTGCGAAGATGCCAGACGCACCGCCAGCGGCTGGCTGAGGTTTTCGGCGCTCAGCCTCACTTGGTAGCCTTGAAAGGTCTGATTGAACTTATGAATAAAAGTCACCGCAATCGCAAGATCGTTAAGCGATGCACTTGGCAGTGCTTGTCCACATTGGGTTTGACTGAGGATGGCCTCTTTAATTGGTTGCCAAAAAGGCTCGGCTACGGGCGTGGTAGACAGTGTGTGGATGAGCTCGTGCCACGCTTGCGTGGGATGCCCCTCAACCAATAGCTGGTAACTGCTTTGCAGCGGTGTGGTGCGAAACCAAGCATGGGTGGGGCTTGGCGGCGTTTCCGCTATCGCGGCGCCGCTTAGTAATGCGCTGAGTATGCCAAGTCGCGCAATGTGATTGCGAAAAGAGGCGCAGATCATACGGAGGCTCCTTTGAGGCGATAACCGACGCCGCGCAGGGTCTC\n", | |
"\n", | |
"GCGGGGCCACTGCGCGTTAACGGCTCTCATGCGCAAGGGGACTATTTGGTTCCGCTGGCCACCACAGAAGCGGCGTTAGTGGCGTCATACCATCGTGGTAGCCAACTCATCACGGCAGCAGGCGGCGCAAGTGCGCTACTGCTCAATGAAGGGGTGACTCGCACACCCGTGTTTGCTTTTCTTTCGCTGGCGCAAGCTGGGCAATTTGTCGGTTGGGTGACCAGCCAATTTGAGCAGATGAAAGAGATAGCGCAGTCCACCACCGCGCACGGCAAACTGAAAGATATTCAGGTCAATATCGAAGGGAATCACGTCTATCTGGTGTTTGAATACACCACCGGCGATGCCAGCGGTCAAAATATGGTGACGATCGCCACTCACGCTGTTTTTGAGTTCATCATGCGCCACTCACCGGTTGCGCCAGTGCAAGCCTTTCTCGATGGCAATCTCTCCGGTGACAAAAAAGCCAATAGCTATACGCTGCGCAGCGTGCGCGGCAAAAAAGTCAGCGCTGAGGTGCATCTCTCTGCCGAGTTAGTCAAAAAGTATCTCCACACCACGCCAGAGCAGATGGTGCAGTTTGGGCAAATGACGACAGTGGGCGGCGCACTGAGCGGCGCGATTGGTGTGAATGCTCATTACGCCAATGCCCTTGCCGCGCTTTATATTGCGTGTGGGCAAGATGCCGCGTGTGTAGCCGAATCCGCAATAGGGATGACGCGCATGGAGATCCACCCTCACGGCGGTTTATACGCCAGCGTCACCCTGCCTAATTTGATGGTCGGAACCGTTGGTGGTGGCACCCATCTCCCTAGCCAACACGCCTGTTTATCGCTGATGGGATTGGCCGGGCAAGGCCACGCGCGAGCCTTAGCTGAAGTGGCCGCAGCCCTCTGTTTAGCTGGTGAATTATCGATCGTCGGGGCATTTTGCGCGGGACATTTTTCCCGTGCGCACCATAAGCTCGCGCGCTGAG\n", | |
"\n", | |
"GGTGCGCGCCAGCACCCACGCCAGCACAATGCCTATCGGCAGCAACCACAAGGTCGCATACCCAGCCACTTTTAAACTCAGCGCCAGCGCCGCCCATTCGTAATCGGAGAGACCCATCAATTACTCACTCACCGACTCAAAACCAAAACGTTGCAAGATTTGCTGCGCGGCGTCTGAGCGTAAATACGCCACCCATTCGGCGCTCGCCGCTTTATCATTCAATTGCGCCAGCGGATAGCGGATCGGCTGGTGAGACTGAGCTGAAAATGCCGTCACGATCGTCACTTTATCACTGAGCAGCGCATCGGTTTTGTACACGATGCCAAGCGGCGATTCACCACGCTCGACCAACGCTAGCGCCAAACGCACGTTATTGGTTTGCGCAAGGCGTGATTCCAGCTCAGGCCACACCCCCGCATGTTGTAGCGCTTGCTTGGCATACATGCCCGCCGGCACCGCATCCACTTGCGCAACCGCCAAGCGAGAATCGGCCAAGGCCGTTTGCCAAGCCGCGGCATCTTGCAGCTCAAAACTTGCCACCGGCTGTGCAGTCGGGCGAATCAGCACCAAACTATTCGCCGCCAACGTCACCACCTTGTTCGGTTTCACTAAGCCTTTCTCGACTAAGTAATTCGCCCACTCTTCATTGGCGGAAATAAACAGATCCGCAGGTGCGCCCGCTTCAATTTGACGCGCCAGCGAAGAAGAGCCGCCGTACACCGTCACCAGCTTCACATCATGCTGCTGTGAGTAGTCGGCCACTAACGCATTGACGGCATTGGTCATCGACGAGGCCGCGTAAATGTGCAGTGTCTCTTTCGCGCTCGCCAGCGGCGCGTTAAGCGCCACACACAGCAGGAG\n", | |
"\n", | |
"CGCGACGACGGCTATGGCCACGCTCACCACGGTAGCTACCACTGCTCTCACCACTACGGTTGCGCTCAAAACGACGCTCACCTTCACGGAAAGGACGGCCTTCGCCACCACGGCCACGACCACCTTCGCGGCCGCCTTCACGACCACCACGGAAACCGCCTTCACGGTTGCCTTCTGGACGACGACCACCTTCGCGACGTGGACCACCTTCACGACGAGCACCACCACGAGATTCGCG\n", | |
"\n", | |
"CGCCCCCGGAGACACACGCAGCGCAGTGCGCTCTGCGCCAATACGAGCGATCACCGCATCCACTACTTCCAAAGCAAAACGCGCCATATTGGCCGGCGTTTCACCGTACTCATCGCTGCGTTGGTTAGAATCAAAATGCAGGAACTGGTCGATGAGATAGCCGTTCGCGCCATGAATCTCGACTCCATCAAAACCCGCCAAGCGTGCATTTTCCGCCGCTTGTGCGTAATCGGCCACCAGTTGCTTGATCTCCGCTTGGCTCGCCGCTTTCGGCACTGTGTACTGCAATTCGCGGCGGCGTGGCACGCTGCCTTCGACACCGAGAGCTGAAGGCGCTAGAACGTATTCACCCGCAAAAAAGGCAGGGTGAGCCACGCGTCCGGTATGCCATAACTGAGCGAAAATCTTACCGCCATTGGCATGCACGGCGTCCGTTACTTTTTTCCAGCCGGCAATTTGCGCTTGCGTAAACAGCCCCGGAGTGTTCGGGTAGCCTTGAGCATCGGGGCGAATAATGGTGGCTTCGGAAATGATCAGCCCAGCATCGGCACGGCGTGCGTAATACGCCACCATGGCATCGGTCGGCACCAAATCGTCATCAGCCATGCAGCGCGTCAGTGGCGCCATCG\n", | |
"\n", | |
"GCACACGCAGTGCCGAACAGCGTTGGCCAGCAGAGGCAAAAGCGGAACGCAGCACATCACGCACCACTTGCTCTGGCAGCGCGGTACTGTCGACAATCATGGCGTTTTGACCGCCAGTTTCTGCAATAAATGGTACCGGAGCCGCTTCACGTTGCGCTAACGTTTGGTTAATGCGCTGCGCCGTGGCGGTCGAGCCAGTAAACGCCACACCCGCGATGGCCGGATGAGAAGTGAGTGCGCTGCCGATATCGGCACCGCGTCCCGGCAAAAGTTGGATGGTTCCAGCAGGGAAACCCGCTTCTTGCATCAGCTCAACCGCGCGATACGCGATCAAACTGGTTTGCTCGGCAGGTTTGGCAATCACGGTATTTCCGGCCACCAAGGCGGCACTGATTTGGCCAAGGAAAATCGCCAGCGGGAAGTTCCACGGACTAATACACACAAACACTCCGCGACCTTGGCGGCTGACACGAC\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"for region in segments['2']:\n", | |
" print(fa[region[0]:region[1]]+'\\n')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment