Created
September 13, 2020 15:13
-
-
Save SaiNikhileshReddy/7307725870cc11f5bdcb3915dbe8b248 to your computer and use it in GitHub Desktop.
[3D] Construct the de Bruijn Graph of a String
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# (3D) Construct the de Bruijn Graph of a String" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Reference : http://rosalind.info/problems/ba3d/" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Input : \"Read\" strings\n", | |
"#### Output : Final Genome String" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def PatternFinder(reads, pattern):\n", | |
" output = set()\n", | |
" for read in reads:\n", | |
" if read.startswith(pattern):\n", | |
" output.add(read)\n", | |
" return sorted(output)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def DeBruijn(small_reads):\n", | |
" data = {}\n", | |
" for kmer in small_reads:\n", | |
" overlap = PatternFinder(small_reads,kmer[1:])\n", | |
" if len(overlap):\n", | |
" data[kmer] = overlap\n", | |
" \n", | |
" output = []\n", | |
" for key in sorted(data.keys()):\n", | |
" output.append(\"\"+key+\" -> \"+','.join(data[key])+\"\")\n", | |
" return output" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Download the sample dataset from this [link](http://bioinformaticsalgorithms.com/data/extradatasets/assembly/De_Bruijn_Graph_from_a_String.txt)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dataset = '(3D)_De_Bruijn_Graph_from_a_String' #this path is to sample dataset from rosalind to test whether the program works or not before doing the test" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Its not matching to original output.\n", | |
"[231] =====\n", | |
"Output : ATGTCTACCGT -> TGTCTACCGTA,TGTCTACCGTC\n", | |
"Actual_Output : ATGTCTACCGT -> TGTCTACCGTA\n", | |
"[331] =====\n", | |
"Output : CCACATTACTA -> CACATTACTAC\n", | |
"Actual_Output : CCACATTACTA -> CACATTACTAC,CACATTACTAC\n", | |
"[718] =====\n", | |
"Output : GTGTCTACCGT -> TGTCTACCGTA,TGTCTACCGTC\n", | |
"Actual_Output : GTGTCTACCGT -> TGTCTACCGTC\n" | |
] | |
} | |
], | |
"source": [ | |
"with open('Dataset_And_Answers\\\\'+dataset+'.txt') as file:\n", | |
" data = [line.rstrip() for line in file]\n", | |
"\n", | |
"kmer_size = int(data[1])\n", | |
"Genome = data[2]\n", | |
"small_reads = [Genome[index:(index+kmer_size-1)] for index in range(len(Genome)-kmer_size+2)]\n", | |
"Actual_Output = data[4:]\n", | |
"\n", | |
"output = DeBruijn(small_reads)\n", | |
"\n", | |
"if output == Actual_Output:\n", | |
" print('Output matches and program works.')\n", | |
"else:\n", | |
" print('Its not matching to original output.')\n", | |
" for (index,(a,b)) in enumerate(zip(output,Actual_Output),0):\n", | |
" if a != b:\n", | |
" print(f'[{index}] =====')\n", | |
" print('Output :',a)\n", | |
" print('Actual_Output :',b)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Working with test dataset." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Output Generated\n" | |
] | |
} | |
], | |
"source": [ | |
"dataset = 'rosalind_ba3d' # this is a test dataset\n", | |
"with open('Dataset_And_Answers\\\\'+dataset+'.txt') as file:\n", | |
" data = [line.rstrip() for line in file]\n", | |
"\n", | |
"kmer_size = int(data[0])\n", | |
"Genome = data[1]\n", | |
"small_reads = [Genome[index:(index+kmer_size-1)] for index in range(len(Genome)-kmer_size+2)]\n", | |
"\n", | |
"output = DeBruijn(small_reads)\n", | |
"\n", | |
"# storing the final output into a file to upload it to rosalind\n", | |
"with open('Dataset_And_Answers\\\\'+dataset+'_answer.txt','w') as file: \n", | |
" for line in output:\n", | |
" file.write(line+'\\n')\n", | |
"\n", | |
"print('Output Generated')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "BioPython", | |
"language": "python", | |
"name": "biopython" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment