sainikhileshreddy · September 13, 2020 15:13
diff --git a/(3D)Construct_the_de_Bruijn_Graph_of_a_String.ipynb b/(3D)Construct_the_de_Bruijn_Graph_of_a_String.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# (3D) Construct the de Bruijn Graph of a String"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Reference : http://rosalind.info/problems/ba3d/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Input  : An integer k and a string Text\n",
    "#### Output : The de Bruijn graph of Text, as an adjacency list of (k-1)-mers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "def PatternFinder(reads, pattern):\n",
    "    \"\"\"Return the distinct reads that begin with `pattern`, in sorted order.\n",
    "\n",
    "    reads   : iterable of strings to search.\n",
    "    pattern : prefix each returned read must start with.\n",
    "    \"\"\"\n",
    "    # The set comprehension collapses repeated reads before sorting.\n",
    "    matches = {candidate for candidate in reads if candidate.startswith(pattern)}\n",
    "    ordered = list(matches)\n",
    "    ordered.sort()\n",
    "    return ordered"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def DeBruijn(small_reads):\n",
    "    \"\"\"Build the de Bruijn adjacency list from the ordered (k-1)-mers of a text.\n",
    "\n",
    "    small_reads : list of the text's (k-1)-mers in the order they occur, so\n",
    "        each adjacent pair is the prefix and suffix of one k-mer of the text.\n",
    "\n",
    "    Returns a list of 'NODE -> SUCC1,SUCC2' strings: nodes sorted, successors\n",
    "    sorted, and repeated edges kept (one per k-mer occurrence).\n",
    "    \"\"\"\n",
    "    data = {}\n",
    "    # Only consecutive (k-1)-mers form an edge. Matching on overlap alone would\n",
    "    # link nodes that never follow each other in the text, and collecting the\n",
    "    # successors in a set would drop parallel edges from repeated k-mers.\n",
    "    for prefix, suffix in zip(small_reads, small_reads[1:]):\n",
    "        data.setdefault(prefix, []).append(suffix)\n",
    "\n",
    "    return [key + \" -> \" + ','.join(sorted(data[key])) for key in sorted(data)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download the sample dataset from this [link](http://bioinformaticsalgorithms.com/data/extradatasets/assembly/De_Bruijn_Graph_from_a_String.txt)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Base name (the '.txt' extension is added when the file is opened) of the\n",
    "# sample dataset used to check the implementation before the real test run.\n",
    "dataset = '(3D)_De_Bruijn_Graph_from_a_String'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Its not matching to original output.\n",
      "[231] =====\n",
      "Output        : ATGTCTACCGT -> TGTCTACCGTA,TGTCTACCGTC\n",
      "Actual_Output : ATGTCTACCGT -> TGTCTACCGTA\n",
      "[331] =====\n",
      "Output        : CCACATTACTA -> CACATTACTAC\n",
      "Actual_Output : CCACATTACTA -> CACATTACTAC,CACATTACTAC\n",
      "[718] =====\n",
      "Output        : GTGTCTACCGT -> TGTCTACCGTA,TGTCTACCGTC\n",
      "Actual_Output : GTGTCTACCGT -> TGTCTACCGTC\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "# Build the path with os.path.join so the cell also runs on non-Windows systems\n",
    "# (a hard-coded backslash separator only resolves on Windows).\n",
    "sample_path = os.path.join('Dataset_And_Answers', dataset + '.txt')\n",
    "with open(sample_path) as file:\n",
    "    data = [line.rstrip() for line in file]\n",
    "\n",
    "# Lines 1, 2 and 4+ are used below; lines 0 and 3 are skipped\n",
    "# (presumably 'Input'/'Output' header lines - confirm against the dataset).\n",
    "kmer_size = int(data[1])\n",
    "Genome = data[2]\n",
    "# All (k-1)-mers of Genome, in order of occurrence.\n",
    "small_reads = [Genome[index:(index+kmer_size-1)] for index in range(len(Genome)-kmer_size+2)]\n",
    "# Ignore blank lines so trailing newlines in the answer file cannot cause a mismatch.\n",
    "Actual_Output = [line for line in data[4:] if line]\n",
    "\n",
    "output = DeBruijn(small_reads)\n",
    "\n",
    "if output == Actual_Output:\n",
    "    print('Output matches and program works.')\n",
    "else:\n",
    "    print('Its not matching to original output.')\n",
    "    # zip() stops at the shorter list, so report a length difference explicitly.\n",
    "    if len(output) != len(Actual_Output):\n",
    "        print(f'Line count differs: got {len(output)}, expected {len(Actual_Output)}')\n",
    "    for (index,(a,b)) in enumerate(zip(output,Actual_Output),0):\n",
    "        if a != b:\n",
    "            print(f'[{index}] =====')\n",
    "            print('Output        :',a)\n",
    "            print('Actual_Output :',b)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Working with the test dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Output Generated\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "dataset = 'rosalind_ba3d' # this is a test dataset\n",
    "# os.path.join keeps both paths valid on every OS instead of hard-coding a backslash.\n",
    "input_path = os.path.join('Dataset_And_Answers', dataset + '.txt')\n",
    "answer_path = os.path.join('Dataset_And_Answers', dataset + '_answer.txt')\n",
    "\n",
    "with open(input_path) as file:\n",
    "    data = [line.rstrip() for line in file]\n",
    "\n",
    "# Here k is read from the first line and the text from the second.\n",
    "kmer_size = int(data[0])\n",
    "Genome = data[1]\n",
    "# All (k-1)-mers of Genome, in order of occurrence.\n",
    "small_reads = [Genome[index:(index+kmer_size-1)] for index in range(len(Genome)-kmer_size+2)]\n",
    "\n",
    "output = DeBruijn(small_reads)\n",
    "\n",
    "# storing the final output into a file to upload it to rosalind\n",
    "with open(answer_path, 'w') as file:\n",
    "    file.writelines(line + '\\n' for line in output)\n",
    "\n",
    "print('Output Generated')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "BioPython",
   "language": "python",
   "name": "biopython"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# (3D) Construct the de Bruijn Graph of a String"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Reference : http://rosalind.info/problems/ba3d/"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Input : An integer k and a string Text\n",
	"#### Output : The de Bruijn graph of Text, as an adjacency list of (k-1)-mers"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"def PatternFinder(reads, pattern):\n",
	"    \"\"\"Return the distinct reads that start with `pattern`, sorted ascending.\n",
	"\n",
	"    reads   : iterable of strings to search.\n",
	"    pattern : prefix each returned read must start with.\n",
	"    \"\"\"\n",
	"    # Block indentation restored: the loop and its body must be nested for the\n",
	"    # cell to parse.\n",
	"    output = set()\n",
	"    for read in reads:\n",
	"        if read.startswith(pattern):\n",
	"            output.add(read)\n",
	"    return sorted(output)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"def DeBruijn(small_reads):\n",
	"    \"\"\"Build the de Bruijn adjacency list from the ordered (k-1)-mers of a text.\n",
	"\n",
	"    small_reads : list of the text's (k-1)-mers in the order they occur, so\n",
	"        each adjacent pair is the prefix and suffix of one k-mer of the text.\n",
	"\n",
	"    Returns a list of 'NODE -> SUCC1,SUCC2' strings: nodes sorted, successors\n",
	"    sorted, and repeated edges kept (one per k-mer occurrence).\n",
	"    \"\"\"\n",
	"    data = {}\n",
	"    # Only consecutive (k-1)-mers form an edge. Matching on overlap alone would\n",
	"    # link nodes that never follow each other in the text, and collecting the\n",
	"    # successors in a set would drop parallel edges from repeated k-mers.\n",
	"    for prefix, suffix in zip(small_reads, small_reads[1:]):\n",
	"        data.setdefault(prefix, []).append(suffix)\n",
	"\n",
	"    return [key + \" -> \" + ','.join(sorted(data[key])) for key in sorted(data)]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Download the sample dataset from this [link](http://bioinformaticsalgorithms.com/data/extradatasets/assembly/De_Bruijn_Graph_from_a_String.txt)."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Base name (the '.txt' extension is added when the file is opened) of the\n",
	"# sample dataset used to check the implementation before the real test run.\n",
	"dataset = '(3D)_De_Bruijn_Graph_from_a_String'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Its not matching to original output.\n",
	"[231] =====\n",
	"Output : ATGTCTACCGT -> TGTCTACCGTA,TGTCTACCGTC\n",
	"Actual_Output : ATGTCTACCGT -> TGTCTACCGTA\n",
	"[331] =====\n",
	"Output : CCACATTACTA -> CACATTACTAC\n",
	"Actual_Output : CCACATTACTA -> CACATTACTAC,CACATTACTAC\n",
	"[718] =====\n",
	"Output : GTGTCTACCGT -> TGTCTACCGTA,TGTCTACCGTC\n",
	"Actual_Output : GTGTCTACCGT -> TGTCTACCGTC\n"
	]
	}
	],
	"source": [
	"import os\n",
	"\n",
	"# Build the path with os.path.join so the cell also runs on non-Windows systems\n",
	"# (a hard-coded backslash separator only resolves on Windows).\n",
	"sample_path = os.path.join('Dataset_And_Answers', dataset + '.txt')\n",
	"with open(sample_path) as file:\n",
	"    data = [line.rstrip() for line in file]\n",
	"\n",
	"# Lines 1, 2 and 4+ are used below; lines 0 and 3 are skipped\n",
	"# (presumably 'Input'/'Output' header lines - confirm against the dataset).\n",
	"kmer_size = int(data[1])\n",
	"Genome = data[2]\n",
	"# All (k-1)-mers of Genome, in order of occurrence.\n",
	"small_reads = [Genome[index:(index+kmer_size-1)] for index in range(len(Genome)-kmer_size+2)]\n",
	"# Ignore blank lines so trailing newlines in the answer file cannot cause a mismatch.\n",
	"Actual_Output = [line for line in data[4:] if line]\n",
	"\n",
	"output = DeBruijn(small_reads)\n",
	"\n",
	"if output == Actual_Output:\n",
	"    print('Output matches and program works.')\n",
	"else:\n",
	"    print('Its not matching to original output.')\n",
	"    # zip() stops at the shorter list, so report a length difference explicitly.\n",
	"    if len(output) != len(Actual_Output):\n",
	"        print(f'Line count differs: got {len(output)}, expected {len(Actual_Output)}')\n",
	"    for (index,(a,b)) in enumerate(zip(output,Actual_Output),0):\n",
	"        if a != b:\n",
	"            print(f'[{index}] =====')\n",
	"            print('Output :',a)\n",
	"            print('Actual_Output :',b)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Working with the test dataset."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Output Generated\n"
	]
	}
	],
	"source": [
	"import os\n",
	"\n",
	"dataset = 'rosalind_ba3d' # this is a test dataset\n",
	"# os.path.join keeps both paths valid on every OS instead of hard-coding a backslash.\n",
	"input_path = os.path.join('Dataset_And_Answers', dataset + '.txt')\n",
	"answer_path = os.path.join('Dataset_And_Answers', dataset + '_answer.txt')\n",
	"\n",
	"with open(input_path) as file:\n",
	"    data = [line.rstrip() for line in file]\n",
	"\n",
	"# Here k is read from the first line and the text from the second.\n",
	"kmer_size = int(data[0])\n",
	"Genome = data[1]\n",
	"# All (k-1)-mers of Genome, in order of occurrence.\n",
	"small_reads = [Genome[index:(index+kmer_size-1)] for index in range(len(Genome)-kmer_size+2)]\n",
	"\n",
	"output = DeBruijn(small_reads)\n",
	"\n",
	"# storing the final output into a file to upload it to rosalind\n",
	"with open(answer_path, 'w') as file:\n",
	"    file.writelines(line + '\\n' for line in output)\n",
	"\n",
	"print('Output Generated')"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "BioPython",
	"language": "python",
	"name": "biopython"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.7"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}