Created
September 20, 2021 22:02
-
-
Save simecek/8ba4163547c929c53a509d975b85936e to your computer and use it in GitHub Desktop.
Parsing FASTA with BioPython
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import gzip\n", | |
"import re\n", | |
"from tqdm.notebook import tqdm\n", | |
"\n", | |
"from Bio import SeqIO" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Get list of intervals\n", | |
"\n", | |
"CSV tables with intervals can be found at https://github.com/ML-Bioinfo-CEITEC/genomic_benchmarks/tree/main/datasets/demo_coding_vs_intergenomic_seqs/train" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>region</th>\n", | |
" <th>start</th>\n", | |
" <th>end</th>\n", | |
" <th>strand</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>27434</td>\n", | |
" <td>chr2</td>\n", | |
" <td>56478034</td>\n", | |
" <td>56478234</td>\n", | |
" <td>+</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>13400</td>\n", | |
" <td>chr21</td>\n", | |
" <td>20427531</td>\n", | |
" <td>20427731</td>\n", | |
" <td>+</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>883</td>\n", | |
" <td>chr17</td>\n", | |
" <td>73975959</td>\n", | |
" <td>73976159</td>\n", | |
" <td>+</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>7303</td>\n", | |
" <td>chr13</td>\n", | |
" <td>45730273</td>\n", | |
" <td>45730473</td>\n", | |
" <td>+</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>45124</td>\n", | |
" <td>chr10</td>\n", | |
" <td>38820750</td>\n", | |
" <td>38820950</td>\n", | |
" <td>+</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id region start end strand\n", | |
"0 27434 chr2 56478034 56478234 +\n", | |
"1 13400 chr21 20427531 20427731 +\n", | |
"2 883 chr17 73975959 73976159 +\n", | |
"3 7303 chr13 45730273 45730473 +\n", | |
"4 45124 chr10 38820750 38820950 +" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.read_csv(\"../../datasets/demo_coding_vs_intergenomic_seqs/train/intergenomic_seqs.csv\")\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>region</th>\n", | |
" <th>start</th>\n", | |
" <th>end</th>\n", | |
" <th>strand</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>27434</td>\n", | |
" <td>ENST00000217185</td>\n", | |
" <td>203</td>\n", | |
" <td>403</td>\n", | |
" <td>+</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>13400</td>\n", | |
" <td>ENST00000367051</td>\n", | |
" <td>4701</td>\n", | |
" <td>4901</td>\n", | |
" <td>+</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>883</td>\n", | |
" <td>ENST00000360507</td>\n", | |
" <td>623</td>\n", | |
" <td>823</td>\n", | |
" <td>+</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>7303</td>\n", | |
" <td>ENST00000489432</td>\n", | |
" <td>1469</td>\n", | |
" <td>1669</td>\n", | |
" <td>+</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>45124</td>\n", | |
" <td>ENST00000309668</td>\n", | |
" <td>64</td>\n", | |
" <td>264</td>\n", | |
" <td>+</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id region start end strand\n", | |
"0 27434 ENST00000217185 203 403 +\n", | |
"1 13400 ENST00000367051 4701 4901 +\n", | |
"2 883 ENST00000360507 623 823 +\n", | |
"3 7303 ENST00000489432 1469 1669 +\n", | |
"4 45124 ENST00000309668 64 264 +" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df2 = pd.read_csv(\"../../datasets/demo_coding_vs_intergenomic_seqs/train/coding_seqs.csv\")\n", | |
"df2.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Get genome and transcriptome\n", | |
"\n", | |
"We are using release 97 of Ensembl (Human): http://ftp.ensembl.org/pub/release-104/fasta/homo_sapiens/" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"GENOME_PATH = \"../../refs/Homo_sapiens.GRCh38.dna.toplevel.fa.gz\"\n", | |
"TRANSCRIPTOME_PATH = \"../../refs/Homo_sapiens.GRCh38.cdna.all.fa.gz\"\n", | |
"\n", | |
"def fasta2dict(fasta_past, fasta_total=None, stop_id=None, region_name_transform=lambda x: x):\n", | |
" fasta = dict()\n", | |
" \n", | |
" with gzip.open(fasta_past, \"rt\") as handle:\n", | |
" for record in tqdm(SeqIO.parse(handle, \"fasta\"), total=fasta_total):\n", | |
" fasta[region_name_transform(record.id)] = str(record.seq)\n", | |
" \n", | |
" if stop_id and (record.id == stop_id): \n", | |
" # stop, do not read small contigs\n", | |
" break\n", | |
" \n", | |
" return fasta" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "9c7500bfb47145d29df2dc861d4e93f5", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/190000 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 6.44 s, sys: 137 ms, total: 6.58 s\n", | |
"Wall time: 6.54 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"transcriptome = fasta2dict(TRANSCRIPTOME_PATH, 190_000, region_name_transform=lambda x: re.sub(\"ENST([0-9]*)[.][0-9]*\", \"ENST\\\\1\", x))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "4b0dac41d80d4b60bdf2654209072298", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/24 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 35 s, sys: 3.48 s, total: 38.5 s\n", | |
"Wall time: 38.5 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"genome = fasta2dict(GENOME_PATH, 24, \"MT\", region_name_transform=lambda x: \"chr\"+x)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# all interval regions found?\n", | |
"all([region in genome for region in df['region']])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"all([t in transcriptome for t in df2['region']])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Find sequences" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def seq_finder(ref, region, start, end):\n", | |
" return ref[region][start:end]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'AGAAATTTGTCTATTTCATCTAAGTTATCAAATTTCTTGTTGTACAGTTTATAGTACTCCTTTTTAATCCTTTTTATTTCCATGAGATCAAAAATGACATCTTTCTTTCACTTCTGACTTTAGTAATATGCATTCTCTCTCTCTCTTATTAGTCAGTTTAGGTAAAGGTTTTTCAATTTTGCTGACCTTTTCAAAGATCT'" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"seq_finder(genome, df['region'].iloc[0], df['start'].iloc[0], df['end'].iloc[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def seq_finder_tab(ref, tab):\n", | |
" return pd.Series([ref[region][start:end] for region, start, end in zip(tab['region'], tab['start'], tab['end'])])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 28.3 ms, sys: 4.03 ms, total: 32.3 ms\n", | |
"Wall time: 31.4 ms\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>region</th>\n", | |
" <th>start</th>\n", | |
" <th>end</th>\n", | |
" <th>strand</th>\n", | |
" <th>seq</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>27434</td>\n", | |
" <td>chr2</td>\n", | |
" <td>56478034</td>\n", | |
" <td>56478234</td>\n", | |
" <td>+</td>\n", | |
" <td>AGAAATTTGTCTATTTCATCTAAGTTATCAAATTTCTTGTTGTACA...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>13400</td>\n", | |
" <td>chr21</td>\n", | |
" <td>20427531</td>\n", | |
" <td>20427731</td>\n", | |
" <td>+</td>\n", | |
" <td>GAGAGAGGAAGCAAGAGAGAGACGCAAATGCTGCCAGAACTTTTTA...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>883</td>\n", | |
" <td>chr17</td>\n", | |
" <td>73975959</td>\n", | |
" <td>73976159</td>\n", | |
" <td>+</td>\n", | |
" <td>TCCTTCCCAAGAGGTCTTCTGTGAGCCCACCCAACGAAGTGATGGT...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>7303</td>\n", | |
" <td>chr13</td>\n", | |
" <td>45730273</td>\n", | |
" <td>45730473</td>\n", | |
" <td>+</td>\n", | |
" <td>CTCAGGCTGCTTCTCCAAGTCAGTGCAGAAGTTAAGGGGAGCACCT...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>45124</td>\n", | |
" <td>chr10</td>\n", | |
" <td>38820750</td>\n", | |
" <td>38820950</td>\n", | |
" <td>+</td>\n", | |
" <td>ATAGTATGGAATGGAATCGAATGGAATTGAATCGAATAGAATTGGC...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id region start end strand \\\n", | |
"0 27434 chr2 56478034 56478234 + \n", | |
"1 13400 chr21 20427531 20427731 + \n", | |
"2 883 chr17 73975959 73976159 + \n", | |
"3 7303 chr13 45730273 45730473 + \n", | |
"4 45124 chr10 38820750 38820950 + \n", | |
"\n", | |
" seq \n", | |
"0 AGAAATTTGTCTATTTCATCTAAGTTATCAAATTTCTTGTTGTACA... \n", | |
"1 GAGAGAGGAAGCAAGAGAGAGACGCAAATGCTGCCAGAACTTTTTA... \n", | |
"2 TCCTTCCCAAGAGGTCTTCTGTGAGCCCACCCAACGAAGTGATGGT... \n", | |
"3 CTCAGGCTGCTTCTCCAAGTCAGTGCAGAAGTTAAGGGGAGCACCT... \n", | |
"4 ATAGTATGGAATGGAATCGAATGGAATTGAATCGAATAGAATTGGC... " | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"df['seq'] = seq_finder_tab(genome, df)\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 46.6 ms, sys: 276 µs, total: 46.9 ms\n", | |
"Wall time: 45.8 ms\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>region</th>\n", | |
" <th>start</th>\n", | |
" <th>end</th>\n", | |
" <th>strand</th>\n", | |
" <th>seq</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>27434</td>\n", | |
" <td>ENST00000217185</td>\n", | |
" <td>203</td>\n", | |
" <td>403</td>\n", | |
" <td>+</td>\n", | |
" <td>CTGGACGAGGCGGGTGGGGCCGTGGCCCAGGGCTATGTGCCCCACA...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>13400</td>\n", | |
" <td>ENST00000367051</td>\n", | |
" <td>4701</td>\n", | |
" <td>4901</td>\n", | |
" <td>+</td>\n", | |
" <td>GCTTGTGGGAGAACGGTCAATATATTGCACCAGCAAAGATGATCAA...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>883</td>\n", | |
" <td>ENST00000360507</td>\n", | |
" <td>623</td>\n", | |
" <td>823</td>\n", | |
" <td>+</td>\n", | |
" <td>CTGACCCTCGGGAGATCTTTGAGGTGCTGAGCTGGTTGGAGAGCTG...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>7303</td>\n", | |
" <td>ENST00000489432</td>\n", | |
" <td>1469</td>\n", | |
" <td>1669</td>\n", | |
" <td>+</td>\n", | |
" <td>CTCCCTCTACAGCTGCGACGACTGCGGCAGGAGCTTCCGGCTGGAG...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>45124</td>\n", | |
" <td>ENST00000309668</td>\n", | |
" <td>64</td>\n", | |
" <td>264</td>\n", | |
" <td>+</td>\n", | |
" <td>CCCTGAGTCTGTATTGCTCAAGAAGGGCCTTCCCCAGCAATGACCT...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id region start end strand \\\n", | |
"0 27434 ENST00000217185 203 403 + \n", | |
"1 13400 ENST00000367051 4701 4901 + \n", | |
"2 883 ENST00000360507 623 823 + \n", | |
"3 7303 ENST00000489432 1469 1669 + \n", | |
"4 45124 ENST00000309668 64 264 + \n", | |
"\n", | |
" seq \n", | |
"0 CTGGACGAGGCGGGTGGGGCCGTGGCCCAGGGCTATGTGCCCCACA... \n", | |
"1 GCTTGTGGGAGAACGGTCAATATATTGCACCAGCAAAGATGATCAA... \n", | |
"2 CTGACCCTCGGGAGATCTTTGAGGTGCTGAGCTGGTTGGAGAGCTG... \n", | |
"3 CTCCCTCTACAGCTGCGACGACTGCGGCAGGAGCTTCCGGCTGGAG... \n", | |
"4 CCCTGAGTCTGTATTGCTCAAGAAGGGCCTTCCCCAGCAATGACCT... " | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"df2['seq'] = seq_finder_tab(transcriptome, df2)\n", | |
"df2.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment