Last active
July 24, 2017 05:31
-
-
Save olgabot/f167dd32fc21fd1ac3a98ad0cb7815a1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"/Users/olgabot/projects/dash\n" | |
] | |
} | |
], | |
"source": [ | |
"cd ~/projects/dash/" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"mm10-plus.tgz\r\n" | |
] | |
} | |
], | |
"source": [ | |
"ls" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"x mm10-plus/\n", | |
"x mm10-plus/mm10-plus.gtf\n", | |
"x mm10-plus/mm10-plus.fa\n" | |
] | |
} | |
], | |
"source": [ | |
"! tar xzvf mm10-plus.tgz" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"total 5605104\r\n", | |
"drwxr-xr-x 4 olgabot staff 136B Feb 16 16:48 \u001b[1m\u001b[36m.\u001b[m\u001b[m/\r\n", | |
"drwxr-xr-x 5 olgabot staff 170B Jul 23 20:20 \u001b[1m\u001b[36m..\u001b[m\u001b[m/\r\n", | |
"-rw-r----- 1 olgabot staff 2.6G Feb 16 15:14 mm10-plus.fa\r\n", | |
"-rw-r----- 1 olgabot staff 80M Feb 16 15:14 mm10-plus.gtf\r\n" | |
] | |
} | |
], | |
"source": [ | |
"ls -lha mm10-plus" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"ERCC-00002\tERCC\texon\t1\t1061\t0.000000\t+\t.\tgene_id \"ERCC-00002\"; transcript_id \"DQ459430\";\r\n", | |
"ERCC-00003\tERCC\texon\t1\t1023\t0.000000\t+\t.\tgene_id \"ERCC-00003\"; transcript_id \"DQ516784\";\r\n", | |
"ERCC-00004\tERCC\texon\t1\t523\t0.000000\t+\t.\tgene_id \"ERCC-00004\"; transcript_id \"DQ516752\";\r\n", | |
"ERCC-00009\tERCC\texon\t1\t984\t0.000000\t+\t.\tgene_id \"ERCC-00009\"; transcript_id \"DQ668364\";\r\n", | |
"ERCC-00012\tERCC\texon\t1\t994\t0.000000\t+\t.\tgene_id \"ERCC-00012\"; transcript_id \"DQ883670\";\r\n", | |
"ERCC-00013\tERCC\texon\t1\t808\t0.000000\t+\t.\tgene_id \"ERCC-00013\"; transcript_id \"EF011062\";\r\n", | |
"ERCC-00014\tERCC\texon\t1\t1957\t0.000000\t+\t.\tgene_id \"ERCC-00014\"; transcript_id \"DQ875385\";\r\n", | |
"ERCC-00016\tERCC\texon\t1\t844\t0.000000\t+\t.\tgene_id \"ERCC-00016\"; transcript_id \"DQ883664\";\r\n", | |
"ERCC-00017\tERCC\texon\t1\t1136\t0.000000\t+\t.\tgene_id \"ERCC-00017\"; transcript_id \"DQ459420\";\r\n", | |
"ERCC-00019\tERCC\texon\t1\t644\t0.000000\t+\t.\tgene_id \"ERCC-00019\"; transcript_id \"DQ883651\";\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! grep ERCC mm10-plus/mm10-plus.gtf | head" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"! grep ERCC mm10-plus/mm10-plus.gtf > ercc.gtf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"chr17\tunknown\texon\t39842997\t39848829\t.\t+\t.\tgene_id \"Rn45s\"; transcript_id \"NR_046233\"; gene_name \"Rn45s\"; tss_id \"TSS24678\";\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! grep Rn45s mm10-plus/mm10-plus.gtf | head" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"! grep Rn45s mm10-plus/mm10-plus.gtf > rn45s.gtf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" 595782 mm10-plus/mm10-plus.gtf\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! wc -l mm10-plus/mm10-plus.gtf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" 92 ercc.gtf\n", | |
" 595689 mm10-plus-without-rn45s-ercc.gtf\n", | |
" 1 rn45s.gtf\n", | |
" 595782 total\n" | |
] | |
} | |
], | |
"source": [ | |
"! wc -l *.gtf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" 595689\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! grep -v -E \"ERCC|Rn45s\" mm10-plus/mm10-plus.gtf | wc -l" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Check that all the lines add up to the original length" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"595782" | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"595689 + 1 + 92" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"! grep -v -E \"ERCC|Rn45s\" mm10-plus/mm10-plus.gtf > mm10-plus-without-rn45s-ercc.gtf" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Use `bedtools` to extract fasta sequences" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\r\n", | |
"Tool: bedtools getfasta (aka fastaFromBed)\r\n", | |
"Version: v2.26.0\r\n", | |
"Summary: Extract DNA sequences from a fasta file based on feature coordinates.\r\n", | |
"\r\n", | |
"Usage: bedtools getfasta [OPTIONS] -fi <fasta> -bed <bed/gff/vcf>\r\n", | |
"\r\n", | |
"Options: \r\n", | |
"\t-fi\tInput FASTA file\r\n", | |
"\t-bed\tBED/GFF/VCF file of ranges to extract from -fi\r\n", | |
"\t-name\tUse the name field for the FASTA header\r\n", | |
"\t-split\tgiven BED12 fmt., extract and concatenate the sequencesfrom the BED \"blocks\" (e.g., exons)\r\n", | |
"\t-tab\tWrite output in TAB delimited format.\r\n", | |
"\t\t- Default is FASTA format.\r\n", | |
"\r\n", | |
"\t-s\tForce strandedness. If the feature occupies the antisense,\r\n", | |
"\t\tstrand, the sequence will be reverse complemented.\r\n", | |
"\t\t- By default, strand information is ignored.\r\n", | |
"\r\n", | |
"\t-fullHeader\tUse full fasta header.\r\n", | |
"\t\t- By default, only the word before the first space or tab is used.\r\n", | |
"\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! bedtools getfasta" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
">exon::ERCC-00002:0-1061(+)\r\n", | |
"TCCAGATTACTTCCATTTCCGCCCAAGCTGCTCACAGTATACGGGCGTCGGCATCCAGACCGTCGGCTGATCGTGGTTTTACTAGGCTAGACTAGCGTACGAGCACTATGGTCAGTAATTCCTGGAGGAATAGGTACCAAGAAAAAAACGAACCTTTGGGTTCCAGAGCTGTACGGTCGCACTGAACTCGGATAGGTCTCAGAAAAACGAAATATAGGCTTACGGTAGGTCCGAATGGCACAAAGCTTGTTCCGTTAGCTGGCATAAGATTCCATGCCTAGATGTGATACACGTTTCTGGAAACTGCCTCGTCATGCGACTGTTCCCCGGGGTCAGGGCCGCTGGTATTTGCTGTAAAGAGGGGCGTTGAGTCCGTCCGACTTCACTGCCCCCTTTCAGCCTTTTGGGTCCTGTATCCCAATTCTCAGAGGTCCCGCCGTACGCTGAGGACCACCTGAAACGGGCATCGTCGCTCTTCGTTGTTCGTCGACTTCTAGTGTGGAGACGAATTGCCAGAATTATTAACTGCGCAGTTAGGGCAGCGTCTGAGGAAGTTTGCTGCGGTTTCGCCTTGACCGCGGGAAGGAGACATAACGATAGCGACTCTGTCTCAGGGGATCTGCATATGTTTGCAGCATACTTTAGGTGGGCCTTGGCTTCCTTCCGCAGTCAAAACCGCGCAATTATCCCCGTCCTGATTTACTGGACTCGCAACGTGGGTCCATCAGTTGTCCGTATACCAAGACGTCTAAGGGCGGTGTACACCCTTTTGAGCAATGATTGCACAACCTGCGATCACCTTATACAGAATTATCAATCAAGCTCCCCGAGGAGCGGACTTGTAAGGACCGCCGCTTTCGCTCGGGTCTGCGGGTTATAGCTTTTCAGTCTCGACGGGCTAGCACACATCTGGTTGACTAGGCGCATAGTCGCCATTCACAGATTTGCTCGGCAATCAGTACTGGTAGGCGTTAGACCCCGTGACTCGTGGCTGAACGGCCGTACAACTCGACAGCCGGTGCTTGCGTTTTACCCTTAAAAAAAAAAAAAAAAAAAAAAAA\r\n", | |
">exon::ERCC-00003:0-1023(+)\r\n", | |
"CAGCAGCGATTAAGGCAGAGGCGTTTGTATCTGCCATTATAAAGAAGTTTCCTCCAGCAACTCCTTTCTTAATTCCAAACTTAGCTTCAGTTATAAATTCCCCTCCCATGATTGGGATTTTATAAACTTTTCTTCCATATAATTCATCTTTCTTCTCATAACCGTCTCCGAAAAACTTCAACTTAAATCCAACCTTTAACTGCTCATCAGCCATGTCTCCCACAGCATCAAAAATAGCAGTTGTTGGACATGTTAAGACACACTGCCCCAATCTCTCTAACATTTGATGCTCTAACTCTGACTTTTTAGGGTGGCATATCTGTATTATAAATCCTGGTCTTCCATCTGGTGTTTTTGATGGAGGGACATATTTCTCAATTCCTGCTTCTGCTGGACACATTATAACTGAACAACCAAAACCTGTTGCCTCTGTAGCTGCAATCTTAGCCCACTTCTTTGTAGCTGCTGTTATTAAAACTCTTGAAACCCATATTGGGAATGCTTCTGCAAATGTATCTTCAATATATACTCCATTTATTTCCATAGTTTCCCTCCATTAAGATTTTAACAATTATAGTTTATCTTAGGGGCTATTAATATCTTATCATTTGGTTTTTAATATTCGATAAATCCATAAATAAAAATATATCAACAATAATTTTAAATAATCTAAGTATAGGTAATATAACAATTAAAAAGATTTAGAGGGATAGAATTGAACGGCATTAGGAGAATTGTTTTAGATATATTGAAGCCGCATGAGCCAAAAATAACAGATATGGCATTAAAATTAACATCATTATCAAACATTGATGGGGTTAATATTACAGTCTATGAAATAGATAAAGAGACTGAGAATGTTAAAGTTACAATTGAAGGGAATAATTTAGATTTTGATGAGATTCAGGAAATTATTGAAAGTTTGGGAGGGACTATTCACAGTATAGATGAGGTTGTTGCAGGTAAAAAGATTATTGAAGAGTTAGAACACCACAAGATAAAAAAAAAAAAAAAAAAAAAAAA\r\n", | |
">exon::ERCC-00004:0-523(+)\r\n", | |
"TCTTGCTTCAACAATAACGTCTCTTTCAGAAGGCATTGGTATCTTTTCCCCACTTCCAAGCATTTTTTCAACTAATCTTATGTTATTAACCATTTCCTTAAATTCTTCTGGGTCTGCTGACAAAGCATGATCAGGACCTTCCATATTTTTATCTAAGGTAAAGTGCTTCTCAATAACATCCGCTCCTAAGGCAACAGAAACTACTGGGGCGAGTATTCCCAATGTATGGTCAGAATATCCCACAGGGATATTGAATATACTTTTCAAGGTTTTAATAGCGTTTAAATTGACATCTTCATAAGGGGTTGGGTAAGATGAAATACAATGCAATAAAATAATATCCCTGCATCCATTATTTTCTAAAACTTTAACTGCTTCCCAAATTTCCCCAATATCAGACATTCCTGTAGATAAAATCACCGGCTTGCCTGTTTTTGCCACTTTTTCTAATAAGGGATAAAAGGTTAAATCACCAGAGGCAATTTTAAATCAGGCACATAAAAAAAAAAAAAAAAAAAAAAAA\r\n", | |
">exon::ERCC-00009:0-984(+)\r\n", | |
"CAATGATAGGCTAGTCTCGCGCAGTACATGGTAGTTCAGCCAATAGATGCCTAGTACGCTGACGGCATTCAGAGTACGCTGATCGGCTTATGACGTATGTGACGCAGCTCTTAGCGCAATGTATGTGCTGTTATCGAAGCCTATGGCTGAGTATGTAACGCTATGGCGTGCTAGTCGTCTCATATACGTCTGATGACCTCGTATCATGTTATAGGGCTGCGAACTGTCGATGATGGTCACGACTCTGTCGATAGCTGTGTGACTCATTCAGAAGGTGTGCAGCCTATATGATACGCAGTCGCATCCTATCTTACGTGTCAGTACTATGTGTGAGTGCTCCGCCCTAGTGCTGATGTATGCCCCATAGTGCTCAGTGGAGTCTCTCTTAGCATAGTGTCCGCTCATACATTAGATGGACGGCTCATTAGTATCATCGTCGGCTGATATAGGTCGTGGCTCCCTGTATATCGAGGTGAGTCTATCTGGATCAACGTCGCACTATGATGTGCAAAGTGTCGTCCATGTATAGACAGTGCGCGTATCATATAGGATGCGGCGATCTCATACAGCGTTACGGTCGCTGCGTACTGTATAAGGATGCTCTGTGAACTGTCATCGGTCCGATCAATTAGTCTAGTGTGCGTTATTCAGATCGAGTGAGTACATGATTCGTCAGTGTGGATCAATTACAGTTAGGCCGCTGACACATTAGTAACGTCGGCAAGCACTTAGTCGTGTCGTAAGCCAGTGTGTCGTGTCTTAGACGACTGTGTGTGATTCTCGAGCGATTTATACATCCGTGACAGCGCTTATAGTGTGCTGACAGACTGGTTGGTTATCCAATGATCGACCTGGAGTCTAATATCTGACCACGCCTTGTAATCGTATGACACGCGCTTGACACGACTGAATCCAGCTTAAGAGCCCTGCAACGCGATATACAGGCGCTGCTACCGATATAAAAAAAAAAAAAAAAAAAAAAAA\r\n", | |
">exon::ERCC-00012:0-994(+)\r\n", | |
"CGAGAGATGTTTGTAGGTGCGGAATGTGTGCGGTCTACCTTAGCTGTAGTGTGCGATGAACCTACACACAACGTGGTATAGTGGCCGATCTTAGAGTGATCCTATCACTCCTTACGCACCAGAAGGGATCTGCATACCAGGCGGAGAACTTGGAAGGCGGCTAGATCACTGAATTGCGGGAATCGGCATTTCGCATTCTTAGGATCTAAACCTTAGACCTCCGCGTGCGATTGCACCTGCTTGGTACAGAGTTACAAGCCCCCCGCACTTTCTTTGCGGTCGTTAAGAGGGAAATCGCCCAATTAGCAGAGTGTCAGGTGTTACGCGCGATTGAGCCGTCAGAAGAATCGATAGAGCCGCGTCGGGACCTTGATGGTATCTCTGCCTCAGCTAACCTGCTAGGTCCGTCCCCTGGGGATGATCAGGACTGCGGATAGTAAATTGCGGGTTTGAAGCCGGACTTGCCGCCTAGGCAAAGCACAAAAACATCGGACATGTAGAAGTCTCATCGAACTCCTTTCCCGTTCATGCAGATACTTCAACTGTGACTAGTGGGGTTCGGGAGCACCCGCACTACTTCATTCTTGGCGGTGGGCCACTTTATGTGACTGTACATGGGACTTCTACTCATACCAATGTAAAGTATAGTTAACGCCCTGTCCACTCTACTCAGGCGTAATCATCGCGGAAGGCTATCCACAGCCCATCAGCGGTCTACATGTCCCAGCAGATTCACCTGTCCTGCGGGTCCGCGTCACAGCCTATTCTGAGGCTCTAAAGACTATGCGAACCAGGTGTCCCAGTCGATCAGACGACGAAGTCGGGAAGGAAGCATGGATACCAAAAAGGCTTTATATACTGGGTTATCCTAGGGGATGTTTTTACCGGACTGGTCAGCCTCGGTGCGCTCGGCCTAGGCGCTTACTGCATGGGGGCTGTGGGCAATTTGGTATTTCTCAGGACTATGGACAAAAAAAAAAAAAAAAAAAAAAAA\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! bedtools getfasta -name -s -fi mm10-plus/mm10-plus.fa -bed ercc.gtf | head" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"! bedtools getfasta -name -s -fi mm10-plus/mm10-plus.fa -bed ercc.gtf > ercc.fa" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"! bedtools getfasta -name -s -fi mm10-plus/mm10-plus.fa -bed rn45s.gtf > rn45s.fa" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
">exon::chr1:3214481-3216968(-)\r\n", | |
"GTTTCACAGCAGCAGCCTCCCTTGTGTCCTTGGCTTGGGCCCTAGCCTCCTACCAGAAGGCTCTTCGGGACTCCCGAGATGACAAAAAGCCCATCAGCTACATGGCTGTCATCATTCAGTTCTGCTGGCATTTCTTCACCATCGCTGCCAGGGTCATCACATTCGCCCTCTTTGCCTCGGTTTTCCAGCTGTATTTTGGGATATTTATTGTCCTCCATTGGTGCATCATGACTTTCTGGATTGTCCACTGTGAGACAGAATTCTGTATCACCAAATGGGAAGAGATTGTGTTTGACATGGTGGTGGGCATCATCTACATCTTCAGTTGGTTCAATGTCAAGGAAGGCAGGACACGCTGCAGGCTGTTCATTTACTATTTTGTAATCCTTTTGGAAAATACAGCCTTGAGTGCACTCTGGTACCTCTACAAAGCTCCCCAGATTGCAGATGCATTTGCCATCCCTGCATTGTGCGTGGTTTTCAGCAGCTTTTTAACAGGTGTTGTTTTTATGCTGATGTACTATGCCTTCTTTCATCCCAATGGGCCCAGATTTGGGCAATCACCAAGTTGTGCTTGTGATGATCCAGCCACTGCCTTCTCTCTGCCTCCAGAAGTAGCCACAAGCACACTACGGTCCATCTCCAACAACCGCAGTGTTGCCAGTGACCGTGATCAGAAATTTGCAGAGCGGGATGGATGTGTACCTGTGTTTCAAGTGAGACCAACTGCACCACCCACCCCATCATCTCGACCACCACGGATTGAAGAATCAGTCATTAAAATTGACCTGTTCAGGAATAGATATCCAGCATGGGAGAGACATGTGTTAGATCGAAGCCTGAGAAAGGCCATTTTAGCCTTTGAATGTTCCCCATCTCCTCCAAGGCTGCAGTACAAGGATGATGCCCTTATTCAGGAGAGGCTGGAATATGAAACCACTTTATAAAATACAAGGAGCCGCAATGTCCACATGAAGGGGTAACAGCAGGGCTGTGGCAATAATGACACCTTATCCAAGAGTAGGGCAGCGAGCTGTATGTTCTTAGTTGTGGTATGGTTTGATCTTCCATCAGCTGACTGCCTGCTGCTGGTGTCTATTCAAGCCAGCAGTGCTGAGAGTCTCTTACACTGTCAGCTTAATATGACTGTTGCTACAAACTCCTCCAGCAGAGATTTGGGGCACATTCACTGGAGGATAACATTATTGTGAAAAATGTTGCCTCTAATCATTAGGGTATTTTGATGGGTTTTACTAAGTTTTGCATAAATATATTCACACACCACCATACCACCCCTCAATCAAAGGAGTTAAGGTggggatggagagatgactcattagttaagagcactgactgctcttgcaaaggacccaggcttgagtagttcactgcaactctaattccagaagatctaatgtccatttttggcctcctcaagcactgcacacacatggtgcatagacatatatgcaggcaaaatacccatacacatagcataaaaataaaTCTCAAAGAAAAAAAGCTTAGGTGATTTCCTTGATGCAAAGCTCACAACATACTCCAGGAAGAAAGCAGCATACTTGGGACAATTATATAAACTGTTCTCTCCTTTGCAAACCAGTAGCATCAATGAAGTGGACAGCAAGACTCAAGTGTTTACACTCGTACTAACTAGCTTTGATGGGATGATTCTTTTTCTACATATTTCAGGATTTGTTTTTACTTTTAGGTTTTGCAGATGAGAACATTCTTCATGACAGAAATCCTATGCAGCACTTATATGGCTTTTGATGAGACCAAGGAGCTCAATATCTGTAATGTAAATTAAATGCTAATCATAATTCAGTATTCAGTTGCAAAAATACAATATATAAAAAGAGTCTTTGGGGAAGGGACAGAGTGAGATTCAGATTCTCAGGTGTGTGCATCTTATATTGGAATGCACCCACAGAGCCACAGGAGAGGAACAGGGACTATTTCAAGGTCTGTGTTCATGTCTGTTTCCAGAACTGTTTCCAGGTGCAGAATGACATGGGTCAGCAGGTATGATTCCGGAAACCACGTGCCACATCTTTCGAGTGCCAAATTTTGTCCAATTACAGAACTGATATGGAATCCCCAAAATCTGAGAATAAGTGGTTTCCCAAAACAGACAAAAGAAGAATAATCAGGTTCCCTGCTGTGTACAGACTTACCCTCTTCCCATCCAAGGTCAAAATGATGTGTCTACTAGAGACTTTGGGACACAATTTAGCAAGTGAGAGCATACAGATGCAATGTGTATGCCATTAAAAATACTGCCTGGACTGCTTGAGGGCTTACCACTCCATCAGCTAAGATTTGTATTTGAATCATCTGTAAATTCGTGCTCTTACAAGCTTCTGAGTTTTAAATACCTCCACACAGCAAGTAAACATTCCCGCTTTCTGTTTTCGGTGTCCTTGGTCATGGTGCTTTTTGTTGCATTAAAAGTGCCGGTCAAACTTT\r\n", | |
">stop_codon::chr1:3216021-3216024(-)\r\n", | |
"TAA\r\n", | |
">CDS::chr1:3216024-3216968(-)\r\n", | |
"GTTTCACAGCAGCAGCCTCCCTTGTGTCCTTGGCTTGGGCCCTAGCCTCCTACCAGAAGGCTCTTCGGGACTCCCGAGATGACAAAAAGCCCATCAGCTACATGGCTGTCATCATTCAGTTCTGCTGGCATTTCTTCACCATCGCTGCCAGGGTCATCACATTCGCCCTCTTTGCCTCGGTTTTCCAGCTGTATTTTGGGATATTTATTGTCCTCCATTGGTGCATCATGACTTTCTGGATTGTCCACTGTGAGACAGAATTCTGTATCACCAAATGGGAAGAGATTGTGTTTGACATGGTGGTGGGCATCATCTACATCTTCAGTTGGTTCAATGTCAAGGAAGGCAGGACACGCTGCAGGCTGTTCATTTACTATTTTGTAATCCTTTTGGAAAATACAGCCTTGAGTGCACTCTGGTACCTCTACAAAGCTCCCCAGATTGCAGATGCATTTGCCATCCCTGCATTGTGCGTGGTTTTCAGCAGCTTTTTAACAGGTGTTGTTTTTATGCTGATGTACTATGCCTTCTTTCATCCCAATGGGCCCAGATTTGGGCAATCACCAAGTTGTGCTTGTGATGATCCAGCCACTGCCTTCTCTCTGCCTCCAGAAGTAGCCACAAGCACACTACGGTCCATCTCCAACAACCGCAGTGTTGCCAGTGACCGTGATCAGAAATTTGCAGAGCGGGATGGATGTGTACCTGTGTTTCAAGTGAGACCAACTGCACCACCCACCCCATCATCTCGACCACCACGGATTGAAGAATCAGTCATTAAAATTGACCTGTTCAGGAATAGATATCCAGCATGGGAGAGACATGTGTTAGATCGAAGCCTGAGAAAGGCCATTTTAGCCTTTGAATGTTCCCCATCTCCTCCAAGGCTGCAGTACAAGGATGATGCCCTTATTCAGGAGAGGCTGGAATATGAAACCACTTTA\r\n", | |
">CDS::chr1:3421701-3421901(-)\r\n", | |
"GTATTTGCACACAATATACTTAGGTATCCGGAGCCGGCAGAGTGGGGAGAGCGGCAGGTGGCGGTTTTACTGGAAGATGGTGTACGAGTATGCAGATGTGAGCATGCTGCATCTGCTAGCCACTTTTCTGGAAAGTGCTCCACAATTGGTCCTGCAGCTCTGCATTATTGTACAGACTCACAGCTTACAGGCCCTCCAAG\r\n", | |
">exon::chr1:3421701-3421901(-)\r\n", | |
"GTATTTGCACACAATATACTTAGGTATCCGGAGCCGGCAGAGTGGGGAGAGCGGCAGGTGGCGGTTTTACTGGAAGATGGTGTACGAGTATGCAGATGTGAGCATGCTGCATCTGCTAGCCACTTTTCTGGAAAGTGCTCCACAATTGGTCCTGCAGCTCTGCATTATTGTACAGACTCACAGCTTACAGGCCCTCCAAG\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! bedtools getfasta -name -fullHeader -s -fi mm10-plus/mm10-plus.fa -bed mm10-plus-without-rn45s-ercc.gtf | head" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Okay that's not working ... time to try something else\n", | |
"\n", | |
"All the CDS/exons aren't glued together as transcripts but they need to be glued together as transcripts" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Use `gffread` from cufflinks\n", | |
"\n", | |
"Outside of the notebook, installed `cufflinks` as suggested [here](http://onetipperday.sterding.com/2013/01/quick-ways-to-extract-mrnatranscript.html):\n", | |
"\n", | |
"```\n", | |
"➜ ~ conda create --yes -n cufflinks cufflinks\n", | |
"Fetching package metadata ...............\n", | |
"Solving package specifications: .\n", | |
"\n", | |
"Package plan for installation in environment /Users/olgabot/anaconda3/envs/cufflinks:\n", | |
"\n", | |
"The following NEW packages will be INSTALLED:\n", | |
"\n", | |
" cufflinks: 2.2.1-py36_1 bioconda\n", | |
" openssl: 1.0.2l-0 \n", | |
" pip: 9.0.1-py36_1 \n", | |
" python: 3.6.2-0 \n", | |
" readline: 6.2-2 \n", | |
" setuptools: 27.2.0-py36_0 \n", | |
" sqlite: 3.13.0-0 \n", | |
" tk: 8.5.18-0 \n", | |
" wheel: 0.29.0-py36_0 \n", | |
" xz: 5.2.2-1 \n", | |
" zlib: 1.2.8-3 \n", | |
"\n", | |
"python-3.6.2-0 100% |###########################################################################################################################################################################################| Time: 0:00:00 13.28 MB/s\n", | |
"cufflinks-2.2. 100% |###########################################################################################################################################################################################| Time: 0:00:01 3.55 MB/s\n", | |
"#\n", | |
"# To activate this environment, use:\n", | |
"# > source activate cufflinks\n", | |
"#\n", | |
"# To deactivate an active environment, use:\n", | |
"# > source deactivate\n", | |
"#\n", | |
"\n", | |
"➜ ~ source activate cufflinks\n", | |
"(cufflinks) ➜ ~ cd projects/dash \n", | |
"```\n", | |
"\n", | |
"Get help for `gffutils -w` which will write the spliced exons to a file:\n", | |
"\n", | |
"```\n", | |
"(cufflinks) ➜ ~ gffread -w\n", | |
"Usage:\n", | |
"gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n", | |
" [-o <outfile.gff>] [-t <tname>] [-r [[<strand>]<chr>:]<start>..<end> [-R]]\n", | |
" [-CTVNJMKQAFGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>]\n", | |
" [-i <maxintron>] \n", | |
" Filters and/or converts GFF3/GTF2 records.\n", | |
" <input_gff> is a GFF file, use '-' if the GFF records will be given at stdin\n", | |
" \n", | |
" Options:\n", | |
" -g full path to a multi-fasta file with the genomic sequences\n", | |
" for all input mappings, OR a directory with single-fasta files\n", | |
" (one per genomic sequence, with file names matching sequence names)\n", | |
" -s <seq_info.fsize> is a tab-delimited file providing this info\n", | |
" for each of the mapped sequences:\n", | |
" <seq-name> <seq-length> <seq-description>\n", | |
" (useful for -A option with mRNA/EST/protein mappings)\n", | |
" -i discard transcripts having an intron larger than <maxintron>\n", | |
" -r only show transcripts overlapping coordinate range <start>..<end>\n", | |
" (on chromosome/contig <chr>, strand <strand> if provided)\n", | |
" -R for -r option, discard all transcripts that are not fully \n", | |
" contained within the given range\n", | |
" -U discard single-exon transcripts\n", | |
" -C coding only: discard mRNAs that have no CDS feature\n", | |
" -F full GFF attribute preservation (all attributes are shown)\n", | |
" -G only parse additional exon attributes from the first exon\n", | |
" and move them to the mRNA level (useful for GTF input)\n", | |
" -A use the description field from <seq_info.fsize> and add it\n", | |
" as the value for a 'descr' attribute to the GFF record\n", | |
" \n", | |
" -O process also non-transcript GFF records (by default non-transcript\n", | |
" records are ignored)\n", | |
" -V discard any mRNAs with CDS having in-frame stop codons\n", | |
" -H for -V option, check and adjust the starting CDS phase\n", | |
" if the original phase leads to a translation with an \n", | |
" in-frame stop codon\n", | |
" -B for -V option, single-exon transcripts are also checked on the\n", | |
" opposite strand\n", | |
" -N discard multi-exon mRNAs that have any intron with a non-canonical\n", | |
" splice site consensus (i.e. not GT-AG, GC-AG or AT-AC)\n", | |
" -J discard any mRNAs that either lack initial START codon\n", | |
" or the terminal STOP codon, or have an in-frame stop codon\n", | |
" (only print mRNAs with a fulll, valid CDS)\n", | |
" --no-pseudo: filter out records matching the 'pseudo' keyword\n", | |
" \n", | |
" -M/--merge : cluster the input transcripts into loci, collapsing matching\n", | |
" transcripts (those with the same exact introns and fully contained)\n", | |
" -d <dupinfo> : for -M option, write collapsing info to file <dupinfo>\n", | |
" --cluster-only: same as --merge but without collapsing matching transcripts\n", | |
" -K for -M option: also collapse shorter, fully contained transcripts\n", | |
" with fewer introns than the container\n", | |
" -Q for -M option, remove the containment restriction:\n", | |
" (multi-exon transcripts will be collapsed if just their introns match,\n", | |
" while single-exon transcripts can partially overlap (80%))\n", | |
" \n", | |
" --force-exons: make sure that the lowest level GFF features are printed as \n", | |
" \"exon\" features\n", | |
" -E expose (warn about) duplicate transcript IDs and other potential \n", | |
" problems with the given GFF/GTF records\n", | |
" -D decode url encoded characters within attributes\n", | |
" -Z merge close exons into a single exon (for intron size<4)\n", | |
" -w write a fasta file with spliced exons for each GFF transcript\n", | |
" -x write a fasta file with spliced CDS for each GFF transcript\n", | |
" -W for -w and -x options, also write for each fasta record the exon\n", | |
" coordinates projected onto the spliced sequence\n", | |
" -y write a protein fasta file with the translation of CDS for each record\n", | |
" -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n", | |
" -m <chr_replace> is a reference (genomic) sequence replacement table with\n", | |
" this format:\n", | |
" <original_ref_ID> <new_ref_ID>\n", | |
" GFF records on reference sequences that are not found among the\n", | |
" <original_ref_ID> entries in this file will be filtered out\n", | |
" -o the \"filtered\" GFF records will be written to <outfile.gff>\n", | |
" (use -o- for printing to stdout)\n", | |
" -t use <trackname> in the second column of each GFF output line\n", | |
" -T -o option will output GTF format instead of GFF3\n", | |
" \n", | |
"Error: value required for option '-w'\n", | |
"```\n", | |
"\n", | |
"\n", | |
"Here's the moneymaker command:\n", | |
"\n", | |
"```\n", | |
"gffread -w mm10-plus-without-rn45s-ercc.fa -g mm10-plus/mm10-plus.fa mm10-plus-without-rn45s-ercc.gtf\n", | |
"```\n", | |
"\n", | |
"\n", | |
"Check that the exons were spliced correctly and from the right strand by BLATing (genome.ucsc.edu > Tools > BLAT > paste sequence > \"I'm feeling lucky\") an example sequence in the genome. This one shows up at `chr1:3,214,482-3,671,498` and is on the negative strand, and is spliced. \n", | |
"\n", | |
"The upper/lowercase comes from the original `fasta`. It looks like its lowercase for repetitive sequence and uppercase for not.\n", | |
"\n", | |
"```\n", | |
">NM_001011874 gene=Xkr4 CDS=151-2091\n", | |
"gcggcggcgggcgagcgggcgctggagtaggagctggggagcggcgcggccggggaaggaagccagggcg\n", | |
"aggcgaggaggtggcgggaggaggagacagcagggacaggTGTCAGATAAAGGAGTGCTCTCCTCCGCTG\n", | |
"CCGAGGCATCATGGCCGCTAAGTCAGACGGGAGGCTGAAGATGAAGAAGAGCAGCGACGTGGCGTTCACC\n", | |
"CCGCTGCAGAACTCGGACAATTCGGGCTCTGTGCAAGGACTGGCTCCAGGCTTGCCGTCGGGGTCCGGAG\n", | |
"CCGAGGACAcggaggcggccggaggcggctgctgcccggacggcggtggctgctcgcgctgctgctgctg\n", | |
"ctgcgcggggagcggcggctcggcgggctcgggcggctcgggcggcggcggccggggcagcggggcggGC\n", | |
"TCTGCGGCGCTGTGCCTGCGCCTGGGCAGGGAGCAGCGGCGTTACTCGCTGTGGGACTGCCTCTGGATCC\n", | |
"TGGCCGCCGTGGCCGTGTACTTCGCGGATGTGGGAACGGACATCTGGCTCGCGGTGGACTACTACCTGCG\n", | |
"TGGCCAGCGCTGGTGGTTTGGGCTCACCCTCTTCTTCGTGGTGCTGGGCTCCCTTTCTGTGCAAGTGTTC\n", | |
"AGCTTCCGCTGGTTTGTGCATGATTTCAGCACCGAGGACAGCTCCACGACCACCACCTCCAGCTGCCAGC\n", | |
"AGCCTGGAGCAGATTGCAAGACGGTGGTCAGCAGTGGGTCTGCAGCCGGGGAAGGCGAGGTTCGTCCTTC\n", | |
"CACGCCGCAGAGGCAAGCATCCAACGCCAGCAAGAGCAACATCGCCGCCACCAACAGCGGCAGCAACAGC\n", | |
"AACGGGGCCACCCGGACCAGCGGCAAACACAGGTCTGCGTCCTGCTCCTTTTGCATCTGGCTCCTGCAGT\n", | |
"CACTCATCCACATCTTGCAGCTTGGGCAAATCTGGAGGTATTTGCACACAATATACTTAGGTATCCGGAG\n", | |
"CCGGCAGAGTGGGGAGAGCGGCAGGTGGCGGTTTTACTGGAAGATGGTGTACGAGTATGCAGATGTGAGC\n", | |
"ATGCTGCATCTGCTAGCCACTTTTCTGGAAAGTGCTCCACAATTGGTCCTGCAGCTCTGCATTATTGTAC\n", | |
"AGACTCACAGCTTACAGGCCCTCCAAGGTTTCACAGCAGCAGCCTCCCTTGTGTCCTTGGCTTGGGCCCT\n", | |
"AGCCTCCTACCAGAAGGCTCTTCGGGACTCCCGAGATGACAAAAAGCCCATCAGCTACATGGCTGTCATC\n", | |
"ATTCAGTTCTGCTGGCATTTCTTCACCATCGCTGCCAGGGTCATCACATTCGCCCTCTTTGCCTCGGTTT\n", | |
"TCCAGCTGTATTTTGGGATATTTATTGTCCTCCATTGGTGCATCATGACTTTCTGGATTGTCCACTGTGA\n", | |
"GACAGAATTCTGTATCACCAAATGGGAAGAGATTGTGTTTGACATGGTGGTGGGCATCATCTACATCTTC\n", | |
"AGTTGGTTCAATGTCAAGGAAGGCAGGACACGCTGCAGGCTGTTCATTTACTATTTTGTAATCCTTTTGG\n", | |
"AAAATACAGCCTTGAGTGCACTCTGGTACCTCTACAAAGCTCCCCAGATTGCAGATGCATTTGCCATCCC\n", | |
"TGCATTGTGCGTGGTTTTCAGCAGCTTTTTAACAGGTGTTGTTTTTATGCTGATGTACTATGCCTTCTTT\n", | |
"CATCCCAATGGGCCCAGATTTGGGCAATCACCAAGTTGTGCTTGTGATGATCCAGCCACTGCCTTCTCTC\n", | |
"TGCCTCCAGAAGTAGCCACAAGCACACTACGGTCCATCTCCAACAACCGCAGTGTTGCCAGTGACCGTGA\n", | |
"TCAGAAATTTGCAGAGCGGGATGGATGTGTACCTGTGTTTCAAGTGAGACCAACTGCACCACCCACCCCA\n", | |
"TCATCTCGACCACCACGGATTGAAGAATCAGTCATTAAAATTGACCTGTTCAGGAATAGATATCCAGCAT\n", | |
"GGGAGAGACATGTGTTAGATCGAAGCCTGAGAAAGGCCATTTTAGCCTTTGAATGTTCCCCATCTCCTCC\n", | |
"AAGGCTGCAGTACAAGGATGATGCCCTTATTCAGGAGAGGCTGGAATATGAAACCACTTTATAAAATACA\n", | |
"AGGAGCCGCAATGTCCACATGAAGGGGTAACAGCAGGGCTGTGGCAATAATGACACCTTATCCAAGAGTA\n", | |
"GGGCAGCGAGCTGTATGTTCTTAGTTGTGGTATGGTTTGATCTTCCATCAGCTGACTGCCTGCTGCTGGT\n", | |
"GTCTATTCAAGCCAGCAGTGCTGAGAGTCTCTTACACTGTCAGCTTAATATGACTGTTGCTACAAACTCC\n", | |
"TCCAGCAGAGATTTGGGGCACATTCACTGGAGGATAACATTATTGTGAAAAATGTTGCCTCTAATCATTA\n", | |
"GGGTATTTTGATGGGTTTTACTAAGTTTTGCATAAATATATTCACACACCACCATACCACCCCTCAATCA\n", | |
"AAGGAGTTAAGGTggggatggagagatgactcattagttaagagcactgactgctcttgcaaaggaccca\n", | |
"ggcttgagtagttcactgcaactctaattccagaagatctaatgtccatttttggcctcctcaagcactg\n", | |
"cacacacatggtgcatagacatatatgcaggcaaaatacccatacacatagcataaaaataaaTCTCAAA\n", | |
"GAAAAAAAGCTTAGGTGATTTCCTTGATGCAAAGCTCACAACATACTCCAGGAAGAAAGCAGCATACTTG\n", | |
"GGACAATTATATAAACTGTTCTCTCCTTTGCAAACCAGTAGCATCAATGAAGTGGACAGCAAGACTCAAG\n", | |
"TGTTTACACTCGTACTAACTAGCTTTGATGGGATGATTCTTTTTCTACATATTTCAGGATTTGTTTTTAC\n", | |
"TTTTAGGTTTTGCAGATGAGAACATTCTTCATGACAGAAATCCTATGCAGCACTTATATGGCTTTTGATG\n", | |
"AGACCAAGGAGCTCAATATCTGTAATGTAAATTAAATGCTAATCATAATTCAGTATTCAGTTGCAAAAAT\n", | |
"ACAATATATAAAAAGAGTCTTTGGGGAAGGGACAGAGTGAGATTCAGATTCTCAGGTGTGTGCATCTTAT\n", | |
"ATTGGAATGCACCCACAGAGCCACAGGAGAGGAACAGGGACTATTTCAAGGTCTGTGTTCATGTCTGTTT\n", | |
"CCAGAACTGTTTCCAGGTGCAGAATGACATGGGTCAGCAGGTATGATTCCGGAAACCACGTGCCACATCT\n", | |
"TTCGAGTGCCAAATTTTGTCCAATTACAGAACTGATATGGAATCCCCAAAATCTGAGAATAAGTGGTTTC\n", | |
"CCAAAACAGACAAAAGAAGAATAATCAGGTTCCCTGCTGTGTACAGACTTACCCTCTTCCCATCCAAGGT\n", | |
"CAAAATGATGTGTCTACTAGAGACTTTGGGACACAATTTAGCAAGTGAGAGCATACAGATGCAATGTGTA\n", | |
"TGCCATTAAAAATACTGCCTGGACTGCTTGAGGGCTTACCACTCCATCAGCTAAGATTTGTATTTGAATC\n", | |
"ATCTGTAAATTCGTGCTCTTACAAGCTTCTGAGTTTTAAATACCTCCACACAGCAAGTAAACATTCCCGC\n", | |
"TTTCTGTTTTCGGTGTCCTTGGTCATGGTGCTTTTTGTTGCATTAAAAGTGCCGGTCAAACTTT\n", | |
"```\n", | |
"\n", | |
"Here's a positive strand one from some Y chrom segment:\n", | |
"\n", | |
"```\n", | |
">NM_133362 gene=Erdr1 CDS=230-757\n", | |
"TTCTCTTTTAGCCGCAGCTATGGTTTCTGCCCTAATTATTCTTGTCCTTATTTGtaatttaattcttaat\n", | |
"ttaatttaatttaTAAATTTTGTTGTAAGTTTTTCTGTGGGCGTGAATGGAAAGTCTAACCCGTGTTTCT\n", | |
"CTGTTCAGCCTCCGCCGGTCACGGCCGCCGCCCCCAGCGACGTCACCCACACGCGCAGAAGCGGACGCCG\n", | |
"CGGTCAAGATGTCTCTGCCATGCCCACGggacgcacggacgcacggacgcacggacggacTCCACAAGGT\n", | |
"AGGAAGCCTGCGCCGAGCGCACCGCCGCACCCACCACAGCACACAGGACACACGCGGGCCCCGCGCCCGC\n", | |
"CCAggcacacgcggcacacacggcacacacggcaGGCAGGCCAGGCACACGCATCCGCAGGACCCGCACC\n", | |
"CGCCACGCAGACACGGACGAGCCGCCGCGGTCAAGATGTTCACCCGCCGCGGTCAAGATGTATGTGCCAC\n", | |
"CGACCCTCGCCCCGCTGGACGGACGGACGGACGCGCGCACGCCGTCAGCGTCCACCGGTCACTGCCGCCG\n", | |
"CCCACAGTGATGTCACCCACGAAAGCACACACGTAGAAGCGGACGCCGTGGTCAAGATGTCTCTGCCATC\n", | |
"CCCACAGGACGGACGGACGGACTCCACAAGGTGCGCGTGTCGCCGAGGCCGCCAGGACGGAGCGATTCTC\n", | |
"ACGGAGGAAGGAGCACGCCAACGGGGCCTGACTGCGTACAGAAATGTCCCCCTCAATAAAATTGCAGTTG\n", | |
"AAATGGA\n", | |
"```" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment