Last active
July 11, 2017 14:51
-
-
Save jvhaarst/c6ee01f687e1de970f311b41961ef307 to your computer and use it in GitHub Desktop.
Create 500 FASTA sequences, by combining the shortest ones into a bigger ones that are smaller than the maximum size as given by 10x genomics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Create 500 sequences, by combining the shortest ones into a bigger ones that are smaller than the maximum size.\n", | |
"Separate by 500Ns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"from os.path import basename\n", | |
"from Bio import SeqIO\n", | |
"from Bio.Seq import Seq\n", | |
"from Bio.SeqRecord import SeqRecord" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"filename = ''\n", | |
"max_number_of_contigs = 500\n", | |
"max_contig_size = pow(2,29)-1 # https://support.10xgenomics.com/genome-exome/software/pipelines/latest/advanced/references\n", | |
"minimal_size=1000\n", | |
"Ns = \"N\" * 500" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Read input FASTA file into list _l_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%time\n", | |
"handle = open(filename, \"rU\")\n", | |
"l = SeqIO.parse(handle, \"fasta\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Sort list of sequences (_l_) on length to _sortedList_ , short to long" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%time\n", | |
"sortedList = [f for f in sorted(l, key=lambda x : len(x.seq), reverse=True)]\n", | |
"print(len(sortedList))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Split out the contigs at number_of_contigs, check for minimal length" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%time\n", | |
"skipped = 0\n", | |
"total_number_of_contigs = len(sortedList)\n", | |
"number_of_contigs_concatenated = 0\n", | |
"concatenated_contigs_index = 0\n", | |
"concatenated_contigs = []\n", | |
"concatenate_list = []\n", | |
"concatenate_list_length = 0" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%time\n", | |
"for idx, val in reversed(list(enumerate(sortedList))):\n", | |
" # Do we still have to concatenate contigs ?\n", | |
" if total_number_of_contigs - number_of_contigs_concatenated + len(concatenated_contigs) >= max_number_of_contigs:\n", | |
" # Check for minimal length requirement\n", | |
" if len(sortedList[idx].seq) >= minimal_size: \n", | |
" # If the size of list + current + N_length < maximum, add to list\n", | |
" if (len(Ns) + len(sortedList[idx].seq) + concatenate_list_length) <= max_contig_size:\n", | |
" concatenate_list.append(sortedList[idx])\n", | |
" concatenate_list_length += len(sortedList[idx].seq)+len(Ns)\n", | |
" number_of_contigs_concatenated +=1\n", | |
" # Clear if used\n", | |
" sortedList[idx] = ''\n", | |
" else:\n", | |
" # Add current concatenated to list of concatenated ones\n", | |
" concatenated_contigs.append(concatenate_list)\n", | |
" # Initiate new list\n", | |
" concatenate_list = []\n", | |
" concatenate_list_length = 0 \n", | |
" concatenate_list.append(sortedList[idx])\n", | |
" concatenate_list_length += len(sortedList[idx].seq)+len(Ns)\n", | |
" number_of_contigs_concatenated +=1 \n", | |
" # Clear if used\n", | |
" sortedList[idx] = '' \n", | |
" else:\n", | |
" skipped += 1\n", | |
" # Clear if used\n", | |
" sortedList[idx] = ''\n", | |
" else:\n", | |
" print \"Done skipping contigs\"\n", | |
" concatenated_contigs.append(concatenate_list)\n", | |
" break\n", | |
"print \"Skipped\",skipped,\"contigs that were shorter than\",minimal_size" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Concatenate contigs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"concatenated_records=[]\n", | |
"for idx, concatenate_list in enumerate(concatenated_contigs):\n", | |
" conseq = Seq(Ns.join([str(seq_rec.seq) for seq_rec in concatenate_list]))\n", | |
" conseq_descriptions = '|'.join([str(seq_rec.id) for seq_rec in concatenate_list])\n", | |
" conseq_size = float(sum([len(seq_rec.seq) for seq_rec in concatenate_list]))\n", | |
" conseq_r = SeqRecord(conseq)\n", | |
" conseq_r.id = str(len(concatenate_list))+'_concatenated_sequences'\n", | |
" concatenated_records.append(conseq_r) \n", | |
" print(idx,len(concatenate_list),conseq_size,conseq_size/max_contig_size)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"concatenated_records" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Remove empty records from list" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"sortedList = filter(None, sortedList)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Write out the different files" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#outfile = basename(os.path.splitext(filename)[0])+ '.largest.' + str(max_number_of_contigs) + '.fasta'\n", | |
"outfile = basename(filename) + '.largest.' + str(len(sortedList)) + '.' + str(minimal_size) + '.fasta'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"outfile" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%time\n", | |
"SeqIO.write(sortedList, outfile, \"fasta\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"! ls -lh {outfile}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#outfile = basename(os.path.splitext(filename)[0]) + '.notlargest.' + str(max_number_of_contigs) + '.concatenated.fasta'\n", | |
"outfile = basename(filename) + '.notlargest.' + str(len(concatenated_records)) + '.' + str(minimal_size) + '.concatenated.fasta'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"outfile" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%time\n", | |
"SeqIO.write(concatenated_records, outfile, \"fasta\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"! ls -lh {outfile}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"outfile = basename(filename) + '.' + str(max_number_of_contigs) + '.' + str(minimal_size) + '.combined.fasta'\n", | |
"outfile" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"sortedList += concatenated_records" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%time\n", | |
"SeqIO.write(sortedList, outfile, \"fasta\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"! ls -lh {outfile}" | |
] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [conda root]", | |
"language": "python", | |
"name": "conda-root-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment