Created
March 27, 2019 21:01
-
-
Save travc/f3738350dbeadac4bd1d27058589b034 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-27T17:59:01.128236Z", | |
"start_time": "2019-03-27T17:59:00.584687Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"3.6.7 | packaged by conda-forge | (default, Feb 28 2019, 09:07:38) \n", | |
"[GCC 7.3.0]\n", | |
"numpy 1.16.2\n", | |
"pandas 0.24.1\n", | |
"allel 1.2.0\n", | |
"zarr 2.2.0\n" | |
] | |
} | |
], | |
"source": [ | |
"import sys; print(sys.version)\n", | |
"import os\n", | |
"import glob\n", | |
"import subprocess\n", | |
"import multiprocessing\n", | |
"\n", | |
"import numpy as np; print('numpy', np.__version__)\n", | |
"\n", | |
"import pandas as pd; print('pandas',pd.__version__)\n", | |
"import allel; print('allel', allel.__version__)\n", | |
"import zarr; print('zarr', zarr.__version__)\n", | |
"\n", | |
"from IPython.display import display, HTML\n", | |
"%matplotlib notebook" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Make a zarr file from a vcf (using allel 1.1+)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-27T20:52:32.518412Z", | |
"start_time": "2019-03-27T20:52:32.467263Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Using tabix executable 'tabix' → '/usr/bin/tabix'\n", | |
"tabix (htslib) 1.7-2\n", | |
"Copyright (C) 2018 Genome Research Ltd.\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"INFN = '/data/vcfs/YL-Aaeg-03_pflit.vcf.gz'\n", | |
"OUTFN = INFN+'.zarr'\n", | |
"\n", | |
"# Below will probably not need to be changed\n", | |
"FIELDS = [\n", | |
" 'samples',\n", | |
" 'variants/CHROM',\n", | |
" 'variants/POS',\n", | |
" 'variants/REF',\n", | |
" 'variants/ALT',\n", | |
" 'variants/QUAL',\n", | |
" 'variants/TYPE',\n", | |
" 'variants/NUMALT',\n", | |
" 'variants/AF',\n", | |
" 'variants/DP',\n", | |
"# 'variants/ANN',\n", | |
" 'calldata/DP',\n", | |
" 'calldata/GT',\n", | |
" ]\n", | |
"EXCLUDE_FIELDS = None\n", | |
"\n", | |
"# # ALTERNATIVE:\n", | |
"# # All the fields from the VCF (overkill leads to very big archive)\n", | |
"# FIELDS = '*' \n", | |
"# EXCLUDE_FIELDS = ['variants/NUMALT'] # allel will calculate a numalt (lower case) on the fly\n", | |
"\n", | |
"TABIX_EXEC = 'tabix'\n", | |
"\n", | |
"print(\"Using tabix executable '{}' {} '{}'\\n{}\".format(TABIX_EXEC, u\"\\u2192\", \n", | |
" subprocess.check_output(['which', 'tabix']).decode('utf-8').rstrip(),\n", | |
" subprocess.check_output([TABIX_EXEC, '--version']).decode('utf-8')))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-27T20:18:14.745633Z", | |
"start_time": "2019-03-27T20:18:14.707218Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"chroms = subprocess.check_output([TABIX_EXEC,'-l',INFN], \n", | |
" universal_newlines=True).strip().split('\\n')\n", | |
"display(chroms)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Write vcf to zarr archive" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-27T20:49:54.608248Z", | |
"start_time": "2019-03-27T20:23:49.386058Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"%%time\n", | |
"num_procs = multiprocessing.cpu_count()-1\n", | |
"\n", | |
"transformers = None\n", | |
"if 'ANN' in FIELDS:\n", | |
" transformers=allel.ANNTransformer()\n", | |
"\n", | |
"def vcf_to_zarr_func(ch):\n", | |
" allel.vcf_to_zarr(INFN, OUTFN,\n", | |
" region=ch,\n", | |
" group=ch,\n", | |
" log=sys.stderr,\n", | |
" fields=FIELDS,\n", | |
" exclude_fields=EXCLUDE_FIELDS,\n", | |
" tabix=TABIX_EXEC,\n", | |
" transformers=transformers)\n", | |
" \n", | |
"with multiprocessing.Pool(num_procs) as pool:\n", | |
" pool.map(vcf_to_zarr_func, chroms, chunksize=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Load callset from zarr archive" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-27T20:52:44.588091Z", | |
"start_time": "2019-03-27T20:52:44.584713Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"callset = zarr.open_group(OUTFN, mode='r')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2019-03-27T20:54:55.522899Z", | |
"start_time": "2019-03-27T20:54:55.511592Z" | |
}, | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 513 µs, sys: 133 µs, total: 646 µs\n", | |
"Wall time: 521 µs\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<link rel=\"stylesheet\" href=\"//cdnjs.cloudflare.com/ajax/libs/jstree/3.3.3/themes/default/style.min.css\"/><div id=\"457985bc-8f07-4ebc-8be7-d7dc68710b79\" class=\"zarr-tree\"><ul><li data-jstree='{\"type\": \"Group\"}' class='jstree-open'><span>2</span><ul><li data-jstree='{\"type\": \"Group\"}' class=''><span>calldata</span><ul><li data-jstree='{\"type\": \"Array\"}' class=''><span>DP (33785440, 132) int16</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>GT (33785440, 132, 2) int8</span></li></ul></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>samples (132,) object</span></li><li data-jstree='{\"type\": \"Group\"}' class=''><span>variants</span><ul><li data-jstree='{\"type\": \"Array\"}' class=''><span>AF (33785440, 3) float32</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>ALT (33785440, 3) object</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>CHROM (33785440,) object</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>DP (33785440,) int32</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>NUMALT (33785440,) int32</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>POS (33785440,) int32</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>QUAL (33785440,) float32</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>REF (33785440,) object</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>TYPE (33785440, 3) object</span></li></ul></li></ul></li></ul></div>\n", | |
"<script>\n", | |
" if (!require.defined('jquery')) {\n", | |
" require.config({\n", | |
" paths: {\n", | |
" jquery: '//cdnjs.cloudflare.com/ajax/libs/jquery/1.12.1/jquery.min'\n", | |
" },\n", | |
" });\n", | |
" }\n", | |
" if (!require.defined('jstree')) {\n", | |
" require.config({\n", | |
" paths: {\n", | |
" jstree: '//cdnjs.cloudflare.com/ajax/libs/jstree/3.3.3/jstree.min'\n", | |
" },\n", | |
" });\n", | |
" }\n", | |
" require(['jstree'], function() {\n", | |
" $('#457985bc-8f07-4ebc-8be7-d7dc68710b79').jstree({\n", | |
" types: {\n", | |
" Group: {\n", | |
" icon: \"fa fa-folder\"\n", | |
" },\n", | |
" Array: {\n", | |
" icon: \"fa fa-table\"\n", | |
" }\n", | |
" },\n", | |
" plugins: [\"types\"]\n", | |
" });\n", | |
" });\n", | |
"</script>\n" | |
], | |
"text/plain": [ | |
"2\n", | |
" ├── calldata\n", | |
" │ ├── DP (33785440, 132) int16\n", | |
" │ └── GT (33785440, 132, 2) int8\n", | |
" ├── samples (132,) object\n", | |
" └── variants\n", | |
" ├── AF (33785440, 3) float32\n", | |
" ├── ALT (33785440, 3) object\n", | |
" ├── CHROM (33785440,) object\n", | |
" ├── DP (33785440,) int32\n", | |
" ├── NUMALT (33785440,) int32\n", | |
" ├── POS (33785440,) int32\n", | |
" ├── QUAL (33785440,) float32\n", | |
" ├── REF (33785440,) object\n", | |
" └── TYPE (33785440, 3) object" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"callset['2'].tree()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.7" | |
}, | |
"varInspector": { | |
"cols": { | |
"lenName": 16, | |
"lenType": 16, | |
"lenVar": 40 | |
}, | |
"kernels_config": { | |
"python": { | |
"delete_cmd_postfix": "", | |
"delete_cmd_prefix": "del ", | |
"library": "var_list.py", | |
"varRefreshCmd": "print(var_dic_list())" | |
}, | |
"r": { | |
"delete_cmd_postfix": ") ", | |
"delete_cmd_prefix": "rm(", | |
"library": "var_list.r", | |
"varRefreshCmd": "cat(var_dic_list()) " | |
} | |
}, | |
"types_to_exclude": [ | |
"module", | |
"function", | |
"builtin_function_or_method", | |
"instance", | |
"_Feature" | |
], | |
"window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Took 24 min to convert a 72GB vcf.gz to zarr on my desktop (16 cores).