Skip to content

Instantly share code, notes, and snippets.

@travc
Created March 27, 2019 21:01
Show Gist options
  • Save travc/f3738350dbeadac4bd1d27058589b034 to your computer and use it in GitHub Desktop.
Save travc/f3738350dbeadac4bd1d27058589b034 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-27T17:59:01.128236Z",
"start_time": "2019-03-27T17:59:00.584687Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.6.7 | packaged by conda-forge | (default, Feb 28 2019, 09:07:38) \n",
"[GCC 7.3.0]\n",
"numpy 1.16.2\n",
"pandas 0.24.1\n",
"allel 1.2.0\n",
"zarr 2.2.0\n"
]
}
],
"source": [
"import sys; print(sys.version)\n",
"import os\n",
"import glob\n",
"import subprocess\n",
"import multiprocessing\n",
"\n",
"import numpy as np; print('numpy', np.__version__)\n",
"\n",
"import pandas as pd; print('pandas',pd.__version__)\n",
"import allel; print('allel', allel.__version__)\n",
"import zarr; print('zarr', zarr.__version__)\n",
"\n",
"from IPython.display import display, HTML\n",
"%matplotlib notebook"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Make a zarr file from a vcf (using allel 1.1+)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-27T20:52:32.518412Z",
"start_time": "2019-03-27T20:52:32.467263Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using tabix executable 'tabix' → '/usr/bin/tabix'\n",
"tabix (htslib) 1.7-2\n",
"Copyright (C) 2018 Genome Research Ltd.\n",
"\n"
]
}
],
"source": [
"INFN = '/data/vcfs/YL-Aaeg-03_pflit.vcf.gz'\n",
"OUTFN = INFN+'.zarr'\n",
"\n",
"# Below will probably not need to be changed\n",
"FIELDS = [\n",
" 'samples',\n",
" 'variants/CHROM',\n",
" 'variants/POS',\n",
" 'variants/REF',\n",
" 'variants/ALT',\n",
" 'variants/QUAL',\n",
" 'variants/TYPE',\n",
" 'variants/NUMALT',\n",
" 'variants/AF',\n",
" 'variants/DP',\n",
"# 'variants/ANN',\n",
" 'calldata/DP',\n",
" 'calldata/GT',\n",
" ]\n",
"EXCLUDE_FIELDS = None\n",
"\n",
"# # ALTERNATIVE:\n",
"# # All the fields from the VCF (overkill leads to very big archive)\n",
"# FIELDS = '*' \n",
"# EXCLUDE_FIELDS = ['variants/NUMALT'] # allel will calculate a numalt (lower case) on the fly\n",
"\n",
"TABIX_EXEC = 'tabix'\n",
"\n",
"print(\"Using tabix executable '{}' {} '{}'\\n{}\".format(TABIX_EXEC, u\"\\u2192\", \n",
" subprocess.check_output(['which', 'tabix']).decode('utf-8').rstrip(),\n",
" subprocess.check_output([TABIX_EXEC, '--version']).decode('utf-8')))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-27T20:18:14.745633Z",
"start_time": "2019-03-27T20:18:14.707218Z"
}
},
"outputs": [],
"source": [
"chroms = subprocess.check_output([TABIX_EXEC,'-l',INFN], \n",
" universal_newlines=True).strip().split('\\n')\n",
"display(chroms)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Write vcf to zarr archive"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-27T20:49:54.608248Z",
"start_time": "2019-03-27T20:23:49.386058Z"
}
},
"outputs": [],
"source": [
"%%time\n",
"num_procs = multiprocessing.cpu_count()-1\n",
"\n",
"transformers = None\n",
"if 'ANN' in FIELDS:\n",
" transformers=allel.ANNTransformer()\n",
"\n",
"def vcf_to_zarr_func(ch):\n",
" allel.vcf_to_zarr(INFN, OUTFN,\n",
" region=ch,\n",
" group=ch,\n",
" log=sys.stderr,\n",
" fields=FIELDS,\n",
" exclude_fields=EXCLUDE_FIELDS,\n",
" tabix=TABIX_EXEC,\n",
" transformers=transformers)\n",
" \n",
"with multiprocessing.Pool(num_procs) as pool:\n",
" pool.map(vcf_to_zarr_func, chroms, chunksize=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load callset from zarr archive"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-27T20:52:44.588091Z",
"start_time": "2019-03-27T20:52:44.584713Z"
}
},
"outputs": [],
"source": [
"callset = zarr.open_group(OUTFN, mode='r')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"ExecuteTime": {
"end_time": "2019-03-27T20:54:55.522899Z",
"start_time": "2019-03-27T20:54:55.511592Z"
},
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 513 µs, sys: 133 µs, total: 646 µs\n",
"Wall time: 521 µs\n"
]
},
{
"data": {
"text/html": [
"<link rel=\"stylesheet\" href=\"//cdnjs.cloudflare.com/ajax/libs/jstree/3.3.3/themes/default/style.min.css\"/><div id=\"457985bc-8f07-4ebc-8be7-d7dc68710b79\" class=\"zarr-tree\"><ul><li data-jstree='{\"type\": \"Group\"}' class='jstree-open'><span>2</span><ul><li data-jstree='{\"type\": \"Group\"}' class=''><span>calldata</span><ul><li data-jstree='{\"type\": \"Array\"}' class=''><span>DP (33785440, 132) int16</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>GT (33785440, 132, 2) int8</span></li></ul></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>samples (132,) object</span></li><li data-jstree='{\"type\": \"Group\"}' class=''><span>variants</span><ul><li data-jstree='{\"type\": \"Array\"}' class=''><span>AF (33785440, 3) float32</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>ALT (33785440, 3) object</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>CHROM (33785440,) object</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>DP (33785440,) int32</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>NUMALT (33785440,) int32</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>POS (33785440,) int32</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>QUAL (33785440,) float32</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>REF (33785440,) object</span></li><li data-jstree='{\"type\": \"Array\"}' class=''><span>TYPE (33785440, 3) object</span></li></ul></li></ul></li></ul></div>\n",
"<script>\n",
" if (!require.defined('jquery')) {\n",
" require.config({\n",
" paths: {\n",
" jquery: '//cdnjs.cloudflare.com/ajax/libs/jquery/1.12.1/jquery.min'\n",
" },\n",
" });\n",
" }\n",
" if (!require.defined('jstree')) {\n",
" require.config({\n",
" paths: {\n",
" jstree: '//cdnjs.cloudflare.com/ajax/libs/jstree/3.3.3/jstree.min'\n",
" },\n",
" });\n",
" }\n",
" require(['jstree'], function() {\n",
" $('#457985bc-8f07-4ebc-8be7-d7dc68710b79').jstree({\n",
" types: {\n",
" Group: {\n",
" icon: \"fa fa-folder\"\n",
" },\n",
" Array: {\n",
" icon: \"fa fa-table\"\n",
" }\n",
" },\n",
" plugins: [\"types\"]\n",
" });\n",
" });\n",
"</script>\n"
],
"text/plain": [
"2\n",
" ├── calldata\n",
" │ ├── DP (33785440, 132) int16\n",
" │ └── GT (33785440, 132, 2) int8\n",
" ├── samples (132,) object\n",
" └── variants\n",
" ├── AF (33785440, 3) float32\n",
" ├── ALT (33785440, 3) object\n",
" ├── CHROM (33785440,) object\n",
" ├── DP (33785440,) int32\n",
" ├── NUMALT (33785440,) int32\n",
" ├── POS (33785440,) int32\n",
" ├── QUAL (33785440,) float32\n",
" ├── REF (33785440,) object\n",
" └── TYPE (33785440, 3) object"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"callset['2'].tree()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@travc
Copy link
Author

travc commented Mar 27, 2019

Took 24 min to convert a 72GB vcf.gz to zarr on my desktop (16 cores).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment