Skip to content

Instantly share code, notes, and snippets.

@ivirshup
Last active June 18, 2024 17:05
Show Gist options
  • Select an option

  • Save ivirshup/eb4f5beb1bb33724b8c11bd0eacf03a6 to your computer and use it in GitHub Desktop.

Select an option

Save ivirshup/eb4f5beb1bb33724b8c11bd0eacf03a6 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"scanpy==1.10.1 anndata==0.10.7 umap==0.5.6 numpy==1.26.4 scipy==1.13.1 pandas==2.2.2 scikit-learn==1.5.0 statsmodels==0.14.2 pynndescent==0.5.12\n"
]
}
],
"source": [
"from pathlib import Path\n",
"\n",
"import numpy as np\n",
"import dask.distributed as dd\n",
"import dask.array as da\n",
"import scanpy as sc\n",
"import anndata as ad\n",
"import h5py\n",
"from scipy import sparse\n",
"\n",
"sc.logging.print_header()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"if not Path(\"cell_atlas.h5ad\").exists():\n",
" !wget https://datasets.cellxgene.cziscience.com/82eac9c1-485f-4e21-ab21-8510823d4f6e.h5ad -O \"cell_atlas.h5ad\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/distributed/node.py:182: UserWarning: Port 8787 is already in use.\n",
"Perhaps you already have a cluster running?\n",
"Hosting the HTTP server on port 41865 instead\n",
" warnings.warn(\n"
]
}
],
"source": [
"cluster = dd.LocalCluster()\n",
"client = dd.Client(cluster)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
" <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <h3 style=\"margin-bottom: 0px;\">Client</h3>\n",
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Client-f3c3d3bd-2394-11ef-a73d-023ca6c22285</p>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
"\n",
" <tr>\n",
" \n",
" <td style=\"text-align: left;\"><strong>Connection method:</strong> Cluster object</td>\n",
" <td style=\"text-align: left;\"><strong>Cluster type:</strong> distributed.LocalCluster</td>\n",
" \n",
" </tr>\n",
"\n",
" \n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:41865/status\" target=\"_blank\">http://127.0.0.1:41865/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" \n",
"\n",
" </table>\n",
"\n",
" \n",
"\n",
" \n",
" <details>\n",
" <summary style=\"margin-bottom: 20px;\"><h3 style=\"display: inline;\">Cluster Info</h3></summary>\n",
" <div class=\"jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\">\n",
" </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <h3 style=\"margin-bottom: 0px; margin-top: 0px;\">LocalCluster</h3>\n",
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">1a01962c</p>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:41865/status\" target=\"_blank\">http://127.0.0.1:41865/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Workers:</strong> 8\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads:</strong> 32\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total memory:</strong> 123.85 GiB\n",
" </td>\n",
" </tr>\n",
" \n",
" <tr>\n",
" <td style=\"text-align: left;\"><strong>Status:</strong> running</td>\n",
" <td style=\"text-align: left;\"><strong>Using processes:</strong> True</td>\n",
"</tr>\n",
"\n",
" \n",
" </table>\n",
"\n",
" <details>\n",
" <summary style=\"margin-bottom: 20px;\">\n",
" <h3 style=\"display: inline;\">Scheduler Info</h3>\n",
" </summary>\n",
"\n",
" <div style=\"\">\n",
" <div>\n",
" <div style=\"width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <h3 style=\"margin-bottom: 0px;\">Scheduler</h3>\n",
" <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Scheduler-582cc601-d430-4f0d-a9ea-92344689e753</p>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm:</strong> tcp://127.0.0.1:32833\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Workers:</strong> 8\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:41865/status\" target=\"_blank\">http://127.0.0.1:41865/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads:</strong> 32\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Started:</strong> Just now\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total memory:</strong> 123.85 GiB\n",
" </td>\n",
" </tr>\n",
" </table>\n",
" </div>\n",
" </div>\n",
"\n",
" <details style=\"margin-left: 48px;\">\n",
" <summary style=\"margin-bottom: 20px;\">\n",
" <h3 style=\"display: inline;\">Workers</h3>\n",
" </summary>\n",
"\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 0</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:45181\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:38683/status\" target=\"_blank\">http://127.0.0.1:38683/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:43245\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-01khqjru\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 1</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:36445\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:44831/status\" target=\"_blank\">http://127.0.0.1:44831/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:40505\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-3zz6rptv\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 2</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:32969\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:44179/status\" target=\"_blank\">http://127.0.0.1:44179/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:46023\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-ti3t62pw\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 3</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:34297\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:38939/status\" target=\"_blank\">http://127.0.0.1:38939/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:44411\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-5vxc4_02\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 4</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:32839\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:43465/status\" target=\"_blank\">http://127.0.0.1:43465/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:40751\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-nxicu2i6\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 5</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:39331\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:38839/status\" target=\"_blank\">http://127.0.0.1:38839/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:42489\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-9t1krvwz\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 6</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:40061\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:36305/status\" target=\"_blank\">http://127.0.0.1:36305/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:36879\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-5j5rzyxp\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
" <div style=\"margin-bottom: 20px;\">\n",
" <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
" <div style=\"margin-left: 48px;\">\n",
" <details>\n",
" <summary>\n",
" <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 7</h4>\n",
" </summary>\n",
" <table style=\"width: 100%; text-align: left;\">\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Comm: </strong> tcp://127.0.0.1:34211\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Total threads: </strong> 4\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:34455/status\" target=\"_blank\">http://127.0.0.1:34455/status</a>\n",
" </td>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Memory: </strong> 15.48 GiB\n",
" </td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left;\">\n",
" <strong>Nanny: </strong> tcp://127.0.0.1:38233\n",
" </td>\n",
" <td style=\"text-align: left;\"></td>\n",
" </tr>\n",
" <tr>\n",
" <td colspan=\"2\" style=\"text-align: left;\">\n",
" <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-8vswbt5f\n",
" </td>\n",
" </tr>\n",
"\n",
" \n",
"\n",
" \n",
"\n",
" </table>\n",
" </details>\n",
" </div>\n",
" </div>\n",
" \n",
"\n",
" </details>\n",
"</div>\n",
"\n",
" </details>\n",
" </div>\n",
"</div>\n",
" </details>\n",
" \n",
"\n",
" </div>\n",
"</div>"
],
"text/plain": [
"<Client: 'tcp://127.0.0.1:32833' processes=8 threads=32, memory=123.85 GiB>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"SPARSE_CHUNK_SIZE = 5_000\n",
"DENSE_CHUNK_SIZE = 5_000"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def read_sparse_as_dask(file_pth: str, elem_name: str, stride: int):\n",
" with h5py.File(file_pth, \"r\") as f:\n",
" elem = f[elem_name]\n",
" shape = elem.attrs[\"shape\"]\n",
" if (encoding_type := elem.attrs[\"encoding-type\"]) != \"csr_matrix\":\n",
" raise ValueError(\n",
" f\"This method was only written for csr_matrix encoding, but a {encoding_type} encoding was found.\"\n",
" )\n",
" dtype = elem[\"data\"].dtype\n",
"\n",
" def make_dask_chunk(block_id=None):\n",
" # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed`\n",
" # https://github.com/scverse/anndata/issues/1105\n",
" with h5py.File(file_pth, \"r\") as f:\n",
" mtx = ad.experimental.sparse_dataset(f[elem_name])\n",
" (row, _) = block_id\n",
" chunk = mtx[\n",
" slice(\n",
" row * stride,\n",
" min((row * stride) + stride, shape[0]),\n",
" )\n",
" ]\n",
" return chunk\n",
"\n",
" chunks_0 = (stride,) * (shape[0] // stride)\n",
" chunks_0 += (shape[0] % stride,)\n",
" chunks_1 = (shape[1],)\n",
" da_mtx = da.map_blocks(\n",
" make_dask_chunk,\n",
" dtype=dtype,\n",
" chunks=(chunks_0, chunks_1),\n",
" meta=sparse.csr_matrix((0, 0), dtype=np.float32),\n",
" )\n",
" return da_mtx"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.66 s, sys: 939 ms, total: 4.6 s\n",
"Wall time: 4.83 s\n"
]
},
{
"data": {
"text/plain": [
"AnnData object with n_obs × n_vars = 1462702 × 27714\n",
" obs: 'celltype', 'majorType', 'City', 'sampleID', 'donor_id', 'Sample type', 'CoVID-19 severity', 'Sample time', 'Sampling day (Days after symptom onset)', 'BCR single cell sequencing', 'TCR single cell sequencing', 'Outcome', 'Comorbidities', 'COVID-19-related medication and anti-microbials', 'Leukocytes [G over L]', 'Neutrophils [G over L]', 'Lymphocytes [G over L]', 'Unpublished', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'assay_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'\n",
" var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"with h5py.File(\"cell_atlas.h5ad\", \"r\") as f:\n",
" adata = ad.AnnData(\n",
" obs=ad.experimental.read_elem(f[\"obs\"]),\n",
" var=ad.experimental.read_elem(f[\"var\"]),\n",
" )\n",
"\n",
"adata.X = read_sparse_as_dask(\"cell_atlas.h5ad\", \"raw/X\", SPARSE_CHUNK_SIZE)\n",
"adata"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"by, _ = sc.get._aggregated._combine_categories(adata.obs, [\"donor_id\", \"cell_type\"])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"X_dense = adata.X.map_blocks(\n",
" lambda x: x.toarray(), dtype=adata.X.dtype, meta=np.array([])\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import flox"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/distributed/client.py:3164: UserWarning: Sending large graph of size 11.21 MiB.\n",
"This may cause some slowdown.\n",
"Consider scattering data ahead of time and using futures.\n",
" warnings.warn(\n",
"2024-06-05 23:40:44,446 - distributed.worker.memory - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os for more information. -- Unmanaged memory: 11.13 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:40:44,740 - distributed.worker.memory - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os for more information. -- Unmanaged memory: 11.09 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:40:45,611 - distributed.worker.memory - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os for more information. -- Unmanaged memory: 11.55 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:40:46,036 - distributed.worker.memory - WARNING - Worker is at 80% memory usage. Pausing worker. Process memory: 12.53 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:40:46,448 - distributed.worker.memory - WARNING - Worker is at 80% memory usage. Pausing worker. Process memory: 12.48 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:40:47,045 - distributed.worker.memory - WARNING - Worker is at 78% memory usage. Resuming worker. Process memory: 12.17 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:40:47,318 - distributed.worker.memory - WARNING - Worker is at 74% memory usage. Resuming worker. Process memory: 11.51 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:40:47,346 - distributed.worker.memory - WARNING - Worker is at 80% memory usage. Pausing worker. Process memory: 12.42 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:40:47,562 - distributed.worker.memory - WARNING - Worker is at 66% memory usage. Resuming worker. Process memory: 10.28 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:41:03,964 - distributed.worker.memory - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os for more information. -- Unmanaged memory: 11.17 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:41:05,989 - distributed.worker.memory - WARNING - Worker is at 81% memory usage. Pausing worker. Process memory: 12.68 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:41:05,994 - distributed.worker.memory - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os for more information. -- Unmanaged memory: 12.69 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:41:06,412 - distributed.worker.memory - WARNING - Worker is at 78% memory usage. Resuming worker. Process memory: 12.16 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:41:33,528 - distributed.worker.memory - WARNING - Worker is at 81% memory usage. Pausing worker. Process memory: 12.55 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:41:34,079 - distributed.worker.memory - WARNING - Worker is at 77% memory usage. Resuming worker. Process memory: 12.02 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:41:43,622 - distributed.worker.memory - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os for more information. -- Unmanaged memory: 11.54 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:41:47,859 - distributed.worker.memory - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os for more information. -- Unmanaged memory: 10.23 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:42:12,931 - distributed.worker.memory - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os for more information. -- Unmanaged memory: 12.12 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:42:14,029 - distributed.worker.memory - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os for more information. -- Unmanaged memory: 11.71 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:43:09,845 - distributed.worker.memory - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os for more information. -- Unmanaged memory: 9.84 GiB -- Worker memory limit: 15.48 GiB\n",
"2024-06-05 23:43:12,451 - distributed.worker.memory - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os for more information. -- Unmanaged memory: 9.44 GiB -- Worker memory limit: 15.48 GiB\n"
]
}
],
"source": [
"res, codes = flox.groupby_reduce(\n",
" X_dense.T,\n",
" by,\n",
" func=\"sum\",\n",
" fill_value=0,\n",
" method=\"map-reduce\",\n",
" reindex=True,\n",
")\n",
"\n",
"res_comp = res.compute()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "cellxgene-census-dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment