ayaksvals · August 14, 2024 15:24
diff --git a/csv_sort_parquet(DaskVersion).ipynb b/csv_sort_parquet(DaskVersion).ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d30d1c6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import bioframe\n",
    "import pypairix\n",
    "import dask.dataframe as dd\n",
    "import dask.array as da"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4efa19c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the pairix index for the pairs file.\n",
    "# NOTE(review): force=1 presumably overwrites an existing index - confirm\n",
    "# against the pypairix documentation.\n",
    "pypairix.build_index(\n",
    "    'NIPBL_R1.nodups.pairs.gz',\n",
    "    force=1,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a8a4bb79",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chromosome sizes for the 'mm9' assembly, fetched through bioframe.\n",
    "# The bare name on the last line displays the result as the cell output.\n",
    "chromsizes = bioframe.fetch_chromsizes('mm9')\n",
    "chromsizes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "62a9db8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import division, print_function, absolute_import\n",
    "from collections import OrderedDict\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import numba\n",
    "\n",
    "import pypairix\n",
    "import pysam\n",
    "from dask.base import tokenize\n",
    "import dask.dataframe as dd\n",
    "import dask.array as da\n",
    "import dask\n",
    "from dask.dataframe.core import new_dd_object\n",
    "\n",
    "def bin2start(k):\n",
    "    \"\"\"Return the start coordinate of bin number(s) ``k``.\n",
    "\n",
    "    The level of a bin is ``floor(log2(7*k + 1) / 3)``.  Bins on one level\n",
    "    each span ``2**(29 - 3*level)`` positions and are numbered consecutively\n",
    "    starting from ``(2**(3*level) - 1) // 7``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    k : int or numpy.ndarray of int\n",
    "        Bin number(s).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    numpy integer or numpy.ndarray\n",
    "        ``(k - first_bin_of_level) * bin_size`` for every entry of ``k``.\n",
    "    \"\"\"\n",
    "    lev = np.floor(np.log2(7*k + 1)/3).astype(int)\n",
    "    sl = 2**(29 - 3*lev)       # width of a bin on this level\n",
    "    ol = (2**(3*lev) - 1)//7   # number of the first bin on this level\n",
    "    # The matching end coordinate, (k - ol + 1) * sl, used to be computed here\n",
    "    # and thrown away; only the start is part of the result.\n",
    "    return (k - ol) * sl\n",
    "\n",
    "# Start coordinate of every bin, grouped by level.  Each (first, last) pair\n",
    "# is the half-open range of bin numbers on that level: LEVEL[0] covers the\n",
    "# 8 bins numbered 1-8 and LEVEL[4] the 32768 bins numbered 4681-37448.\n",
    "LEVEL = {\n",
    "    level: bin2start(np.arange(first, last))\n",
    "    for level, (first, last) in enumerate(\n",
    "        [(1, 9), (9, 73), (73, 585), (585, 4681), (4681, 37449)]\n",
    "    )\n",
    "}\n",
    "\n",
    "\n",
    "@numba.jit(\"int32(int32, int32)\")\n",
    "def reg2bin(beg, end):\n",
    "    \"\"\"Return the number of the smallest bin containing ``[beg, end)``.\n",
    "\n",
    "    Levels are tried from the finest (bins ``2**14`` wide) to the coarsest\n",
    "    (bins ``2**26`` wide).  The first level on which ``beg`` and ``end - 1``\n",
    "    fall into the same bin wins; 0 is returned when no level qualifies.\n",
    "    \"\"\"\n",
    "    last = end - 1  # inclusive last position of the interval\n",
    "    for step in range(5):\n",
    "        shift = 14 + 3 * step\n",
    "        if beg >> shift == last >> shift:\n",
    "            # Number of the first bin on this level: 4681, 585, 73, 9, 1.\n",
    "            first_bin = ((1 << (15 - 3 * step)) - 1) // 7\n",
    "            return first_bin + (beg >> shift)\n",
    "    return 0\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "@numba.jit\n",
    "def reg2bins(rbeg, rend):\n",
    "    \"\"\"List every bin that may overlap the half-open interval ``[rbeg, rend)``.\n",
    "\n",
    "    Mirrors the ``reg2bins`` routine of the SAM/tabix binning scheme: bin 0\n",
    "    is always reported, followed by the run of overlapping bins on each of\n",
    "    the five finer levels (first bin numbers 1, 9, 73, 585 and 4681).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of int\n",
    "        Bin numbers, coarsest level first.\n",
    "    \"\"\"\n",
    "    lst = [0]\n",
    "    rend -= 1  # inclusive last position\n",
    "\n",
    "    # Each range runs from the bin holding rbeg to the bin holding rend,\n",
    "    # both included.  The previous while-loops advanced k before appending,\n",
    "    # which shifted every reported bin up by one.\n",
    "    for k in range(1 + (rbeg >> 26), 2 + (rend >> 26)):\n",
    "        lst.append(k)\n",
    "\n",
    "    for k in range(9 + (rbeg >> 23), 10 + (rend >> 23)):\n",
    "        lst.append(k)\n",
    "\n",
    "    for k in range(73 + (rbeg >> 20), 74 + (rend >> 20)):\n",
    "        lst.append(k)\n",
    "\n",
    "    for k in range(585 + (rbeg >> 17), 586 + (rend >> 17)):\n",
    "        lst.append(k)\n",
    "\n",
    "    for k in range(4681 + (rbeg >> 14), 4682 + (rend >> 14)):\n",
    "        lst.append(k)\n",
    "\n",
    "    return lst\n",
    "\n",
    "\n",
    "\n",
    "def range_partition(start, stop, step):\n",
    "    \"\"\"Yield consecutive ``(lo, hi)`` bounds that tile ``start .. stop``.\n",
    "\n",
    "    ``lo`` walks ``range(start, stop, step)`` and ``hi`` is ``lo + step``\n",
    "    clipped to ``stop``.  The result is a generator of 2-tuples.\n",
    "    \"\"\"\n",
    "    lower_bounds = range(start, stop, step)\n",
    "    return ((lo, min(lo + step, stop)) for lo in lower_bounds)\n",
    "\n",
    "\n",
    "def _fetch_region(filepath, chromsizes, slc, chrom1, chrom2=None,\n",
    "                 columns=None, usecols=None, meta=None):\n",
    "    \"\"\"Load one 2D query of a pairix-indexed file as a pandas DataFrame.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    filepath : str\n",
    "        Path handed to ``pypairix.open``.\n",
    "    chromsizes : mapping\n",
    "        Indexed by chromosome name to obtain its length.\n",
    "    slc : slice or None\n",
    "        ``slc.start`` / ``slc.stop`` bound the query along ``chrom1``;\n",
    "        ``None`` queries ``0 .. chromsizes[chrom1]``.\n",
    "    chrom1, chrom2 : str\n",
    "        Chromosomes of the two sides; ``chrom2`` defaults to ``chrom1``.\n",
    "        The second side always spans ``0 .. chromsizes[chrom2]``.\n",
    "    columns : sequence, optional\n",
    "        Column names passed to ``pd.DataFrame.from_records``.\n",
    "    usecols : iterable, optional\n",
    "        When given, only the ``meta`` columns named here are kept.\n",
    "    meta : pandas.DataFrame\n",
    "        Empty frame whose columns and dtypes the result has to match.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        The queried records cast to the dtypes of ``meta``, or a copy of\n",
    "        ``meta`` when the query returns nothing.\n",
    "    \"\"\"\n",
    "    if chrom2 is None:\n",
    "        chrom2 = chrom1\n",
    "    if slc is None:\n",
    "        start, end = 0, chromsizes[chrom1]\n",
    "    else:\n",
    "        start, end = slc.start, slc.stop\n",
    "    f = pypairix.open(filepath, 'r')\n",
    "    df = pd.DataFrame.from_records(\n",
    "        f.query2D(chrom1, start, end, chrom2, 0, chromsizes[chrom2]),\n",
    "        columns=columns)\n",
    "\n",
    "    if not len(df):\n",
    "        df = meta.copy()\n",
    "    elif usecols is not None:\n",
    "        usecols = set(usecols)\n",
    "        df = df[[col for col in meta.columns if col in usecols]]\n",
    "\n",
    "    # Cast by building a new frame.  Assigning through\n",
    "    # ``df.loc[:, col] = ...`` writes into the existing column in place on\n",
    "    # pandas >= 2.0 and therefore keeps its old dtype, so the cast was lost.\n",
    "    df = df.astype(meta.dtypes.to_dict())\n",
    "\n",
    "    # nasty hack!\n",
    "    # For an empty result, attach an object under ``_loc`` that hands back\n",
    "    # the frame itself for any call or subscript.\n",
    "    # NOTE(review): presumably this lets index-based slicing of an empty\n",
    "    # partition succeed - confirm which caller relies on it.\n",
    "    if len(df) == 0:\n",
    "        class fake_loc:\n",
    "            def __init__(self, obj):\n",
    "                self.obj = obj\n",
    "            def __call__(self, *args):\n",
    "                return self.obj\n",
    "            def __getitem__(self, *args):\n",
    "                return self.obj\n",
    "        df._loc = fake_loc(df)\n",
    "    return df\n",
    "\n",
    "\n",
    "def daskify_pairix_block(filepath, chromsizes, chrom1, chrom2=None,\n",
    "                         columns=None, dtypes=None, usecols=None,\n",
    "                         chunk_level=2):\n",
    "    \"\"\"Build a dask DataFrame over one ``chrom1``/``chrom2`` block of a pairix file.\n",
    "\n",
    "    The span ``0 .. chromsizes[chrom1]`` is cut at the bin starts stored in\n",
    "    ``LEVEL[chunk_level]``; each piece becomes one partition that is loaded\n",
    "    by ``_fetch_region`` when the graph is executed.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    filepath : str\n",
    "        Path of the pairs file (read here for metadata, queried per task).\n",
    "    chromsizes : mapping\n",
    "        Indexed by chromosome name to obtain its length.\n",
    "    chrom1, chrom2 : str\n",
    "        Chromosomes of the block; forwarded to ``_fetch_region``.\n",
    "    columns, dtypes, usecols : optional\n",
    "        Passed to ``pd.read_csv`` as ``names``, ``dtype`` and ``usecols``.\n",
    "    chunk_level : int\n",
    "        Key into ``LEVEL`` selecting the partition edges.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dask DataFrame\n",
    "        One partition per coordinate span.\n",
    "\n",
    "    Notes\n",
    "    -----\n",
    "    NOTE(review): the divisions are derived from genomic coordinates while\n",
    "    the partitions built by ``_fetch_region`` keep the default positional\n",
    "    index, so treat them as nominal - confirm before relying on index-based\n",
    "    operations.\n",
    "    \"\"\"\n",
    "    chrom_len = chromsizes[chrom1]\n",
    "\n",
    "    # Parse a sample of rows to learn the column names and dtypes, then keep\n",
    "    # only the empty frame.  Using ``nrows`` instead of an open iterator\n",
    "    # lets pandas close the file once the sample has been read.\n",
    "    meta = pd.read_csv(\n",
    "        filepath,\n",
    "        sep='\\t',\n",
    "        comment='#',\n",
    "        header=None,\n",
    "        names=columns,\n",
    "        dtype=dtypes,\n",
    "        usecols=usecols,\n",
    "        nrows=1024).iloc[0:0]\n",
    "\n",
    "    # Make a unique task name\n",
    "    token = tokenize(filepath, chromsizes, chrom1, chrom2,\n",
    "                     columns, dtypes, usecols, chunk_level)\n",
    "    task_name = 'daskify-pairix-' + token\n",
    "\n",
    "    # Partition edges: every bin start below the chromosome length, closed\n",
    "    # off by the length itself.\n",
    "    edges = LEVEL[chunk_level]\n",
    "    edges = edges[:np.searchsorted(edges, chrom_len)]\n",
    "    if edges[-1] != chrom_len:\n",
    "        edges = np.r_[edges, chrom_len]\n",
    "\n",
    "    # Build the task graph.  dask derives the partition count as\n",
    "    # len(divisions) - 1, so the divisions must open with the first lower\n",
    "    # bound; without it the last span of every block was silently dropped.\n",
    "    dsk = {}\n",
    "    divisions = [edges[0]]\n",
    "    for i, (lo, hi) in enumerate(zip(edges[:-1], edges[1:])):\n",
    "        divisions.append(hi-1)\n",
    "        dsk[task_name, i] = (_fetch_region,\n",
    "                             filepath, chromsizes, slice(lo, hi),\n",
    "                             chrom1, chrom2, columns, usecols, meta)\n",
    "\n",
    "    # Generate ddf from dask graph\n",
    "    return new_dd_object(dsk, task_name, meta, tuple(divisions))\n",
    "\n",
    "\n",
    "def daskify_pairix(filepath, chromsizes, **kwargs):\n",
    "    \"\"\"Wrap every indexed block of a pairix file in a dask DataFrame.\n",
    "\n",
    "    Block names reported by the index are split on ``'|'`` into a chromosome\n",
    "    pair; pairs with a chromosome missing from ``chromsizes`` are skipped.\n",
    "    Extra keyword arguments are forwarded to ``daskify_pairix_block``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    OrderedDict\n",
    "        ``(chrom1, chrom2)`` -> dask DataFrame, in index order.\n",
    "    \"\"\"\n",
    "    handle = pypairix.open(filepath)\n",
    "    frames = OrderedDict()\n",
    "    for blockname in handle.get_blocknames():\n",
    "        chrom1, chrom2 = blockname.split('|')\n",
    "        if chrom1 not in chromsizes or chrom2 not in chromsizes:\n",
    "            continue\n",
    "        frames[chrom1, chrom2] = daskify_pairix_block(\n",
    "            filepath, chromsizes, chrom1, chrom2, **kwargs)\n",
    "    return frames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "25da0068",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One dask DataFrame per chromosome-pair block of the pairs file;\n",
    "# chunk_level=0 partitions each block at the LEVEL[0] edges.\n",
    "pairs = daskify_pairix(\n",
    "    'NIPBL_R1.nodups.pairs.gz',\n",
    "    chromsizes,\n",
    "    columns=[\n",
    "        'read_id',\n",
    "        'chrom1', 'pos1',\n",
    "        'chrom2', 'pos2',\n",
    "        'strand1', 'strand2',\n",
    "        'pair_type',\n",
    "    ],\n",
    "    chunk_level=0,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b657f9d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "\n",
    "# Flatten the partitions of every block into one dask DataFrame, then\n",
    "# cast both position columns to int64.\n",
    "pairs_concat = dd.from_delayed(\n",
    "    list(itertools.chain.from_iterable(\n",
    "        block.to_delayed() for block in pairs.values()\n",
    "    ))\n",
    ").astype({'pos1': 'int64', 'pos2': 'int64'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9e866592",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><strong>Dask DataFrame Structure:</strong></div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>read_id</th>\n",
       "      <th>chrom1</th>\n",
       "      <th>pos1</th>\n",
       "      <th>chrom2</th>\n",
       "      <th>pos2</th>\n",
       "      <th>strand1</th>\n",
       "      <th>strand2</th>\n",
       "      <th>pair_type</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>npartitions=382</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>string</td>\n",
       "      <td>string</td>\n",
       "      <td>int64</td>\n",
       "      <td>string</td>\n",
       "      <td>int64</td>\n",
       "      <td>string</td>\n",
       "      <td>string</td>\n",
       "      <td>string</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<div>Dask Name: astype, 3 expressions</div>"
      ],
      "text/plain": [
       "Dask DataFrame Structure:\n",
       "                read_id  chrom1   pos1  chrom2   pos2 strand1 strand2 pair_type\n",
       "npartitions=382                                                                \n",
       "                 string  string  int64  string  int64  string  string    string\n",
       "                    ...     ...    ...     ...    ...     ...     ...       ...\n",
       "...                 ...     ...    ...     ...    ...     ...     ...       ...\n",
       "                    ...     ...    ...     ...    ...     ...     ...       ...\n",
       "                    ...     ...    ...     ...    ...     ...     ...       ...\n",
       "Dask Name: astype, 3 expressions\n",
       "Expr=AsType(frame=FromDelayed(1a2469e), dtypes={'pos1': 'int64', 'pos2': 'int64'})"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the structure of the dask DataFrame (column dtypes, partition count).\n",
    "pairs_concat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "a331f83f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>read_id</th>\n",
       "      <th>chrom1</th>\n",
       "      <th>pos1</th>\n",
       "      <th>chrom2</th>\n",
       "      <th>pos2</th>\n",
       "      <th>strand1</th>\n",
       "      <th>strand2</th>\n",
       "      <th>pair_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3000037</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3139797</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3000088</td>\n",
       "      <td>chr1</td>\n",
       "      <td>42957984</td>\n",
       "      <td>-</td>\n",
       "      <td>+</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3000112</td>\n",
       "      <td>chr1</td>\n",
       "      <td>5448692</td>\n",
       "      <td>-</td>\n",
       "      <td>+</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3000227</td>\n",
       "      <td>chr1</td>\n",
       "      <td>27069655</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3000228</td>\n",
       "      <td>chr1</td>\n",
       "      <td>87485253</td>\n",
       "      <td>+</td>\n",
       "      <td>+</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3921</th>\n",
       "      <td>.</td>\n",
       "      <td>chrX</td>\n",
       "      <td>133707813</td>\n",
       "      <td>chrY</td>\n",
       "      <td>581668</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3922</th>\n",
       "      <td>.</td>\n",
       "      <td>chrX</td>\n",
       "      <td>133811937</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2889436</td>\n",
       "      <td>-</td>\n",
       "      <td>+</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3923</th>\n",
       "      <td>.</td>\n",
       "      <td>chrX</td>\n",
       "      <td>133914589</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2899409</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3924</th>\n",
       "      <td>.</td>\n",
       "      <td>chrX</td>\n",
       "      <td>133920996</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2897236</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3925</th>\n",
       "      <td>.</td>\n",
       "      <td>chrX</td>\n",
       "      <td>134027770</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2887831</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>45384978 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     read_id chrom1       pos1 chrom2      pos2 strand1 strand2 pair_type\n",
       "0          .   chr1    3000037   chr1   3139797       -       -        LL\n",
       "1          .   chr1    3000088   chr1  42957984       -       +        LL\n",
       "2          .   chr1    3000112   chr1   5448692       -       +        LL\n",
       "3          .   chr1    3000227   chr1  27069655       +       -        LL\n",
       "4          .   chr1    3000228   chr1  87485253       +       +        LL\n",
       "...      ...    ...        ...    ...       ...     ...     ...       ...\n",
       "3921       .   chrX  133707813   chrY    581668       +       -        LL\n",
       "3922       .   chrX  133811937   chrY   2889436       -       +        LL\n",
       "3923       .   chrX  133914589   chrY   2899409       +       -        LL\n",
       "3924       .   chrX  133920996   chrY   2897236       -       -        LL\n",
       "3925       .   chrX  134027770   chrY   2887831       -       -        LL\n",
       "\n",
       "[45384978 rows x 8 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Materialise every partition and display the resulting frame\n",
    "# (45,384,978 rows in the recorded output).\n",
    "pairs_concat.compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "4c0f077b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added step: sort the pairs by chromosome pair, positions and strands.\n",
    "# Recorded timings:\n",
    "#   sort + compute = 7m 50s, then to_parquet = 21s\n",
    "#   sort = 0s, then compute + to_parquet = 14m\n",
    "sorted_df = pairs_concat.sort_values(\n",
    "    by=['chrom1', 'chrom2', 'pos1', 'pos2', 'strand1', 'strand2'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "cbdc3013",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Note: to_parquet writes each partition to a separate file inside the destination directory, which is not the single-file output we are looking for.\n",
    "\"\"\" \n",
    "path : string or pathlib.Path\n",
    "        Destination directory for data.  Prepend with protocol like ``s3://``\n",
    "        or ``hdfs://`` for remote data.\n",
    "compression : string or dict, default 'snappy'\n",
    "        Either a string like ``\"snappy\"`` or a dictionary mapping column names\n",
    "        to compressors like ``{\"name\": \"gzip\", \"values\": \"snappy\"}``. Defaults\n",
    "        to ``\"snappy\"``.\n",
    "compute : bool, default True\n",
    "        If ``True`` (default) then the result is computed immediately. If\n",
    "        ``False`` then a ``dask.dataframe.Scalar`` object is returned for\n",
    "        future computation.\n",
    "schema : pyarrow.Schema, dict, \"infer\", or None, default \"infer\"\n",
    "        Global schema to use for the output dataset. Defaults to \"infer\", which\n",
    "        will infer the schema from the dask dataframe metadata. This is usually\n",
    "        sufficient for common schemas, but notably will fail for ``object``\n",
    "        dtype columns that contain things other than strings. These columns\n",
    "        will require an explicit schema be specified. The schema for a subset\n",
    "        of columns can be overridden by passing in a dict of column names to\n",
    "        pyarrow types (for example ``schema={\"field\": pa.string()}``); columns\n",
    "        not present in this dict will still be automatically inferred.\n",
    "        Alternatively, a full ``pyarrow.Schema`` may be passed, in which case\n",
    "        no schema inference will be done. Passing in ``schema=None`` will\n",
    "        disable the use of a global file schema - each written file may use a\n",
    "        different schema dependent on the dtypes of the corresponding\n",
    "        partition.\n",
    "\"\"\"\n",
    "\n",
    "# Write the sorted frame to 'sortCompute.parquet' with snappy compression,\n",
    "# computing immediately.\n",
    "out = sorted_df.to_parquet(\n",
    "    'sortCompute.parquet',\n",
    "    compression='snappy',\n",
    "    compute=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "f15833ff",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><strong>Dask DataFrame Structure:</strong></div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>read_id</th>\n",
       "      <th>chrom1</th>\n",
       "      <th>pos1</th>\n",
       "      <th>chrom2</th>\n",
       "      <th>pos2</th>\n",
       "      <th>strand1</th>\n",
       "      <th>strand2</th>\n",
       "      <th>pair_type</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>npartitions=19</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>string</td>\n",
       "      <td>string</td>\n",
       "      <td>int64</td>\n",
       "      <td>string</td>\n",
       "      <td>int64</td>\n",
       "      <td>string</td>\n",
       "      <td>string</td>\n",
       "      <td>string</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<div>Dask Name: read_parquet, 1 expression</div>"
      ],
      "text/plain": [
       "Dask DataFrame Structure:\n",
       "               read_id  chrom1   pos1  chrom2   pos2 strand1 strand2 pair_type\n",
       "npartitions=19                                                                \n",
       "                string  string  int64  string  int64  string  string    string\n",
       "                   ...     ...    ...     ...    ...     ...     ...       ...\n",
       "...                ...     ...    ...     ...    ...     ...     ...       ...\n",
       "                   ...     ...    ...     ...    ...     ...     ...       ...\n",
       "                   ...     ...    ...     ...    ...     ...     ...       ...\n",
       "Dask Name: read_parquet, 1 expression\n",
       "Expr=ReadParquetFSSpec(d6dc111)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Re-open the dataset written to 'sortCompute.parquet' and show its structure.\n",
    "df = dd.read_parquet('sortCompute.parquet')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "a9811098",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>read_id</th>\n",
       "      <th>chrom1</th>\n",
       "      <th>pos1</th>\n",
       "      <th>chrom2</th>\n",
       "      <th>pos2</th>\n",
       "      <th>strand1</th>\n",
       "      <th>strand2</th>\n",
       "      <th>pair_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3000037</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3139797</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3000088</td>\n",
       "      <td>chr1</td>\n",
       "      <td>42957984</td>\n",
       "      <td>-</td>\n",
       "      <td>+</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3000112</td>\n",
       "      <td>chr1</td>\n",
       "      <td>5448692</td>\n",
       "      <td>-</td>\n",
       "      <td>+</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3000227</td>\n",
       "      <td>chr1</td>\n",
       "      <td>27069655</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>.</td>\n",
       "      <td>chr1</td>\n",
       "      <td>3000228</td>\n",
       "      <td>chr1</td>\n",
       "      <td>87485253</td>\n",
       "      <td>+</td>\n",
       "      <td>+</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3921</th>\n",
       "      <td>.</td>\n",
       "      <td>chrX</td>\n",
       "      <td>133707813</td>\n",
       "      <td>chrY</td>\n",
       "      <td>581668</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3922</th>\n",
       "      <td>.</td>\n",
       "      <td>chrX</td>\n",
       "      <td>133811937</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2889436</td>\n",
       "      <td>-</td>\n",
       "      <td>+</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3923</th>\n",
       "      <td>.</td>\n",
       "      <td>chrX</td>\n",
       "      <td>133914589</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2899409</td>\n",
       "      <td>+</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3924</th>\n",
       "      <td>.</td>\n",
       "      <td>chrX</td>\n",
       "      <td>133920996</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2897236</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3925</th>\n",
       "      <td>.</td>\n",
       "      <td>chrX</td>\n",
       "      <td>134027770</td>\n",
       "      <td>chrY</td>\n",
       "      <td>2887831</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>LL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>45384978 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     read_id chrom1       pos1 chrom2      pos2 strand1 strand2 pair_type\n",
       "0          .   chr1    3000037   chr1   3139797       -       -        LL\n",
       "1          .   chr1    3000088   chr1  42957984       -       +        LL\n",
       "2          .   chr1    3000112   chr1   5448692       -       +        LL\n",
       "3          .   chr1    3000227   chr1  27069655       +       -        LL\n",
       "4          .   chr1    3000228   chr1  87485253       +       +        LL\n",
       "...      ...    ...        ...    ...       ...     ...     ...       ...\n",
       "3921       .   chrX  133707813   chrY    581668       +       -        LL\n",
       "3922       .   chrX  133811937   chrY   2889436       -       +        LL\n",
       "3923       .   chrX  133914589   chrY   2899409       +       -        LL\n",
       "3924       .   chrX  133920996   chrY   2897236       -       -        LL\n",
       "3925       .   chrX  134027770   chrY   2887831       -       -        LL\n",
       "\n",
       "[45384978 rows x 8 columns]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Materialise the re-read dataset to inspect its rows.\n",
    "df.compute()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "main",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "d30d1c6f",
	"metadata": {},
	"outputs": [],
	"source": [
	"import bioframe\n",
	"import pypairix\n",
	"import dask.dataframe as dd\n",
	"import dask.array as da"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "4efa19c2",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Build the pairix index for the pairs file.\n",
	"# NOTE(review): force=1 presumably overwrites an existing index - confirm\n",
	"# against the pypairix documentation.\n",
	"pypairix.build_index(\n",
	"    'NIPBL_R1.nodups.pairs.gz',\n",
	"    force=1,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "a8a4bb79",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Chromosome sizes for the 'mm9' assembly, fetched through bioframe.\n",
	"# The bare name on the last line displays the result as the cell output.\n",
	"chromsizes = bioframe.fetch_chromsizes('mm9')\n",
	"chromsizes"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "62a9db8f",
	"metadata": {},
	"outputs": [],
	"source": [
	"from __future__ import division, print_function, absolute_import\n",
	"from collections import OrderedDict\n",
	"\n",
	"import numpy as np\n",
	"import pandas as pd\n",
	"import numba\n",
	"\n",
	"import pypairix\n",
	"import pysam\n",
	"from dask.base import tokenize\n",
	"import dask.dataframe as dd\n",
	"import dask.array as da\n",
	"import dask\n",
	"from dask.dataframe.core import new_dd_object\n",
	"\n",
	"def bin2start(k):\n",
	"    \"\"\"Return the start coordinate of bin number(s) ``k``.\n",
	"\n",
	"    The level of a bin is ``floor(log2(7*k + 1) / 3)``.  Bins on one level\n",
	"    each span ``2**(29 - 3*level)`` positions and are numbered consecutively\n",
	"    starting from ``(2**(3*level) - 1) // 7``.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    k : int or numpy.ndarray of int\n",
	"        Bin number(s).\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    numpy integer or numpy.ndarray\n",
	"        ``(k - first_bin_of_level) * bin_size`` for every entry of ``k``.\n",
	"    \"\"\"\n",
	"    lev = np.floor(np.log2(7*k + 1)/3).astype(int)\n",
	"    sl = 2**(29 - 3*lev)       # width of a bin on this level\n",
	"    ol = (2**(3*lev) - 1)//7   # number of the first bin on this level\n",
	"    # The matching end coordinate, (k - ol + 1) * sl, is not part of the\n",
	"    # result and is therefore not computed.\n",
	"    return (k - ol) * sl\n",
	"\n",
	"# Start coordinate of every bin, grouped by level.  Each (first, last) pair\n",
	"# is the half-open range of bin numbers on that level: LEVEL[0] covers the\n",
	"# 8 bins numbered 1-8 and LEVEL[4] the 32768 bins numbered 4681-37448.\n",
	"LEVEL = {\n",
	"    level: bin2start(np.arange(first, last))\n",
	"    for level, (first, last) in enumerate(\n",
	"        [(1, 9), (9, 73), (73, 585), (585, 4681), (4681, 37449)]\n",
	"    )\n",
	"}\n",
	"\n",
	"\n",
	"@numba.jit(\"int32(int32, int32)\")\n",
	"def reg2bin(beg, end):\n",
	" end -= 1\n",
	" if beg >> 14 == end >> 14: \n",
	" return ((1 << 15)-1) // 7 + (beg >> 14)\n",
	" if beg >> 17 == end >> 17: \n",
	" return ((1 << 12)-1) // 7 + (beg >> 17)\n",
	" if beg >> 20 == end >> 20: \n",
	" return ((1 << 9)-1) // 7 + (beg >> 20)\n",
	" if beg >> 23 == end >> 23: \n",
	" return ((1 << 6)-1) // 7 + (beg >> 23)\n",
	" if beg >> 26 == end >> 26: \n",
	" return ((1 << 3)-1) // 7 + (beg >> 26)\n",
	" return 0\n",
	"\n",
	"\n",
	"\n",
	"\n",
	"@numba.jit\n",
	"def reg2bins(rbeg, rend):\n",
	" lst = []\n",
	"\n",
	" rend -= 1 \n",
	" k = 1 + (rbeg >> 26)\n",
	" while k <= (1 + (rend >> 26)):\n",
	" k += 1\n",
	" lst.append(k)\n",
	"\n",
	" k = 9 + (rbeg >> 23)\n",
	" while k <= (9 + (rend >> 23)):\n",
	" k += 1\n",
	" lst.append(k)\n",
	"\n",
	" k = 73 + (rbeg >> 20)\n",
	" while k <= (73 + (rend >> 20)):\n",
	" k += 1\n",
	" lst.append(k)\n",
	"\n",
	" k = 585 + (rbeg >> 17)\n",
	" while k <= (585 + (rend >> 17)):\n",
	" k += 1\n",
	" lst.append(k)\n",
	"\n",
	" k = 4681 + (rbeg >> 14)\n",
	" while k <= (4681 + (rend >> 14)):\n",
	" k += 1\n",
	" lst.append(k)\n",
	"\n",
	" return lst\n",
	"\n",
	"\n",
	"\n",
	"def range_partition(start, stop, step):\n",
	" return ((i, min(i+step, stop))\n",
	" for i in range(start, stop, step))\n",
	"\n",
	"\n",
	"def _fetch_region(filepath, chromsizes, slc, chrom1, chrom2=None, \n",
	" columns=None, usecols=None, meta=None):\n",
	" if chrom2 is None:\n",
	" chrom2 = chrom1\n",
	" if slc is None:\n",
	" start, end = 0, chromsizes[chrom1]\n",
	" else:\n",
	" start, end = slc.start, slc.stop\n",
	" f = pypairix.open(filepath, 'r')\n",
	" df = pd.DataFrame.from_records(\n",
	" f.query2D(chrom1, start, end, chrom2, 0, chromsizes[chrom2]), \n",
	" columns=columns)\n",
	"\n",
	" if not len(df):\n",
	" df = meta.copy()\n",
	" elif usecols is not None:\n",
	" usecols = set(usecols)\n",
	" df = df[[col for col in meta.columns if col in usecols]]\n",
	"\n",
	" for col, dt in meta.dtypes.items():\n",
	" df.loc[:, col] = df.loc[:, col].astype(dt)\n",
	"\n",
	" # nasty hack!\n",
	" if len(df) == 0:\n",
	" class fake_loc:\n",
	" def __init__(self, obj):\n",
	" self.obj = obj\n",
	" def __call__(self, *args):\n",
	" return self.obj\n",
	" def __getitem__(self, *args):\n",
	" return self.obj\n",
	" df._loc = fake_loc(df)\n",
	" return df\n",
	"\n",
	"\n",
	"def daskify_pairix_block(filepath, chromsizes, chrom1, chrom2=None, \n",
	" columns=None, dtypes=None, usecols=None, \n",
	" chunk_level=2):\n",
	" nrows = chromsizes[chrom1]\n",
	" meta = pd.read_csv(\n",
	" filepath, \n",
	" sep='\\t', \n",
	" comment='#', \n",
	" header=None,\n",
	" names=columns,\n",
	" dtype=dtypes,\n",
	" usecols=usecols,\n",
	" iterator=True).read(1024).iloc[0:0]\n",
	"\n",
	" # Make a unique task name\n",
	" token = tokenize(filepath, chromsizes, chrom1, chrom2, \n",
	" columns, dtypes, usecols, chunk_level)\n",
	" task_name = 'daskify-pairix-' + token\n",
	"\n",
	" # Build the task graph\n",
	" divisions = []\n",
	" dsk = {}\n",
	" edges = LEVEL[chunk_level]\n",
	" edges = edges[:np.searchsorted(edges, nrows)]\n",
	" if edges[-1] != nrows:\n",
	" edges = np.r_[edges, nrows]\n",
	" spans = zip(edges[:-1], edges[1:])\n",
	"\n",
	" \n",
	" for i, (lo, hi) in enumerate(spans):\n",
	" divisions.append(hi-1)\n",
	" slc = slice(lo, hi)\n",
	" dsk[task_name, i] = (_fetch_region, \n",
	" filepath, chromsizes, slc, \n",
	" chrom1, chrom2, columns, usecols, meta)\n",
	" \n",
	" # Generate ddf from dask graph\n",
	" #return dd.DataFrame(dsk, task_name, meta, tuple(divisions))\n",
	" return new_dd_object(dsk, task_name, meta, tuple(divisions))\n",
	"\n",
	" #return dd.DataFrame(dsk)\n",
	"\n",
	"\n",
	"def daskify_pairix(filepath, chromsizes, **kwargs):\n",
	" f = pypairix.open(filepath)\n",
	" blocks = [s.split('|') for s in f.get_blocknames()]\n",
	" d = OrderedDict()\n",
	" for chrom1, chrom2 in blocks:\n",
	" if chrom1 in chromsizes and chrom2 in chromsizes:\n",
	" d[chrom1, chrom2] = daskify_pairix_block(\n",
	" filepath, chromsizes, chrom1, chrom2, **kwargs)\n",
	" return d"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "25da0068",
	"metadata": {},
	"outputs": [],
	"source": [
	"pairs = daskify_pairix(\n",
	" 'NIPBL_R1.nodups.pairs.gz', \n",
	" chromsizes,\n",
	" chunk_level=0, # LEVEL 0\n",
	" columns=['read_id', 'chrom1', 'pos1', 'chrom2', 'pos2', 'strand1', 'strand2', 'pair_type']\n",
	")\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "b657f9d4",
	"metadata": {},
	"outputs": [],
	"source": [
	"import itertools\n",
	"pairs_concat = dd.from_delayed(list(itertools.chain.from_iterable(pairs[key].to_delayed() for key in pairs)))\n",
	"pairs_concat = pairs_concat.astype({'pos1': 'int64', 'pos2': 'int64'})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "9e866592",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div><strong>Dask DataFrame Structure:</strong></div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>read_id</th>\n",
	" <th>chrom1</th>\n",
	" <th>pos1</th>\n",
	" <th>chrom2</th>\n",
	" <th>pos2</th>\n",
	" <th>strand1</th>\n",
	" <th>strand2</th>\n",
	" <th>pair_type</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>npartitions=382</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>string</td>\n",
	" <td>string</td>\n",
	" <td>int64</td>\n",
	" <td>string</td>\n",
	" <td>int64</td>\n",
	" <td>string</td>\n",
	" <td>string</td>\n",
	" <td>string</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<div>Dask Name: astype, 3 expressions</div>"
	],
	"text/plain": [
	"Dask DataFrame Structure:\n",
	" read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n",
	"npartitions=382 \n",
	" string string int64 string int64 string string string\n",
	" ... ... ... ... ... ... ... ...\n",
	"... ... ... ... ... ... ... ... ...\n",
	" ... ... ... ... ... ... ... ...\n",
	" ... ... ... ... ... ... ... ...\n",
	"Dask Name: astype, 3 expressions\n",
	"Expr=AsType(frame=FromDelayed(1a2469e), dtypes={'pos1': 'int64', 'pos2': 'int64'})"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pairs_concat"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"id": "a331f83f",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>read_id</th>\n",
	" <th>chrom1</th>\n",
	" <th>pos1</th>\n",
	" <th>chrom2</th>\n",
	" <th>pos2</th>\n",
	" <th>strand1</th>\n",
	" <th>strand2</th>\n",
	" <th>pair_type</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3000037</td>\n",
	" <td>chr1</td>\n",
	" <td>3139797</td>\n",
	" <td>-</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3000088</td>\n",
	" <td>chr1</td>\n",
	" <td>42957984</td>\n",
	" <td>-</td>\n",
	" <td>+</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3000112</td>\n",
	" <td>chr1</td>\n",
	" <td>5448692</td>\n",
	" <td>-</td>\n",
	" <td>+</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3000227</td>\n",
	" <td>chr1</td>\n",
	" <td>27069655</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3000228</td>\n",
	" <td>chr1</td>\n",
	" <td>87485253</td>\n",
	" <td>+</td>\n",
	" <td>+</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3921</th>\n",
	" <td>.</td>\n",
	" <td>chrX</td>\n",
	" <td>133707813</td>\n",
	" <td>chrY</td>\n",
	" <td>581668</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3922</th>\n",
	" <td>.</td>\n",
	" <td>chrX</td>\n",
	" <td>133811937</td>\n",
	" <td>chrY</td>\n",
	" <td>2889436</td>\n",
	" <td>-</td>\n",
	" <td>+</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3923</th>\n",
	" <td>.</td>\n",
	" <td>chrX</td>\n",
	" <td>133914589</td>\n",
	" <td>chrY</td>\n",
	" <td>2899409</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3924</th>\n",
	" <td>.</td>\n",
	" <td>chrX</td>\n",
	" <td>133920996</td>\n",
	" <td>chrY</td>\n",
	" <td>2897236</td>\n",
	" <td>-</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3925</th>\n",
	" <td>.</td>\n",
	" <td>chrX</td>\n",
	" <td>134027770</td>\n",
	" <td>chrY</td>\n",
	" <td>2887831</td>\n",
	" <td>-</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>45384978 rows × 8 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n",
	"0 . chr1 3000037 chr1 3139797 - - LL\n",
	"1 . chr1 3000088 chr1 42957984 - + LL\n",
	"2 . chr1 3000112 chr1 5448692 - + LL\n",
	"3 . chr1 3000227 chr1 27069655 + - LL\n",
	"4 . chr1 3000228 chr1 87485253 + + LL\n",
	"... ... ... ... ... ... ... ... ...\n",
	"3921 . chrX 133707813 chrY 581668 + - LL\n",
	"3922 . chrX 133811937 chrY 2889436 - + LL\n",
	"3923 . chrX 133914589 chrY 2899409 + - LL\n",
	"3924 . chrX 133920996 chrY 2897236 - - LL\n",
	"3925 . chrX 134027770 chrY 2887831 - - LL\n",
	"\n",
	"[45384978 rows x 8 columns]"
	]
	},
	"execution_count": 29,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pairs_concat.compute()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"id": "4c0f077b",
	"metadata": {},
	"outputs": [],
	"source": [
	"#ADDED SORT\n",
	"# Sort Time+Compute=7m 50s; to parquet=21s\n",
	"# Sort: 0s, Compute+to parquet: 14m\n",
	"sorted_df = pairs_concat.sort_values(by=['chrom1', 'chrom2', 'pos1', 'pos2', 'strand1', 'strand2'])\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"id": "cbdc3013",
	"metadata": {},
	"outputs": [],
	"source": [
	"#Each partition will be written to a separate file. => Not what we are looking for\n",
	"\"\"\" \n",
	"path : string or pathlib.Path\n",
	" Destination directory for data. Prepend with protocol like ``s3://``\n",
	" or ``hdfs://`` for remote data.\n",
	"compression : string or dict, default 'snappy'\n",
	" Either a string like ``\"snappy\"`` or a dictionary mapping column names\n",
	" to compressors like ``{\"name\": \"gzip\", \"values\": \"snappy\"}``. Defaults\n",
	" to ``\"snappy\"``.\n",
	"compute : bool, default True\n",
	" If ``True`` (default) then the result is computed immediately. If\n",
	" ``False`` then a ``dask.dataframe.Scalar`` object is returned for\n",
	" future computation.\n",
	"schema : pyarrow.Schema, dict, \"infer\", or None, default \"infer\"\n",
	" Global schema to use for the output dataset. Defaults to \"infer\", which\n",
	" will infer the schema from the dask dataframe metadata. This is usually\n",
	" sufficient for common schemas, but notably will fail for ``object``\n",
	" dtype columns that contain things other than strings. These columns\n",
	" will require an explicit schema be specified. The schema for a subset\n",
	" of columns can be overridden by passing in a dict of column names to\n",
	" pyarrow types (for example ``schema={\"field\": pa.string()}``); columns\n",
	" not present in this dict will still be automatically inferred.\n",
	" Alternatively, a full ``pyarrow.Schema`` may be passed, in which case\n",
	" no schema inference will be done. Passing in ``schema=None`` will\n",
	" disable the use of a global file schema - each written file may use a\n",
	" different schema dependent on the dtypes of the corresponding\n",
	" partition.\n",
	"\"\"\"\n",
	"\n",
	"out = sorted_df.to_parquet(\n",
	" 'sortCompute.parquet', \n",
	" compression='snappy', compute = True) "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"id": "f15833ff",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div><strong>Dask DataFrame Structure:</strong></div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>read_id</th>\n",
	" <th>chrom1</th>\n",
	" <th>pos1</th>\n",
	" <th>chrom2</th>\n",
	" <th>pos2</th>\n",
	" <th>strand1</th>\n",
	" <th>strand2</th>\n",
	" <th>pair_type</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>npartitions=19</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>string</td>\n",
	" <td>string</td>\n",
	" <td>int64</td>\n",
	" <td>string</td>\n",
	" <td>int64</td>\n",
	" <td>string</td>\n",
	" <td>string</td>\n",
	" <td>string</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th></th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<div>Dask Name: read_parquet, 1 expression</div>"
	],
	"text/plain": [
	"Dask DataFrame Structure:\n",
	" read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n",
	"npartitions=19 \n",
	" string string int64 string int64 string string string\n",
	" ... ... ... ... ... ... ... ...\n",
	"... ... ... ... ... ... ... ... ...\n",
	" ... ... ... ... ... ... ... ...\n",
	" ... ... ... ... ... ... ... ...\n",
	"Dask Name: read_parquet, 1 expression\n",
	"Expr=ReadParquetFSSpec(d6dc111)"
	]
	},
	"execution_count": 27,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df = dd.read_parquet('sortCompute.parquet')\n",
	"df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"id": "a9811098",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>read_id</th>\n",
	" <th>chrom1</th>\n",
	" <th>pos1</th>\n",
	" <th>chrom2</th>\n",
	" <th>pos2</th>\n",
	" <th>strand1</th>\n",
	" <th>strand2</th>\n",
	" <th>pair_type</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3000037</td>\n",
	" <td>chr1</td>\n",
	" <td>3139797</td>\n",
	" <td>-</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3000088</td>\n",
	" <td>chr1</td>\n",
	" <td>42957984</td>\n",
	" <td>-</td>\n",
	" <td>+</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3000112</td>\n",
	" <td>chr1</td>\n",
	" <td>5448692</td>\n",
	" <td>-</td>\n",
	" <td>+</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3000227</td>\n",
	" <td>chr1</td>\n",
	" <td>27069655</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>.</td>\n",
	" <td>chr1</td>\n",
	" <td>3000228</td>\n",
	" <td>chr1</td>\n",
	" <td>87485253</td>\n",
	" <td>+</td>\n",
	" <td>+</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>...</th>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3921</th>\n",
	" <td>.</td>\n",
	" <td>chrX</td>\n",
	" <td>133707813</td>\n",
	" <td>chrY</td>\n",
	" <td>581668</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3922</th>\n",
	" <td>.</td>\n",
	" <td>chrX</td>\n",
	" <td>133811937</td>\n",
	" <td>chrY</td>\n",
	" <td>2889436</td>\n",
	" <td>-</td>\n",
	" <td>+</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3923</th>\n",
	" <td>.</td>\n",
	" <td>chrX</td>\n",
	" <td>133914589</td>\n",
	" <td>chrY</td>\n",
	" <td>2899409</td>\n",
	" <td>+</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3924</th>\n",
	" <td>.</td>\n",
	" <td>chrX</td>\n",
	" <td>133920996</td>\n",
	" <td>chrY</td>\n",
	" <td>2897236</td>\n",
	" <td>-</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3925</th>\n",
	" <td>.</td>\n",
	" <td>chrX</td>\n",
	" <td>134027770</td>\n",
	" <td>chrY</td>\n",
	" <td>2887831</td>\n",
	" <td>-</td>\n",
	" <td>-</td>\n",
	" <td>LL</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>45384978 rows × 8 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n",
	"0 . chr1 3000037 chr1 3139797 - - LL\n",
	"1 . chr1 3000088 chr1 42957984 - + LL\n",
	"2 . chr1 3000112 chr1 5448692 - + LL\n",
	"3 . chr1 3000227 chr1 27069655 + - LL\n",
	"4 . chr1 3000228 chr1 87485253 + + LL\n",
	"... ... ... ... ... ... ... ... ...\n",
	"3921 . chrX 133707813 chrY 581668 + - LL\n",
	"3922 . chrX 133811937 chrY 2889436 - + LL\n",
	"3923 . chrX 133914589 chrY 2899409 + - LL\n",
	"3924 . chrX 133920996 chrY 2897236 - - LL\n",
	"3925 . chrX 134027770 chrY 2887831 - - LL\n",
	"\n",
	"[45384978 rows x 8 columns]"
	]
	},
	"execution_count": 28,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df.compute()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "main",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.19"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}