Read a .pairs file, convert it to Parquet, then save and read the converted Parquet file (Dask version)
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d30d1c6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import bioframe\n",
    "import pypairix\n",
    "import dask.dataframe as dd\n",
    "import dask.array as da"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4efa19c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the 2D pairix index on the bgzipped .pairs file;\n",
    "# force=1 overwrites any existing index.\n",
    "pypairix.build_index('NIPBL_R1.nodups.pairs.gz', force=1)"
   ]
  },
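  {
   "cell_type": "markdown",
   "id": "pairix-query-note",
   "metadata": {},
   "source": [
    "As a quick sanity check, the freshly built index can be queried directly with `pypairix` before any dask machinery is involved. A minimal sketch, reusing this notebook's example file and an arbitrary `chr1` region:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "pairix-query-demo",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Query a small 2D block straight from the pairix index (sanity check).\n",
    "tb = pypairix.open('NIPBL_R1.nodups.pairs.gz')\n",
    "it = tb.query2D('chr1', 0, 1_000_000, 'chr1', 0, 1_000_000)\n",
    "for i, row in enumerate(it):\n",
    "    print(row)  # each row is a list of string fields from the .pairs file\n",
    "    if i == 4:\n",
    "        break"
   ]
  },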
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a8a4bb79",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "name\n",
       "chr1 197195432\n",
       "chr2 181748087\n",
       "chr3 159599783\n",
       "chr4 155630120\n",
       "chr5 152537259\n",
       "chr6 149517037\n",
       "chr7 152524553\n",
       "chr8 131738871\n",
       "chr9 124076172\n",
       "chr10 129993255\n",
       "chr11 121843856\n",
       "chr12 121257530\n",
       "chr13 120284312\n",
       "chr14 125194864\n",
       "chr15 103494974\n",
       "chr16 98319150\n",
       "chr17 95272651\n",
       "chr18 90772031\n",
       "chr19 61342430\n",
       "chrX 166650296\n",
       "chrY 15902555\n",
       "chrM 16299\n",
       "Name: length, dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chromsizes = bioframe.fetch_chromsizes('mm9')\n",
    "chromsizes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "62a9db8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import OrderedDict\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import numba\n",
    "\n",
    "import pypairix\n",
    "import pysam\n",
    "from dask.base import tokenize\n",
    "import dask.dataframe as dd\n",
    "import dask.array as da\n",
    "import dask\n",
    "from dask.dataframe.core import new_dd_object\n",
    "\n",
    "\n",
    "def bin2start(k):\n",
    "    # Genomic start coordinate of UCSC/BAM R-tree bin number k.\n",
    "    lev = np.floor(np.log2(7*k + 1)/3).astype(int)\n",
    "    sl = 2**(29 - 3*lev)        # bin size at this level\n",
    "    ol = (2**(3*lev) - 1)//7    # number of bins on all coarser levels\n",
    "    return (k - ol) * sl\n",
    "\n",
    "\n",
    "# Start coordinates of every bin at each level of the binning scheme:\n",
    "# level 0 bins span 2**26 bp, level 4 bins span 2**14 bp.\n",
    "LEVEL = {}\n",
    "LEVEL[0] = bin2start(np.arange(1, 9))\n",
    "LEVEL[1] = bin2start(np.arange(9, 73))\n",
    "LEVEL[2] = bin2start(np.arange(73, 585))\n",
    "LEVEL[3] = bin2start(np.arange(585, 4681))\n",
    "LEVEL[4] = bin2start(np.arange(4681, 37449))\n",
    "\n",
    "\n",
    "@numba.jit(\"int32(int32, int32)\")\n",
    "def reg2bin(beg, end):\n",
    "    # Smallest bin fully containing the zero-based half-open interval [beg, end).\n",
    "    end -= 1\n",
    "    if beg >> 14 == end >> 14:\n",
    "        return ((1 << 15) - 1) // 7 + (beg >> 14)\n",
    "    if beg >> 17 == end >> 17:\n",
    "        return ((1 << 12) - 1) // 7 + (beg >> 17)\n",
    "    if beg >> 20 == end >> 20:\n",
    "        return ((1 << 9) - 1) // 7 + (beg >> 20)\n",
    "    if beg >> 23 == end >> 23:\n",
    "        return ((1 << 6) - 1) // 7 + (beg >> 23)\n",
    "    if beg >> 26 == end >> 26:\n",
    "        return ((1 << 3) - 1) // 7 + (beg >> 26)\n",
    "    return 0\n",
    "\n",
    "\n",
    "@numba.jit\n",
    "def reg2bins(rbeg, rend):\n",
    "    # All bins overlapping the interval [rbeg, rend), level by level.\n",
    "    # Append before incrementing so the first bin at each level is included.\n",
    "    lst = []\n",
    "    rend -= 1\n",
    "\n",
    "    k = 1 + (rbeg >> 26)\n",
    "    while k <= (1 + (rend >> 26)):\n",
    "        lst.append(k)\n",
    "        k += 1\n",
    "\n",
    "    k = 9 + (rbeg >> 23)\n",
    "    while k <= (9 + (rend >> 23)):\n",
    "        lst.append(k)\n",
    "        k += 1\n",
    "\n",
    "    k = 73 + (rbeg >> 20)\n",
    "    while k <= (73 + (rend >> 20)):\n",
    "        lst.append(k)\n",
    "        k += 1\n",
    "\n",
    "    k = 585 + (rbeg >> 17)\n",
    "    while k <= (585 + (rend >> 17)):\n",
    "        lst.append(k)\n",
    "        k += 1\n",
    "\n",
    "    k = 4681 + (rbeg >> 14)\n",
    "    while k <= (4681 + (rend >> 14)):\n",
    "        lst.append(k)\n",
    "        k += 1\n",
    "\n",
    "    return lst\n",
    "\n",
    "\n",
    "def range_partition(start, stop, step):\n",
    "    return ((i, min(i + step, stop))\n",
    "            for i in range(start, stop, step))\n",
    "\n",
    "\n",
    "def _fetch_region(filepath, chromsizes, slc, chrom1, chrom2=None,\n",
    "                  columns=None, usecols=None, meta=None):\n",
    "    # Fetch one rectangular block: a pos1 slice of chrom1 against all of chrom2.\n",
    "    if chrom2 is None:\n",
    "        chrom2 = chrom1\n",
    "    if slc is None:\n",
    "        start, end = 0, chromsizes[chrom1]\n",
    "    else:\n",
    "        start, end = slc.start, slc.stop\n",
    "    f = pypairix.open(filepath, 'r')\n",
    "    df = pd.DataFrame.from_records(\n",
    "        f.query2D(chrom1, start, end, chrom2, 0, chromsizes[chrom2]),\n",
    "        columns=columns)\n",
    "\n",
    "    if not len(df):\n",
    "        df = meta.copy()\n",
    "    elif usecols is not None:\n",
    "        usecols = set(usecols)\n",
    "        df = df[[col for col in meta.columns if col in usecols]]\n",
    "\n",
    "    for col, dt in meta.dtypes.items():\n",
    "        df.loc[:, col] = df.loc[:, col].astype(dt)\n",
    "\n",
    "    # nasty hack! give empty frames a .loc stand-in that returns the frame itself\n",
    "    if len(df) == 0:\n",
    "        class fake_loc:\n",
    "            def __init__(self, obj):\n",
    "                self.obj = obj\n",
    "            def __call__(self, *args):\n",
    "                return self.obj\n",
    "            def __getitem__(self, *args):\n",
    "                return self.obj\n",
    "        df._loc = fake_loc(df)\n",
    "    return df\n",
    "\n",
    "\n",
    "def daskify_pairix_block(filepath, chromsizes, chrom1, chrom2=None,\n",
    "                         columns=None, dtypes=None, usecols=None,\n",
    "                         chunk_level=2):\n",
    "    nrows = chromsizes[chrom1]\n",
    "    # Read a small sample to build the empty meta frame that dask needs.\n",
    "    meta = pd.read_csv(\n",
    "        filepath,\n",
    "        sep='\\t',\n",
    "        comment='#',\n",
    "        header=None,\n",
    "        names=columns,\n",
    "        dtype=dtypes,\n",
    "        usecols=usecols,\n",
    "        iterator=True).read(1024).iloc[0:0]\n",
    "\n",
    "    # Make a unique task name\n",
    "    token = tokenize(filepath, chromsizes, chrom1, chrom2,\n",
    "                     columns, dtypes, usecols, chunk_level)\n",
    "    task_name = 'daskify-pairix-' + token\n",
    "\n",
    "    # Build the task graph: one task per genomic chunk of chrom1\n",
    "    dsk = {}\n",
    "    edges = LEVEL[chunk_level]\n",
    "    edges = edges[:np.searchsorted(edges, nrows)]\n",
    "    if edges[-1] != nrows:\n",
    "        edges = np.r_[edges, nrows]\n",
    "    spans = zip(edges[:-1], edges[1:])\n",
    "\n",
    "    # Divisions need npartitions + 1 boundary values\n",
    "    divisions = [0]\n",
    "    for i, (lo, hi) in enumerate(spans):\n",
    "        divisions.append(hi - 1)\n",
    "        slc = slice(lo, hi)\n",
    "        dsk[task_name, i] = (_fetch_region,\n",
    "                             filepath, chromsizes, slc,\n",
    "                             chrom1, chrom2, columns, usecols, meta)\n",
    "\n",
    "    # Generate the dask DataFrame from the task graph\n",
    "    return new_dd_object(dsk, task_name, meta, tuple(divisions))\n",
    "\n",
    "\n",
    "def daskify_pairix(filepath, chromsizes, **kwargs):\n",
    "    # One lazy DataFrame per (chrom1, chrom2) block in the pairix index.\n",
    "    f = pypairix.open(filepath)\n",
    "    blocks = [s.split('|') for s in f.get_blocknames()]\n",
    "    d = OrderedDict()\n",
    "    for chrom1, chrom2 in blocks:\n",
    "        if chrom1 in chromsizes and chrom2 in chromsizes:\n",
    "            d[chrom1, chrom2] = daskify_pairix_block(\n",
    "                filepath, chromsizes, chrom1, chrom2, **kwargs)\n",
    "    return d"
   ]
  },
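  {
   "cell_type": "markdown",
   "id": "binning-sanity-note",
   "metadata": {},
   "source": [
    "A small sanity check of the binning helpers above (the expected values follow from the UCSC/BAM binning arithmetic): a region inside the first 16 kb lands in bin 4681, the first bin of the finest level, and `bin2start` maps that bin back to coordinate 0. The `LEVEL` arrays used for chunking hold 8, 64, 512, 4096 and 32768 bin starts, so `chunk_level=0` gives ~67 Mb (2**26 bp) chunks and `chunk_level=4` gives 16 kb (2**14 bp) chunks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "binning-sanity-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity-check the binning helpers defined above.\n",
    "assert reg2bin(0, 2**14) == 4681   # first 16 kb interval -> first finest-level bin\n",
    "assert bin2start(4681) == 0        # ...whose start coordinate is 0\n",
    "assert reg2bin(0, 2**26) == 1      # a 2**26 bp interval -> first coarsest-level bin\n",
    "print([len(LEVEL[i]) for i in range(5)])  # 8, 64, 512, 4096, 32768 bins per level\n",
    "print(LEVEL[0][:3])                # level-0 chunk edges are multiples of 2**26"
   ]
  },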
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "25da0068",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('chr1', 'chr1') lalala Dask DataFrame Structure:\n",
      " read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n",
      "npartitions=2 \n",
      "67108863 string string int64 string int64 string string string\n",
      "134217727 ... ... ... ... ... ... ... ...\n",
      "197195431 ... ... ... ... ... ... ... ...\n",
      "Dask Name: to_pyarrow_string, 2 graph layers\n",
      "('chr1', 'chr10') lalala Dask DataFrame Structure:\n",
      " read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n",
      "npartitions=2 \n",
      "67108863 string string int64 string int64 string string string\n",
      "134217727 ... ... ... ... ... ... ... ...\n",
      "197195431 ... ... ... ... ... ... ... ...\n",
      "Dask Name: to_pyarrow_string, 2 graph layers\n",
      "('chr1', 'chr11') lalala Dask DataFrame Structure:\n",
      " read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n",
      "npartitions=2 \n",
      "67108863 string string int64 string int64 string string string\n",
      "134217727 ... ... ... ... ... ... ... ...\n",
      "197195431 ... ... ... ... ... ... ... ...\n",
      "Dask Name: to_pyarrow_string, 2 graph layers\n"
     ]
    }
   ],
   "source": [
    "pairs = daskify_pairix(\n",
    "    'NIPBL_R1.nodups.pairs.gz',\n",
    "    chromsizes,\n",
    "    chunk_level=0,  # LEVEL 0: ~67 Mb (2**26 bp) chunks\n",
    "    columns=['read_id', 'chrom1', 'pos1', 'chrom2', 'pos2', 'strand1', 'strand2', 'pair_type']\n",
    ")\n",
    "\n",
    "# Print the first three (chrom1, chrom2) blocks\n",
    "counter = 0\n",
    "for key, value in pairs.items():\n",
    "    print(key, \"lalala\", value)\n",
    "    counter += 1\n",
    "    if counter == 3:\n",
    "        break\n"
   ]
  },
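  {
   "cell_type": "markdown",
   "id": "block-head-note",
   "metadata": {},
   "source": [
    "Each entry of `pairs` is a lazy dask DataFrame. Pulling a few rows from one block reads only that block's first partition from disk, which is a cheap way to eyeball the data. A minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "block-head-demo",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Materialize a few rows from a single cis block; only the first\n",
    "# partition of ('chr1', 'chr1') is actually read.\n",
    "pairs[('chr1', 'chr1')].head(5)"
   ]
  },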
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b657f9d4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><strong>Dask DataFrame Structure:</strong></div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       " <thead>\n",
       " <tr style=\"text-align: right;\">\n",
       " <th></th>\n",
       " <th>read_id</th>\n",
       " <th>chrom1</th>\n",
       " <th>pos1</th>\n",
       " <th>chrom2</th>\n",
       " <th>pos2</th>\n",
       " <th>strand1</th>\n",
       " <th>strand2</th>\n",
       " <th>pair_type</th>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>npartitions=382</th>\n",
       " <th></th>\n",
       " <th></th>\n",
       " <th></th>\n",
       " <th></th>\n",
       " <th></th>\n",
       " <th></th>\n",
       " <th></th>\n",
       " <th></th>\n",
       " </tr>\n",
       " </thead>\n",
       " <tbody>\n",
       " <tr>\n",
       " <th></th>\n",
       " <td>string</td>\n",
       " <td>string</td>\n",
       " <td>string</td>\n",
       " <td>string</td>\n",
       " <td>string</td>\n",
       " <td>string</td>\n",
       " <td>string</td>\n",
       " <td>string</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th></th>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>...</th>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th></th>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th></th>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " </tr>\n",
       " </tbody>\n",
       "</table>\n",
       "<div>Dask Name: fromdelayed, 2 expressions</div>"
      ],
      "text/plain": [
       "Dask DataFrame Structure:\n",
       " read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n",
       "npartitions=382 \n",
       " string string string string string string string string\n",
       " ... ... ... ... ... ... ... ...\n",
       "... ... ... ... ... ... ... ... ...\n",
       " ... ... ... ... ... ... ... ...\n",
       " ... ... ... ... ... ... ... ...\n",
       "Dask Name: fromdelayed, 2 expressions\n",
       "Expr=FromDelayed(1a2469e)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import itertools\n",
    "\n",
    "# Chain the delayed partitions of every block into one dask DataFrame\n",
    "pairs_concat = dd.from_delayed(list(itertools.chain.from_iterable(\n",
    "    pairs[key].to_delayed() for key in pairs)))\n",
    "pairs_concat"
   ]
  },
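  {
   "cell_type": "markdown",
   "id": "divisions-note",
   "metadata": {},
   "source": [
    "Note that `dd.from_delayed` does not carry the per-block divisions over, so the concatenated frame no longer knows its partition boundaries. Worth checking before relying on any index-based operations; a small sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "divisions-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The concatenated frame has partitions but no known divisions.\n",
    "print(pairs_concat.npartitions)      # 382: one per block chunk\n",
    "print(pairs_concat.known_divisions)  # False: divisions were lost in from_delayed\n"
   ]
  },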
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "e482dd65",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dask_expr._collection.DataFrame"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(pairs_concat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "b79bef33",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 read_id\n",
      "<class 'str'>\n",
      "1 chrom1\n",
      "<class 'str'>\n",
      "2 pos1\n",
      "<class 'str'>\n",
      "3 chrom2\n",
      "<class 'str'>\n",
      "4 pos2\n",
      "<class 'str'>\n",
      "5 strand1\n",
      "<class 'str'>\n",
      "6 strand2\n",
      "<class 'str'>\n",
      "7 pair_type\n",
      "<class 'str'>\n"
     ]
    }
   ],
   "source": [
    "# Iterating over a dask DataFrame yields its column names (as in pandas);\n",
    "# nothing has been computed, so no table contents are shown.\n",
    "counter = 0\n",
    "for i, piece in enumerate(pairs_concat):\n",
    "    print(i, piece)\n",
    "    print(type(piece))\n",
    "    counter += 1\n",
    "    if counter >= 60:\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "cbdc3013",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each partition is written to a separate file => not what we are looking\n",
    "# for. From the ``to_parquet`` docstring:\n",
    "\"\"\"\n",
    "path : string or pathlib.Path\n",
    "    Destination directory for data. Prepend with protocol like ``s3://``\n",
    "    or ``hdfs://`` for remote data.\n",
    "compression : string or dict, default 'snappy'\n",
    "    Either a string like ``\"snappy\"`` or a dictionary mapping column names\n",
    "    to compressors like ``{\"name\": \"gzip\", \"values\": \"snappy\"}``. Defaults\n",
    "    to ``\"snappy\"``.\n",
    "compute : bool, default True\n",
    "    If ``True`` (default) then the result is computed immediately. If\n",
    "    ``False`` then a ``dask.dataframe.Scalar`` object is returned for\n",
    "    future computation.\n",
    "schema : pyarrow.Schema, dict, \"infer\", or None, default \"infer\"\n",
    "    Global schema to use for the output dataset. Defaults to \"infer\", which\n",
    "    will infer the schema from the dask dataframe metadata. This is usually\n",
    "    sufficient for common schemas, but notably will fail for ``object``\n",
    "    dtype columns that contain things other than strings. These columns\n",
    "    will require an explicit schema be specified. The schema for a subset\n",
    "    of columns can be overridden by passing in a dict of column names to\n",
    "    pyarrow types (for example ``schema={\"field\": pa.string()}``); columns\n",
    "    not present in this dict will still be automatically inferred.\n",
    "    Alternatively, a full ``pyarrow.Schema`` may be passed, in which case\n",
    "    no schema inference will be done. Passing in ``schema=None`` will\n",
    "    disable the use of a global file schema - each written file may use a\n",
    "    different schema dependent on the dtypes of the corresponding\n",
    "    partition.\n",
    "\"\"\"\n",
    "\n",
    "\n",
    "out = pairs_concat.to_parquet(\n",
    "    'save_it/please/',\n",
    "    compression='snappy', compute=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "40f6bf9f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dask_expr._collection.Scalar"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(out)\n"
   ]
  },
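  {
   "cell_type": "markdown",
   "id": "deferred-write-note",
   "metadata": {},
   "source": [
    "With `compute=False`, `to_parquet` only builds the task graph; nothing lands on disk until the returned `Scalar` is computed. The deferred write has to be triggered before the next cell can find any files under `save_it/please/`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "deferred-write-compute",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute the deferred Parquet write; the per-partition files under\n",
    "# save_it/please/ only appear on disk after this call.\n",
    "out.compute()"
   ]
  },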
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0572b4c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "import glob\n",
    "\n",
    "# Path to the folder containing Parquet files\n",
    "folder_path = \"/users/slavska.olesia/save_it/please\"\n",
    "\n",
    "# Use glob to find all Parquet files in the folder\n",
    "parquet_files = glob.glob(f\"{folder_path}/*.parquet\")\n",
    "\n",
    "# Read and concatenate all Parquet files into a single DataFrame\n",
    "df = pl.concat([pl.read_parquet(file) for file in parquet_files])\n",
    "\n",
    "# Save the concatenated DataFrame to a new Parquet file\n",
    "df.write_parquet(\"concatenated.parquet\")\n"
   ]
  },
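  {
   "cell_type": "markdown",
   "id": "read-directory-note",
   "metadata": {},
   "source": [
    "Going through polars is one way to merge the per-partition files into a single Parquet file. Alternatively, dask can read the partitioned directory back directly as one lazy frame with no merge step; a sketch (not what the next cell does):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "read-directory-demo",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative: read the whole partitioned dataset lazily with dask,\n",
    "# without first merging it into a single file.\n",
    "df_dir = dd.read_parquet('save_it/please/')\n",
    "df_dir.npartitions"
   ]
  },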
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f15833ff",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><strong>Dask DataFrame Structure:</strong></div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       " <thead>\n",
       " <tr style=\"text-align: right;\">\n",
       " <th></th>\n",
       " <th>read_id</th>\n",
       " <th>chrom1</th>\n",
       " <th>pos1</th>\n",
       " <th>chrom2</th>\n",
       " <th>pos2</th>\n",
       " <th>strand1</th>\n",
       " <th>strand2</th>\n",
       " <th>pair_type</th>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>npartitions=6</th>\n",
       " <th></th>\n",
       " <th></th>\n",
       " <th></th>\n",
       " <th></th>\n",
       " <th></th>\n",
       " <th></th>\n",
       " <th></th>\n",
       " <th></th>\n",
       " </tr>\n",
       " </thead>\n",
       " <tbody>\n",
       " <tr>\n",
       " <th></th>\n",
       " <td>string</td>\n",
       " <td>string</td>\n",
       " <td>string</td>\n",
       " <td>string</td>\n",
       " <td>string</td>\n",
       " <td>string</td>\n",
       " <td>string</td>\n",
       " <td>string</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th></th>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>...</th>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th></th>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th></th>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " </tr>\n",
       " </tbody>\n",
       "</table>\n",
       "<div>Dask Name: read_parquet, 1 expression</div>"
      ],
      "text/plain": [
       "Dask DataFrame Structure:\n",
       " read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n",
       "npartitions=6 \n",
       " string string string string string string string string\n",
       " ... ... ... ... ... ... ... ...\n",
       "... ... ... ... ... ... ... ... ...\n",
       " ... ... ... ... ... ... ... ...\n",
       " ... ... ... ... ... ... ... ...\n",
       "Dask Name: read_parquet, 1 expression\n",
       "Expr=ReadParquetFSSpec(824662f)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the merged Parquet file back lazily with dask\n",
    "df = dd.read_parquet('concatenated.parquet')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "a9811098",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       " .dataframe tbody tr th:only-of-type {\n",
       " vertical-align: middle;\n",
       " }\n",
       "\n",
       " .dataframe tbody tr th {\n",
       " vertical-align: top;\n",
       " }\n",
       "\n",
       " .dataframe thead th {\n",
       " text-align: right;\n",
       " }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       " <thead>\n",
       " <tr style=\"text-align: right;\">\n",
       " <th></th>\n",
       " <th>read_id</th>\n",
       " <th>chrom1</th>\n",
       " <th>pos1</th>\n",
       " <th>chrom2</th>\n",
       " <th>pos2</th>\n",
       " <th>strand1</th>\n",
       " <th>strand2</th>\n",
       " <th>pair_type</th>\n",
       " </tr>\n",
       " </thead>\n",
       " <tbody>\n",
       " <tr>\n",
       " <th>0</th>\n",
       " <td>.</td>\n",
       " <td>chr14</td>\n",
       " <td>4789556</td>\n",
       " <td>chrM</td>\n",
       " <td>11976</td>\n",
       " <td>-</td>\n",
       " <td>+</td>\n",
       " <td>LL</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>1</th>\n",
       " <td>.</td>\n",
       " <td>chr14</td>\n",
       " <td>8277792</td>\n",
       " <td>chrM</td>\n",
       " <td>11338</td>\n",
       " <td>+</td>\n",
       " <td>-</td>\n",
       " <td>LL</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>2</th>\n",
       " <td>.</td>\n",
       " <td>chr14</td>\n",
       " <td>8537590</td>\n",
       " <td>chrM</td>\n",
       " <td>11838</td>\n",
       " <td>-</td>\n",
       " <td>-</td>\n",
       " <td>LL</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>3</th>\n",
       " <td>.</td>\n",
       " <td>chr14</td>\n",
       " <td>9119244</td>\n",
       " <td>chrM</td>\n",
       " <td>11882</td>\n",
       " <td>-</td>\n",
       " <td>+</td>\n",
       " <td>LL</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>4</th>\n",
       " <td>.</td>\n",
       " <td>chr14</td>\n",
       " <td>9384886</td>\n",
       " <td>chrM</td>\n",
       " <td>11874</td>\n",
       " <td>-</td>\n",
       " <td>+</td>\n",
       " <td>LL</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>...</th>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>4328473</th>\n",
       " <td>.</td>\n",
       " <td>chr16</td>\n",
       " <td>67107339</td>\n",
       " <td>chr16</td>\n",
       " <td>67119232</td>\n",
       " <td>-</td>\n",
       " <td>-</td>\n",
       " <td>LL</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>4328474</th>\n",
       " <td>.</td>\n",
       " <td>chr16</td>\n",
       " <td>67107341</td>\n",
       " <td>chr16</td>\n",
       " <td>70587067</td>\n",
       " <td>-</td>\n",
       " <td>+</td>\n",
       " <td>LL</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>4328475</th>\n",
       " <td>.</td>\n",
       " <td>chr16</td>\n",
       " <td>67107420</td>\n",
       " <td>chr16</td>\n",
       " <td>67128900</td>\n",
       " <td>-</td>\n",
       " <td>-</td>\n",
       " <td>LL</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>4328476</th>\n",
       " <td>.</td>\n",
       " <td>chr16</td>\n",
       " <td>67107621</td>\n",
       " <td>chr16</td>\n",
       " <td>68638554</td>\n",
       " <td>-</td>\n",
       " <td>+</td>\n",
       " <td>LL</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>4328477</th>\n",
       " <td>.</td>\n",
       " <td>chr16</td>\n",
       " <td>67107929</td>\n",
       " <td>chr16</td>\n",
       " <td>67167258</td>\n",
       " <td>-</td>\n",
       " <td>+</td>\n",
       " <td>LL</td>\n",
       " </tr>\n",
       " </tbody>\n",
       "</table>\n",
       "<p>45384978 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       " read_id chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type\n",
       "0 . chr14 4789556 chrM 11976 - + LL\n",
       "1 . chr14 8277792 chrM 11338 + - LL\n",
       "2 . chr14 8537590 chrM 11838 - - LL\n",
       "3 . chr14 9119244 chrM 11882 - + LL\n",
       "4 . chr14 9384886 chrM 11874 - + LL\n",
       "... ... ... ... ... ... ... ... ...\n",
       "4328473 . chr16 67107339 chr16 67119232 - - LL\n",
       "4328474 . chr16 67107341 chr16 70587067 - + LL\n",
       "4328475 . chr16 67107420 chr16 67128900 - - LL\n",
       "4328476 . chr16 67107621 chr16 68638554 - + LL\n",
       "4328477 . chr16 67107929 chr16 67167258 - + LL\n",
       "\n",
       "[45384978 rows x 8 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Materialize the whole table in memory\n",
    "df.compute()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "main",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}