ayaksvals · August 19, 2024 17:23
diff --git a/parquet_sort(PolarsVersion).ipynb b/parquet_sort(PolarsVersion).ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d30d1c6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import bioframe\n",
    "import pypairix\n",
    "import dask.dataframe as dd\n",
    "import dask.array as da\n",
    "import polars as pl\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "f15833ff",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 151 µs, sys: 49 µs, total: 200 µs\n",
      "Wall time: 208 µs\n"
     ]
    }
   ],
   "source": [
    "# add later properties, like: columns, parallel, use_statistics, low_memory, memory_map+use_pyarrow=True.\n",
    "\n",
    "%time df = pl.scan_parquet('pairsToPolarsSnappy.parquet', hive_schema={\"read_id\": pl.String, \"chrom1\": pl.String, \"pos1\":pl.UInt32, \"chrom2\": pl.String, \"pos2\":pl.UInt32, \"strand1\":pl.String, \"strand2\":pl.String, \"pairs_type\":pl.String})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "4be098d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "def parse_chromosome(chromosome):\n",
    "    # Default values for sorting\n",
    "    primary = 100\n",
    "    secondary = 10000\n",
    "    tertiary = \"\"\n",
    "    \n",
    "    # Match patterns like \"chr1\", \"chrX\", \"chr4_GL456216_random\"\n",
    "    match = re.match(r'chr(\\d+|X|Y|Un)(?:_GL(\\d+)|.*)?(_random)?', chromosome)\n",
    "    if match:\n",
    "        main_part, gl_part, random_part = match.groups()\n",
    "        \n",
    "        # Primary sorting key\n",
    "        if main_part.isdigit():\n",
    "            primary = int(main_part)\n",
    "        elif main_part == 'M':\n",
    "            primary = 23\n",
    "        elif main_part == 'X':\n",
    "            primary = 24\n",
    "        elif main_part == 'Y':\n",
    "            primary = 25\n",
    "        \n",
    "        # Secondary sorting key\n",
    "        if gl_part:\n",
    "            secondary = int(gl_part)\n",
    "        else:\n",
    "            secondary = 9999\n",
    "        \n",
    "        # Tertiary sorting key\n",
    "        tertiary = random_part if random_part else \"\"\n",
    "\n",
    "    return (primary, secondary, tertiary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "2ca6e93f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.97 ms, sys: 0 ns, total: 1.97 ms\n",
      "Wall time: 1.98 ms\n"
     ]
    }
   ],
   "source": [
    "### SORT 1,2,10\n",
    "\n",
    "filtered_df = df.filter(pl.col(\"pos1\").is_not_null())\n",
    "%time filtered_df = filtered_df.with_columns([pl.col(\"chrom1\").map_elements(lambda x: parse_chromosome(x)[0], return_dtype=pl.Int32).alias(\"primary\"),pl.col(\"chrom1\").map_elements(lambda x: parse_chromosome(x)[1], return_dtype=pl.Int32).alias(\"secondary\"),pl.col(\"chrom1\").map_elements(lambda x: parse_chromosome(x)[2], return_dtype=pl.Utf8).alias(\"tertiary\"),pl.col(\"chrom2\").map_elements(lambda x: parse_chromosome(x)[0], return_dtype=pl.Int32).alias(\"primary2\"),pl.col(\"chrom2\").map_elements(lambda x: parse_chromosome(x)[1], return_dtype=pl.Int32).alias(\"secondary2\"),pl.col(\"chrom2\").map_elements(lambda x: parse_chromosome(x)[2], return_dtype=pl.Utf8).alias(\"tertiary2\")])\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "7e453593",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 178 µs, sys: 0 ns, total: 178 µs\n",
      "Wall time: 182 µs\n"
     ]
    }
   ],
   "source": [
    "%time sorted_df = filtered_df.sort([\"primary\", \"secondary\", \"tertiary\", \"primary2\", \"secondary2\", \"tertiary2\", \"pos1\", \"pos2\", \"strand1\", \"strand2\"])\n",
    "\n",
    "sorted_df = sorted_df.drop([\"primary\", \"secondary\", \"tertiary\", \"primary2\", \"secondary2\", \"tertiary2\"])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "9b238ac5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 10min 49s, sys: 1min 10s, total: 12min\n",
      "Wall time: 10min 7s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (65_220_653, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>read_id</th><th>chrom1</th><th>pos1</th><th>chrom2</th><th>pos2</th><th>strand1</th><th>strand2</th><th>pairs_type</th></tr><tr><td>str</td><td>str</td><td>u32</td><td>str</td><td>u32</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000228</td><td>&quot;chr1&quot;</td><td>87485253</td><td>&quot;+&quot;</td><td>&quot;+&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000302</td><td>&quot;chr1&quot;</td><td>3019411</td><td>&quot;-&quot;</td><td>&quot;+&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000898</td><td>&quot;chr1&quot;</td><td>3002388</td><td>&quot;-&quot;</td><td>&quot;+&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3001066</td><td>&quot;chr1&quot;</td><td>3001229</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3001303</td><td>&quot;chr1&quot;</td><td>3001526</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16213</td><td>&quot;chrM&quot;</td><td>16244</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16217</td><td>&quot;chrM&quot;</td><td>16300</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16223</td><td>&quot;chrM&quot;</td><td>16288</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16243</td><td>&quot;chrM&quot;</td><td>16265</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16272</td><td>&quot;chrM&quot;</td><td>16295</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (65_220_653, 8)\n",
       "┌─────────┬────────┬─────────┬────────┬──────────┬─────────┬─────────┬────────────┐\n",
       "│ read_id ┆ chrom1 ┆ pos1    ┆ chrom2 ┆ pos2     ┆ strand1 ┆ strand2 ┆ pairs_type │\n",
       "│ ---     ┆ ---    ┆ ---     ┆ ---    ┆ ---      ┆ ---     ┆ ---     ┆ ---        │\n",
       "│ str     ┆ str    ┆ u32     ┆ str    ┆ u32      ┆ str     ┆ str     ┆ str        │\n",
       "╞═════════╪════════╪═════════╪════════╪══════════╪═════════╪═════════╪════════════╡\n",
       "│ .       ┆ chr1   ┆ 3000228 ┆ chr1   ┆ 87485253 ┆ +       ┆ +       ┆ LL         │\n",
       "│ .       ┆ chr1   ┆ 3000302 ┆ chr1   ┆ 3019411  ┆ -       ┆ +       ┆ LL         │\n",
       "│ .       ┆ chr1   ┆ 3000898 ┆ chr1   ┆ 3002388  ┆ -       ┆ +       ┆ LL         │\n",
       "│ .       ┆ chr1   ┆ 3001066 ┆ chr1   ┆ 3001229  ┆ +       ┆ -       ┆ LL         │\n",
       "│ .       ┆ chr1   ┆ 3001303 ┆ chr1   ┆ 3001526  ┆ +       ┆ -       ┆ LL         │\n",
       "│ …       ┆ …      ┆ …       ┆ …      ┆ …        ┆ …       ┆ …       ┆ …          │\n",
       "│ .       ┆ chrM   ┆ 16213   ┆ chrM   ┆ 16244    ┆ +       ┆ -       ┆ LL         │\n",
       "│ .       ┆ chrM   ┆ 16217   ┆ chrM   ┆ 16300    ┆ -       ┆ -       ┆ LL         │\n",
       "│ .       ┆ chrM   ┆ 16223   ┆ chrM   ┆ 16288    ┆ -       ┆ -       ┆ LL         │\n",
       "│ .       ┆ chrM   ┆ 16243   ┆ chrM   ┆ 16265    ┆ -       ┆ -       ┆ LL         │\n",
       "│ .       ┆ chrM   ┆ 16272   ┆ chrM   ┆ 16295    ┆ -       ┆ -       ┆ LL         │\n",
       "└─────────┴────────┴─────────┴────────┴──────────┴─────────┴─────────┴────────────┘"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time sorted_df.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "4295e534",
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted_df.sink_parquet(\"sortPolars.parquet\", compression=\"snappy\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9811098",
   "metadata": {},
   "outputs": [],
   "source": [
    "### QUICKLY SORT 1,10,2\n",
    "\"\"\"sort pairs files \n",
    "    (the lexicographic order for chromosomes, \n",
    "    the numeric order for the positions, \n",
    "    the lexicographic order for pair types).\n",
    "\"\"\"\n",
    "\n",
    "filtered_df = df.filter(pl.col(\"pos1\").is_not_null())\n",
    "filtered_df.sort([\"chrom1\",\"chrom2\", \"pos1\", \"pos2\", \"strand1\", \"strand2\"]).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "570664b2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (7, 1)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>chromosome</th></tr><tr><td>str</td></tr></thead><tbody><tr><td>&quot;chr1&quot;</td></tr><tr><td>&quot;chr2&quot;</td></tr><tr><td>&quot;chr4_GL456216_random&quot;</td></tr><tr><td>&quot;chr10&quot;</td></tr><tr><td>&quot;chr11&quot;</td></tr><tr><td>&quot;chrX&quot;</td></tr><tr><td>&quot;chrUn_GL456381&quot;</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (7, 1)\n",
       "┌──────────────────────┐\n",
       "│ chromosome           │\n",
       "│ ---                  │\n",
       "│ str                  │\n",
       "╞══════════════════════╡\n",
       "│ chr1                 │\n",
       "│ chr2                 │\n",
       "│ chr4_GL456216_random │\n",
       "│ chr10                │\n",
       "│ chr11                │\n",
       "│ chrX                 │\n",
       "│ chrUn_GL456381       │\n",
       "└──────────────────────┘"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### TEST BLOCK\n",
    "\n",
    "\n",
    "df = pl.DataFrame({\n",
    "    \"chromosome\": [\"chr1\", \"chrX\", \"chr4_GL456216_random\", \"chr10\", \"chrUn_GL456381\", \"chr2\", \"chr11\"]\n",
    "})\n",
    "\n",
    "\n",
    "# Add new columns with parsed values\n",
    "df = df.with_columns([\n",
    "    pl.col(\"chromosome\").map_elements(lambda x: parse_chromosome(x)[0], return_dtype=pl.Int32).alias(\"primary\"),\n",
    "    pl.col(\"chromosome\").map_elements(lambda x: parse_chromosome(x)[1], return_dtype=pl.Int32).alias(\"secondary\"),\n",
    "    pl.col(\"chromosome\").map_elements(lambda x: parse_chromosome(x)[2], return_dtype=pl.Utf8).alias(\"tertiary\")\n",
    "])\n",
    "\n",
    "# Sort by the extracted columns\n",
    "sorted_df = df.sort([\"primary\", \"secondary\", \"tertiary\"])\n",
    "\n",
    "# Drop helper columns if needed\n",
    "sorted_df = sorted_df.drop([\"primary\", \"secondary\", \"tertiary\"])\n",
    "\n",
    "sorted_df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "main",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "d30d1c6f",
	"metadata": {},
	"outputs": [],
	"source": [
	"import bioframe\n",
	"import pypairix\n",
	"import dask.dataframe as dd\n",
	"import dask.array as da\n",
	"import polars as pl\n",
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"id": "f15833ff",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 151 µs, sys: 49 µs, total: 200 µs\n",
	"Wall time: 208 µs\n"
	]
	}
	],
	"source": [
	"# add later properties, like: columns, parallel, use_statistics, low_memory, memory_map+use_pyarrow=True.\n",
	"\n",
	"%time df = pl.scan_parquet('pairsToPolarsSnappy.parquet', hive_schema={\"read_id\": pl.String, \"chrom1\": pl.String, \"pos1\":pl.UInt32, \"chrom2\": pl.String, \"pos2\":pl.UInt32, \"strand1\":pl.String, \"strand2\":pl.String, \"pairs_type\":pl.String})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"id": "4be098d1",
	"metadata": {},
	"outputs": [],
	"source": [
	"import re\n",
	"def parse_chromosome(chromosome):\n",
	" # Default values for sorting\n",
	" primary = 100\n",
	" secondary = 10000\n",
	" tertiary = \"\"\n",
	" \n",
	" # Match patterns like \"chr1\", \"chrX\", \"chr4_GL456216_random\"\n",
	" match = re.match(r'chr(\\d+\|X\|Y\|Un)(?:_GL(\\d+)\|.*)?(_random)?', chromosome)\n",
	" if match:\n",
	" main_part, gl_part, random_part = match.groups()\n",
	" \n",
	" # Primary sorting key\n",
	" if main_part.isdigit():\n",
	" primary = int(main_part)\n",
	" elif main_part == 'M':\n",
	" primary = 23\n",
	" elif main_part == 'X':\n",
	" primary = 24\n",
	" elif main_part == 'Y':\n",
	" primary = 25\n",
	" \n",
	" # Secondary sorting key\n",
	" if gl_part:\n",
	" secondary = int(gl_part)\n",
	" else:\n",
	" secondary = 9999\n",
	" \n",
	" # Tertiary sorting key\n",
	" tertiary = random_part if random_part else \"\"\n",
	"\n",
	" return (primary, secondary, tertiary)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"id": "2ca6e93f",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 1.97 ms, sys: 0 ns, total: 1.97 ms\n",
	"Wall time: 1.98 ms\n"
	]
	}
	],
	"source": [
	"### SORT 1,2,10\n",
	"\n",
	"filtered_df = df.filter(pl.col(\"pos1\").is_not_null())\n",
	"%time filtered_df = filtered_df.with_columns([pl.col(\"chrom1\").map_elements(lambda x: parse_chromosome(x)[0], return_dtype=pl.Int32).alias(\"primary\"),pl.col(\"chrom1\").map_elements(lambda x: parse_chromosome(x)[1], return_dtype=pl.Int32).alias(\"secondary\"),pl.col(\"chrom1\").map_elements(lambda x: parse_chromosome(x)[2], return_dtype=pl.Utf8).alias(\"tertiary\"),pl.col(\"chrom2\").map_elements(lambda x: parse_chromosome(x)[0], return_dtype=pl.Int32).alias(\"primary2\"),pl.col(\"chrom2\").map_elements(lambda x: parse_chromosome(x)[1], return_dtype=pl.Int32).alias(\"secondary2\"),pl.col(\"chrom2\").map_elements(lambda x: parse_chromosome(x)[2], return_dtype=pl.Utf8).alias(\"tertiary2\")])\n",
	"\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"id": "7e453593",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 178 µs, sys: 0 ns, total: 178 µs\n",
	"Wall time: 182 µs\n"
	]
	}
	],
	"source": [
	"%time sorted_df = filtered_df.sort([\"primary\", \"secondary\", \"tertiary\", \"primary2\", \"secondary2\", \"tertiary2\", \"pos1\", \"pos2\", \"strand1\", \"strand2\"])\n",
	"\n",
	"sorted_df = sorted_df.drop([\"primary\", \"secondary\", \"tertiary\", \"primary2\", \"secondary2\", \"tertiary2\"])\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"id": "9b238ac5",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 10min 49s, sys: 1min 10s, total: 12min\n",
	"Wall time: 10min 7s\n"
	]
	},
	{
	"data": {
	"text/html": [
	"<div><style>\n",
	".dataframe > thead > tr,\n",
	".dataframe > tbody > tr {\n",
	" text-align: right;\n",
	" white-space: pre-wrap;\n",
	"}\n",
	"</style>\n",
	"<small>shape: (65_220_653, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>read_id</th><th>chrom1</th><th>pos1</th><th>chrom2</th><th>pos2</th><th>strand1</th><th>strand2</th><th>pairs_type</th></tr><tr><td>str</td><td>str</td><td>u32</td><td>str</td><td>u32</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>"."</td><td>"chr1"</td><td>3000228</td><td>"chr1"</td><td>87485253</td><td>"+"</td><td>"+"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chr1"</td><td>3000302</td><td>"chr1"</td><td>3019411</td><td>"-"</td><td>"+"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chr1"</td><td>3000898</td><td>"chr1"</td><td>3002388</td><td>"-"</td><td>"+"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chr1"</td><td>3001066</td><td>"chr1"</td><td>3001229</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chr1"</td><td>3001303</td><td>"chr1"</td><td>3001526</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td></tr><tr><td>"."</td><td>"chrM"</td><td>16213</td><td>"chrM"</td><td>16244</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrM"</td><td>16217</td><td>"chrM"</td><td>16300</td><td>"-"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrM"</td><td>16223</td><td>"chrM"</td><td>16288</td><td>"-"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrM"</td><td>16243</td><td>"chrM"</td><td>16265</td><td>"-"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrM"</td><td>16272</td><td>"chrM"</td><td>16295</td><td>"-"</td><td>"-"</td><td>"LL"</td></tr></tbody></table></div>"
	],
	"text/plain": [
	"shape: (65_220_653, 8)\n",
	"┌─────────┬────────┬─────────┬────────┬──────────┬─────────┬─────────┬────────────┐\n",
	"│ read_id ┆ chrom1 ┆ pos1 ┆ chrom2 ┆ pos2 ┆ strand1 ┆ strand2 ┆ pairs_type │\n",
	"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
	"│ str ┆ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ str ┆ str │\n",
	"╞═════════╪════════╪═════════╪════════╪══════════╪═════════╪═════════╪════════════╡\n",
	"│ . ┆ chr1 ┆ 3000228 ┆ chr1 ┆ 87485253 ┆ + ┆ + ┆ LL │\n",
	"│ . ┆ chr1 ┆ 3000302 ┆ chr1 ┆ 3019411 ┆ - ┆ + ┆ LL │\n",
	"│ . ┆ chr1 ┆ 3000898 ┆ chr1 ┆ 3002388 ┆ - ┆ + ┆ LL │\n",
	"│ . ┆ chr1 ┆ 3001066 ┆ chr1 ┆ 3001229 ┆ + ┆ - ┆ LL │\n",
	"│ . ┆ chr1 ┆ 3001303 ┆ chr1 ┆ 3001526 ┆ + ┆ - ┆ LL │\n",
	"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
	"│ . ┆ chrM ┆ 16213 ┆ chrM ┆ 16244 ┆ + ┆ - ┆ LL │\n",
	"│ . ┆ chrM ┆ 16217 ┆ chrM ┆ 16300 ┆ - ┆ - ┆ LL │\n",
	"│ . ┆ chrM ┆ 16223 ┆ chrM ┆ 16288 ┆ - ┆ - ┆ LL │\n",
	"│ . ┆ chrM ┆ 16243 ┆ chrM ┆ 16265 ┆ - ┆ - ┆ LL │\n",
	"│ . ┆ chrM ┆ 16272 ┆ chrM ┆ 16295 ┆ - ┆ - ┆ LL │\n",
	"└─────────┴────────┴─────────┴────────┴──────────┴─────────┴─────────┴────────────┘"
	]
	},
	"execution_count": 24,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%time sorted_df.collect()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"id": "4295e534",
	"metadata": {},
	"outputs": [],
	"source": [
	"sorted_df.sink_parquet(\"sortPolars.parquet\", compression=\"snappy\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "a9811098",
	"metadata": {},
	"outputs": [],
	"source": [
	"### QUICKLY SORT 1,10,2\n",
	"\"\"\"sort pairs files \n",
	" (the lexicographic order for chromosomes, \n",
	" the numeric order for the positions, \n",
	" the lexicographic order for pair types).\n",
	"\"\"\"\n",
	"\n",
	"filtered_df = df.filter(pl.col(\"pos1\").is_not_null())\n",
	"filtered_df.sort([\"chrom1\",\"chrom2\", \"pos1\", \"pos2\", \"strand1\", \"strand2\"]).collect()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 35,
	"id": "570664b2",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div><style>\n",
	".dataframe > thead > tr,\n",
	".dataframe > tbody > tr {\n",
	" text-align: right;\n",
	" white-space: pre-wrap;\n",
	"}\n",
	"</style>\n",
	"<small>shape: (7, 1)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>chromosome</th></tr><tr><td>str</td></tr></thead><tbody><tr><td>"chr1"</td></tr><tr><td>"chr2"</td></tr><tr><td>"chr4_GL456216_random"</td></tr><tr><td>"chr10"</td></tr><tr><td>"chr11"</td></tr><tr><td>"chrX"</td></tr><tr><td>"chrUn_GL456381"</td></tr></tbody></table></div>"
	],
	"text/plain": [
	"shape: (7, 1)\n",
	"┌──────────────────────┐\n",
	"│ chromosome │\n",
	"│ --- │\n",
	"│ str │\n",
	"╞══════════════════════╡\n",
	"│ chr1 │\n",
	"│ chr2 │\n",
	"│ chr4_GL456216_random │\n",
	"│ chr10 │\n",
	"│ chr11 │\n",
	"│ chrX │\n",
	"│ chrUn_GL456381 │\n",
	"└──────────────────────┘"
	]
	},
	"execution_count": 35,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"### TEST BLOCK\n",
	"\n",
	"\n",
	"df = pl.DataFrame({\n",
	" \"chromosome\": [\"chr1\", \"chrX\", \"chr4_GL456216_random\", \"chr10\", \"chrUn_GL456381\", \"chr2\", \"chr11\"]\n",
	"})\n",
	"\n",
	"\n",
	"# Add new columns with parsed values\n",
	"df = df.with_columns([\n",
	" pl.col(\"chromosome\").map_elements(lambda x: parse_chromosome(x)[0], return_dtype=pl.Int32).alias(\"primary\"),\n",
	" pl.col(\"chromosome\").map_elements(lambda x: parse_chromosome(x)[1], return_dtype=pl.Int32).alias(\"secondary\"),\n",
	" pl.col(\"chromosome\").map_elements(lambda x: parse_chromosome(x)[2], return_dtype=pl.Utf8).alias(\"tertiary\")\n",
	"])\n",
	"\n",
	"# Sort by the extracted columns\n",
	"sorted_df = df.sort([\"primary\", \"secondary\", \"tertiary\"])\n",
	"\n",
	"# Drop helper columns if needed\n",
	"sorted_df = sorted_df.drop([\"primary\", \"secondary\", \"tertiary\"])\n",
	"\n",
	"sorted_df"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "main",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.19"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}
No results found