Last active
August 19, 2024 17:23
-
-
Save ayaksvals/200ae68d5e81e489b52ffb53bedd1988 to your computer and use it in GitHub Desktop.
Sort Parquet with polars 1,2,10 not 1,10,2 (really Long Version)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "d30d1c6f", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import bioframe\n", | |
| "import pypairix\n", | |
| "import dask.dataframe as dd\n", | |
| "import dask.array as da\n", | |
| "import polars as pl\n", | |
| "import pandas as pd" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "id": "f15833ff", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 151 µs, sys: 49 µs, total: 200 µs\n", | |
| "Wall time: 208 µs\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# add later properties, like: columns, parallel, use_statistics, low_memory, memory_map+use_pyarrow=True.\n", | |
| "\n", | |
| "%time df = pl.scan_parquet('pairsToPolarsSnappy.parquet', hive_schema={\"read_id\": pl.String, \"chrom1\": pl.String, \"pos1\":pl.UInt32, \"chrom2\": pl.String, \"pos2\":pl.UInt32, \"strand1\":pl.String, \"strand2\":pl.String, \"pairs_type\":pl.String})" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "id": "4be098d1", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import re\n", | |
| "def parse_chromosome(chromosome):\n", | |
| " # Default values for sorting\n", | |
| " primary = 100\n", | |
| " secondary = 10000\n", | |
| " tertiary = \"\"\n", | |
| " \n", | |
| " # Match patterns like \"chr1\", \"chrX\", \"chr4_GL456216_random\"\n", | |
| " match = re.match(r'chr(\\d+|X|Y|Un)(?:_GL(\\d+)|.*)?(_random)?', chromosome)\n", | |
| " if match:\n", | |
| " main_part, gl_part, random_part = match.groups()\n", | |
| " \n", | |
| " # Primary sorting key\n", | |
| " if main_part.isdigit():\n", | |
| " primary = int(main_part)\n", | |
| " elif main_part == 'M':\n", | |
| " primary = 23\n", | |
| " elif main_part == 'X':\n", | |
| " primary = 24\n", | |
| " elif main_part == 'Y':\n", | |
| " primary = 25\n", | |
| " \n", | |
| " # Secondary sorting key\n", | |
| " if gl_part:\n", | |
| " secondary = int(gl_part)\n", | |
| " else:\n", | |
| " secondary = 9999\n", | |
| " \n", | |
| " # Tertiary sorting key\n", | |
| " tertiary = random_part if random_part else \"\"\n", | |
| "\n", | |
| " return (primary, secondary, tertiary)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "id": "2ca6e93f", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 1.97 ms, sys: 0 ns, total: 1.97 ms\n", | |
| "Wall time: 1.98 ms\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "### SORT 1,2,10\n", | |
| "\n", | |
| "filtered_df = df.filter(pl.col(\"pos1\").is_not_null())\n", | |
| "%time filtered_df = filtered_df.with_columns([pl.col(\"chrom1\").map_elements(lambda x: parse_chromosome(x)[0], return_dtype=pl.Int32).alias(\"primary\"),pl.col(\"chrom1\").map_elements(lambda x: parse_chromosome(x)[1], return_dtype=pl.Int32).alias(\"secondary\"),pl.col(\"chrom1\").map_elements(lambda x: parse_chromosome(x)[2], return_dtype=pl.Utf8).alias(\"tertiary\"),pl.col(\"chrom2\").map_elements(lambda x: parse_chromosome(x)[0], return_dtype=pl.Int32).alias(\"primary2\"),pl.col(\"chrom2\").map_elements(lambda x: parse_chromosome(x)[1], return_dtype=pl.Int32).alias(\"secondary2\"),pl.col(\"chrom2\").map_elements(lambda x: parse_chromosome(x)[2], return_dtype=pl.Utf8).alias(\"tertiary2\")])\n", | |
| "\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "id": "7e453593", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 178 µs, sys: 0 ns, total: 178 µs\n", | |
| "Wall time: 182 µs\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%time sorted_df = filtered_df.sort([\"primary\", \"secondary\", \"tertiary\", \"primary2\", \"secondary2\", \"tertiary2\", \"pos1\", \"pos2\", \"strand1\", \"strand2\"])\n", | |
| "\n", | |
| "sorted_df = sorted_df.drop([\"primary\", \"secondary\", \"tertiary\", \"primary2\", \"secondary2\", \"tertiary2\"])\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "id": "9b238ac5", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 10min 49s, sys: 1min 10s, total: 12min\n", | |
| "Wall time: 10min 7s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div><style>\n", | |
| ".dataframe > thead > tr,\n", | |
| ".dataframe > tbody > tr {\n", | |
| " text-align: right;\n", | |
| " white-space: pre-wrap;\n", | |
| "}\n", | |
| "</style>\n", | |
| "<small>shape: (65_220_653, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>read_id</th><th>chrom1</th><th>pos1</th><th>chrom2</th><th>pos2</th><th>strand1</th><th>strand2</th><th>pairs_type</th></tr><tr><td>str</td><td>str</td><td>u32</td><td>str</td><td>u32</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>"."</td><td>"chr1"</td><td>3000228</td><td>"chr1"</td><td>87485253</td><td>"+"</td><td>"+"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chr1"</td><td>3000302</td><td>"chr1"</td><td>3019411</td><td>"-"</td><td>"+"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chr1"</td><td>3000898</td><td>"chr1"</td><td>3002388</td><td>"-"</td><td>"+"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chr1"</td><td>3001066</td><td>"chr1"</td><td>3001229</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chr1"</td><td>3001303</td><td>"chr1"</td><td>3001526</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td></tr><tr><td>"."</td><td>"chrM"</td><td>16213</td><td>"chrM"</td><td>16244</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrM"</td><td>16217</td><td>"chrM"</td><td>16300</td><td>"-"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrM"</td><td>16223</td><td>"chrM"</td><td>16288</td><td>"-"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrM"</td><td>16243</td><td>"chrM"</td><td>16265</td><td>"-"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrM"</td><td>16272</td><td>"chrM"</td><td>16295</td><td>"-"</td><td>"-"</td><td>"LL"</td></tr></tbody></table></div>" | |
| ], | |
| "text/plain": [ | |
| "shape: (65_220_653, 8)\n", | |
| "┌─────────┬────────┬─────────┬────────┬──────────┬─────────┬─────────┬────────────┐\n", | |
| "│ read_id ┆ chrom1 ┆ pos1 ┆ chrom2 ┆ pos2 ┆ strand1 ┆ strand2 ┆ pairs_type │\n", | |
| "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", | |
| "│ str ┆ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ str ┆ str │\n", | |
| "╞═════════╪════════╪═════════╪════════╪══════════╪═════════╪═════════╪════════════╡\n", | |
| "│ . ┆ chr1 ┆ 3000228 ┆ chr1 ┆ 87485253 ┆ + ┆ + ┆ LL │\n", | |
| "│ . ┆ chr1 ┆ 3000302 ┆ chr1 ┆ 3019411 ┆ - ┆ + ┆ LL │\n", | |
| "│ . ┆ chr1 ┆ 3000898 ┆ chr1 ┆ 3002388 ┆ - ┆ + ┆ LL │\n", | |
| "│ . ┆ chr1 ┆ 3001066 ┆ chr1 ┆ 3001229 ┆ + ┆ - ┆ LL │\n", | |
| "│ . ┆ chr1 ┆ 3001303 ┆ chr1 ┆ 3001526 ┆ + ┆ - ┆ LL │\n", | |
| "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", | |
| "│ . ┆ chrM ┆ 16213 ┆ chrM ┆ 16244 ┆ + ┆ - ┆ LL │\n", | |
| "│ . ┆ chrM ┆ 16217 ┆ chrM ┆ 16300 ┆ - ┆ - ┆ LL │\n", | |
| "│ . ┆ chrM ┆ 16223 ┆ chrM ┆ 16288 ┆ - ┆ - ┆ LL │\n", | |
| "│ . ┆ chrM ┆ 16243 ┆ chrM ┆ 16265 ┆ - ┆ - ┆ LL │\n", | |
| "│ . ┆ chrM ┆ 16272 ┆ chrM ┆ 16295 ┆ - ┆ - ┆ LL │\n", | |
| "└─────────┴────────┴─────────┴────────┴──────────┴─────────┴─────────┴────────────┘" | |
| ] | |
| }, | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%time sorted_df.collect()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "id": "4295e534", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "sorted_df.sink_parquet(\"sortPolars.parquet\", compression=\"snappy\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "a9811098", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "### QUICKLY SORT 1,10,2\n", | |
| "\"\"\"sort pairs files \n", | |
| " (the lexicographic order for chromosomes, \n", | |
| " the numeric order for the positions, \n", | |
| " the lexicographic order for pair types).\n", | |
| "\"\"\"\n", | |
| "\n", | |
| "filtered_df = df.filter(pl.col(\"pos1\").is_not_null())\n", | |
| "filtered_df.sort([\"chrom1\",\"chrom2\", \"pos1\", \"pos2\", \"strand1\", \"strand2\"]).collect()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 35, | |
| "id": "570664b2", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div><style>\n", | |
| ".dataframe > thead > tr,\n", | |
| ".dataframe > tbody > tr {\n", | |
| " text-align: right;\n", | |
| " white-space: pre-wrap;\n", | |
| "}\n", | |
| "</style>\n", | |
| "<small>shape: (7, 1)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>chromosome</th></tr><tr><td>str</td></tr></thead><tbody><tr><td>"chr1"</td></tr><tr><td>"chr2"</td></tr><tr><td>"chr4_GL456216_random"</td></tr><tr><td>"chr10"</td></tr><tr><td>"chr11"</td></tr><tr><td>"chrX"</td></tr><tr><td>"chrUn_GL456381"</td></tr></tbody></table></div>" | |
| ], | |
| "text/plain": [ | |
| "shape: (7, 1)\n", | |
| "┌──────────────────────┐\n", | |
| "│ chromosome │\n", | |
| "│ --- │\n", | |
| "│ str │\n", | |
| "╞══════════════════════╡\n", | |
| "│ chr1 │\n", | |
| "│ chr2 │\n", | |
| "│ chr4_GL456216_random │\n", | |
| "│ chr10 │\n", | |
| "│ chr11 │\n", | |
| "│ chrX │\n", | |
| "│ chrUn_GL456381 │\n", | |
| "└──────────────────────┘" | |
| ] | |
| }, | |
| "execution_count": 35, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "### TEST BLOCK\n", | |
| "\n", | |
| "\n", | |
| "df = pl.DataFrame({\n", | |
| " \"chromosome\": [\"chr1\", \"chrX\", \"chr4_GL456216_random\", \"chr10\", \"chrUn_GL456381\", \"chr2\", \"chr11\"]\n", | |
| "})\n", | |
| "\n", | |
| "\n", | |
| "# Add new columns with parsed values\n", | |
| "df = df.with_columns([\n", | |
| " pl.col(\"chromosome\").map_elements(lambda x: parse_chromosome(x)[0], return_dtype=pl.Int32).alias(\"primary\"),\n", | |
| " pl.col(\"chromosome\").map_elements(lambda x: parse_chromosome(x)[1], return_dtype=pl.Int32).alias(\"secondary\"),\n", | |
| " pl.col(\"chromosome\").map_elements(lambda x: parse_chromosome(x)[2], return_dtype=pl.Utf8).alias(\"tertiary\")\n", | |
| "])\n", | |
| "\n", | |
| "# Sort by the extracted columns\n", | |
| "sorted_df = df.sort([\"primary\", \"secondary\", \"tertiary\"])\n", | |
| "\n", | |
| "# Drop helper columns if needed\n", | |
| "sorted_df = sorted_df.drop([\"primary\", \"secondary\", \"tertiary\"])\n", | |
| "\n", | |
| "sorted_df" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "main", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.9.19" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment