Skip to content

Instantly share code, notes, and snippets.

@ayaksvals
Last active August 19, 2024 17:23
Show Gist options
  • Select an option

  • Save ayaksvals/200ae68d5e81e489b52ffb53bedd1988 to your computer and use it in GitHub Desktop.

Select an option

Save ayaksvals/200ae68d5e81e489b52ffb53bedd1988 to your computer and use it in GitHub Desktop.
Sort Parquet with polars 1,2,10 not 1,10,2 (really Long Version)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d30d1c6f",
"metadata": {},
"outputs": [],
"source": [
"import bioframe\n",
"import pypairix\n",
"import dask.dataframe as dd\n",
"import dask.array as da\n",
"import polars as pl\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "f15833ff",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 151 µs, sys: 49 µs, total: 200 µs\n",
"Wall time: 208 µs\n"
]
}
],
"source": [
"# add later properties, like: columns, parallel, use_statistics, low_memory, memory_map+use_pyarrow=True.\n",
"\n",
"%time df = pl.scan_parquet('pairsToPolarsSnappy.parquet', hive_schema={\"read_id\": pl.String, \"chrom1\": pl.String, \"pos1\":pl.UInt32, \"chrom2\": pl.String, \"pos2\":pl.UInt32, \"strand1\":pl.String, \"strand2\":pl.String, \"pairs_type\":pl.String})"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "4be098d1",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"def parse_chromosome(chromosome):\n",
" # Default values for sorting\n",
" primary = 100\n",
" secondary = 10000\n",
" tertiary = \"\"\n",
" \n",
" # Match patterns like \"chr1\", \"chrX\", \"chr4_GL456216_random\"\n",
" match = re.match(r'chr(\\d+|X|Y|Un)(?:_GL(\\d+)|.*)?(_random)?', chromosome)\n",
" if match:\n",
" main_part, gl_part, random_part = match.groups()\n",
" \n",
" # Primary sorting key\n",
" if main_part.isdigit():\n",
" primary = int(main_part)\n",
" elif main_part == 'M':\n",
" primary = 23\n",
" elif main_part == 'X':\n",
" primary = 24\n",
" elif main_part == 'Y':\n",
" primary = 25\n",
" \n",
" # Secondary sorting key\n",
" if gl_part:\n",
" secondary = int(gl_part)\n",
" else:\n",
" secondary = 9999\n",
" \n",
" # Tertiary sorting key\n",
" tertiary = random_part if random_part else \"\"\n",
"\n",
" return (primary, secondary, tertiary)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "2ca6e93f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.97 ms, sys: 0 ns, total: 1.97 ms\n",
"Wall time: 1.98 ms\n"
]
}
],
"source": [
"### SORT 1,2,10\n",
"\n",
"filtered_df = df.filter(pl.col(\"pos1\").is_not_null())\n",
"%time filtered_df = filtered_df.with_columns([pl.col(\"chrom1\").map_elements(lambda x: parse_chromosome(x)[0], return_dtype=pl.Int32).alias(\"primary\"),pl.col(\"chrom1\").map_elements(lambda x: parse_chromosome(x)[1], return_dtype=pl.Int32).alias(\"secondary\"),pl.col(\"chrom1\").map_elements(lambda x: parse_chromosome(x)[2], return_dtype=pl.Utf8).alias(\"tertiary\"),pl.col(\"chrom2\").map_elements(lambda x: parse_chromosome(x)[0], return_dtype=pl.Int32).alias(\"primary2\"),pl.col(\"chrom2\").map_elements(lambda x: parse_chromosome(x)[1], return_dtype=pl.Int32).alias(\"secondary2\"),pl.col(\"chrom2\").map_elements(lambda x: parse_chromosome(x)[2], return_dtype=pl.Utf8).alias(\"tertiary2\")])\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "7e453593",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 178 µs, sys: 0 ns, total: 178 µs\n",
"Wall time: 182 µs\n"
]
}
],
"source": [
"%time sorted_df = filtered_df.sort([\"primary\", \"secondary\", \"tertiary\", \"primary2\", \"secondary2\", \"tertiary2\", \"pos1\", \"pos2\", \"strand1\", \"strand2\"])\n",
"\n",
"sorted_df = sorted_df.drop([\"primary\", \"secondary\", \"tertiary\", \"primary2\", \"secondary2\", \"tertiary2\"])\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "9b238ac5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 10min 49s, sys: 1min 10s, total: 12min\n",
"Wall time: 10min 7s\n"
]
},
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (65_220_653, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>read_id</th><th>chrom1</th><th>pos1</th><th>chrom2</th><th>pos2</th><th>strand1</th><th>strand2</th><th>pairs_type</th></tr><tr><td>str</td><td>str</td><td>u32</td><td>str</td><td>u32</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000228</td><td>&quot;chr1&quot;</td><td>87485253</td><td>&quot;+&quot;</td><td>&quot;+&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000302</td><td>&quot;chr1&quot;</td><td>3019411</td><td>&quot;-&quot;</td><td>&quot;+&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000898</td><td>&quot;chr1&quot;</td><td>3002388</td><td>&quot;-&quot;</td><td>&quot;+&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3001066</td><td>&quot;chr1&quot;</td><td>3001229</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3001303</td><td>&quot;chr1&quot;</td><td>3001526</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16213</td><td>&quot;chrM&quot;</td><td>16244</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16217</td><td>&quot;chrM&quot;</td><td>16300</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16223</td><td>&quot;chrM&quot;</td><td>16288</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16243</td><td>&quot;chrM&quot;</td><td>16265</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16272</td><td>&quot;chrM&quot;</td><td>16295</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (65_220_653, 8)\n",
"┌─────────┬────────┬─────────┬────────┬──────────┬─────────┬─────────┬────────────┐\n",
"│ read_id ┆ chrom1 ┆ pos1 ┆ chrom2 ┆ pos2 ┆ strand1 ┆ strand2 ┆ pairs_type │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ str ┆ str │\n",
"╞═════════╪════════╪═════════╪════════╪══════════╪═════════╪═════════╪════════════╡\n",
"│ . ┆ chr1 ┆ 3000228 ┆ chr1 ┆ 87485253 ┆ + ┆ + ┆ LL │\n",
"│ . ┆ chr1 ┆ 3000302 ┆ chr1 ┆ 3019411 ┆ - ┆ + ┆ LL │\n",
"│ . ┆ chr1 ┆ 3000898 ┆ chr1 ┆ 3002388 ┆ - ┆ + ┆ LL │\n",
"│ . ┆ chr1 ┆ 3001066 ┆ chr1 ┆ 3001229 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chr1 ┆ 3001303 ┆ chr1 ┆ 3001526 ┆ + ┆ - ┆ LL │\n",
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
"│ . ┆ chrM ┆ 16213 ┆ chrM ┆ 16244 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chrM ┆ 16217 ┆ chrM ┆ 16300 ┆ - ┆ - ┆ LL │\n",
"│ . ┆ chrM ┆ 16223 ┆ chrM ┆ 16288 ┆ - ┆ - ┆ LL │\n",
"│ . ┆ chrM ┆ 16243 ┆ chrM ┆ 16265 ┆ - ┆ - ┆ LL │\n",
"│ . ┆ chrM ┆ 16272 ┆ chrM ┆ 16295 ┆ - ┆ - ┆ LL │\n",
"└─────────┴────────┴─────────┴────────┴──────────┴─────────┴─────────┴────────────┘"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%time sorted_df.collect()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "4295e534",
"metadata": {},
"outputs": [],
"source": [
"sorted_df.sink_parquet(\"sortPolars.parquet\", compression=\"snappy\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a9811098",
"metadata": {},
"outputs": [],
"source": [
"### QUICKLY SORT 1,10,2\n",
"\"\"\"sort pairs files \n",
" (the lexicographic order for chromosomes, \n",
" the numeric order for the positions, \n",
" the lexicographic order for pair types).\n",
"\"\"\"\n",
"\n",
"filtered_df = df.filter(pl.col(\"pos1\").is_not_null())\n",
"filtered_df.sort([\"chrom1\",\"chrom2\", \"pos1\", \"pos2\", \"strand1\", \"strand2\"]).collect()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "570664b2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (7, 1)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>chromosome</th></tr><tr><td>str</td></tr></thead><tbody><tr><td>&quot;chr1&quot;</td></tr><tr><td>&quot;chr2&quot;</td></tr><tr><td>&quot;chr4_GL456216_random&quot;</td></tr><tr><td>&quot;chr10&quot;</td></tr><tr><td>&quot;chr11&quot;</td></tr><tr><td>&quot;chrX&quot;</td></tr><tr><td>&quot;chrUn_GL456381&quot;</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (7, 1)\n",
"┌──────────────────────┐\n",
"│ chromosome │\n",
"│ --- │\n",
"│ str │\n",
"╞══════════════════════╡\n",
"│ chr1 │\n",
"│ chr2 │\n",
"│ chr4_GL456216_random │\n",
"│ chr10 │\n",
"│ chr11 │\n",
"│ chrX │\n",
"│ chrUn_GL456381 │\n",
"└──────────────────────┘"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"### TEST BLOCK\n",
"\n",
"\n",
"df = pl.DataFrame({\n",
" \"chromosome\": [\"chr1\", \"chrX\", \"chr4_GL456216_random\", \"chr10\", \"chrUn_GL456381\", \"chr2\", \"chr11\"]\n",
"})\n",
"\n",
"\n",
"# Add new columns with parsed values\n",
"df = df.with_columns([\n",
" pl.col(\"chromosome\").map_elements(lambda x: parse_chromosome(x)[0], return_dtype=pl.Int32).alias(\"primary\"),\n",
" pl.col(\"chromosome\").map_elements(lambda x: parse_chromosome(x)[1], return_dtype=pl.Int32).alias(\"secondary\"),\n",
" pl.col(\"chromosome\").map_elements(lambda x: parse_chromosome(x)[2], return_dtype=pl.Utf8).alias(\"tertiary\")\n",
"])\n",
"\n",
"# Sort by the extracted columns\n",
"sorted_df = df.sort([\"primary\", \"secondary\", \"tertiary\"])\n",
"\n",
"# Drop helper columns if needed\n",
"sorted_df = sorted_df.drop([\"primary\", \"secondary\", \"tertiary\"])\n",
"\n",
"sorted_df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "main",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment