Skip to content

Instantly share code, notes, and snippets.

@ayaksvals
Created August 20, 2024 10:36
Show Gist options
  • Save ayaksvals/3777c989378093fee6782318ed3c9bad to your computer and use it in GitHub Desktop.
Save ayaksvals/3777c989378093fee6782318ed3c9bad to your computer and use it in GitHub Desktop.
Version 2: 4 min for 65_220_653 rows. Scan a .parquet file, sort it, and write the result back out as Parquet.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "d30d1c6f",
"metadata": {},
"outputs": [],
"source": [
"import bioframe\n",
"import pypairix\n",
"import dask.dataframe as dd\n",
"import dask.array as da\n",
"import polars as pl\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f15833ff",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 558 µs, sys: 0 ns, total: 558 µs\n",
"Wall time: 566 µs\n"
]
}
],
"source": [
"# Set up a scan of the pairs table with explicit column dtypes.\n",
"# TODO: add further scan properties later, like: columns, parallel, use_statistics, low_memory, memory_map+use_pyarrow=True.\n",
"# NOTE(review): polars documents hive_schema as the schema of hive partition columns -- confirm it is the intended way to declare these dtypes.\n",
"\n",
"PAIRS_SCHEMA = {\n",
"    \"read_id\": pl.String,\n",
"    \"chrom1\": pl.String,\n",
"    \"pos1\": pl.UInt32,\n",
"    \"chrom2\": pl.String,\n",
"    \"pos2\": pl.UInt32,\n",
"    \"strand1\": pl.String,\n",
"    \"strand2\": pl.String,\n",
"    \"pairs_type\": pl.String,\n",
"}\n",
"\n",
"%time df = pl.scan_parquet('pairsToPolarsSnappy.parquet', hive_schema=PAIRS_SCHEMA)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "4be098d1",
"metadata": {},
"outputs": [],
"source": [
"def chromosome_sort_key(chrom):\n",
" if chrom == 'chrX':\n",
" return 100\n",
" elif chrom == 'chrY':\n",
" return 101\n",
" elif chrom == 'chrM':\n",
" return 102\n",
" else:\n",
" return int(chrom[3:])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "2ca6e93f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 434 µs, sys: 0 ns, total: 434 µs\n",
"Wall time: 442 µs\n"
]
}
],
"source": [
"### SORT 1,2,10\n",
"# Attach integer helper keys (chrom1_key, chrom2_key) computed by chromosome_sort_key.\n",
"\n",
"# NOTE(review): map_elements calls the Python function once per value; polars recommends native expressions where possible.\n",
"# The lambda looks up chromosome_sort_key only when it is actually called.\n",
"key_exprs = [\n",
"    pl.col(name)\n",
"    .map_elements(lambda x: chromosome_sort_key(x), return_dtype=pl.Int32)\n",
"    .alias(name + \"_key\")\n",
"    for name in (\"chrom1\", \"chrom2\")\n",
"]\n",
"\n",
"# Keep only rows whose pos1 is not null.\n",
"filtered_df = df.filter(pl.col(\"pos1\").is_not_null())\n",
"%time filtered_df = filtered_df.with_columns(key_exprs)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "7e453593",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 67 µs, sys: 28 µs, total: 95 µs\n",
"Wall time: 98.7 µs\n"
]
}
],
"source": [
"# Order by the helper chromosome keys first, then by positions and strands.\n",
"helper_keys = [\"chrom1_key\", \"chrom2_key\"]\n",
"sort_columns = helper_keys + [\"pos1\", \"pos2\", \"strand1\", \"strand2\"]\n",
"%time sorted_df = filtered_df.sort(sort_columns)\n",
"\n",
"# The helper keys are only used for ordering; drop them from the result.\n",
"sorted_df = sorted_df.drop(helper_keys)\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "9b238ac5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 10min 49s, sys: 1min 10s, total: 12min\n",
"Wall time: 10min 7s\n"
]
},
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (65_220_653, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>read_id</th><th>chrom1</th><th>pos1</th><th>chrom2</th><th>pos2</th><th>strand1</th><th>strand2</th><th>pairs_type</th></tr><tr><td>str</td><td>str</td><td>u32</td><td>str</td><td>u32</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000228</td><td>&quot;chr1&quot;</td><td>87485253</td><td>&quot;+&quot;</td><td>&quot;+&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000302</td><td>&quot;chr1&quot;</td><td>3019411</td><td>&quot;-&quot;</td><td>&quot;+&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000898</td><td>&quot;chr1&quot;</td><td>3002388</td><td>&quot;-&quot;</td><td>&quot;+&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3001066</td><td>&quot;chr1&quot;</td><td>3001229</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3001303</td><td>&quot;chr1&quot;</td><td>3001526</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16213</td><td>&quot;chrM&quot;</td><td>16244</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16217</td><td>&quot;chrM&quot;</td><td>16300</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16223</td><td>&quot;chrM&quot;</td><td>16288</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16243</td><td>&quot;chrM&quot;</td
><td>16265</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16272</td><td>&quot;chrM&quot;</td><td>16295</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (65_220_653, 8)\n",
"┌─────────┬────────┬─────────┬────────┬──────────┬─────────┬─────────┬────────────┐\n",
"│ read_id ┆ chrom1 ┆ pos1 ┆ chrom2 ┆ pos2 ┆ strand1 ┆ strand2 ┆ pairs_type │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ str ┆ str │\n",
"╞═════════╪════════╪═════════╪════════╪══════════╪═════════╪═════════╪════════════╡\n",
"│ . ┆ chr1 ┆ 3000228 ┆ chr1 ┆ 87485253 ┆ + ┆ + ┆ LL │\n",
"│ . ┆ chr1 ┆ 3000302 ┆ chr1 ┆ 3019411 ┆ - ┆ + ┆ LL │\n",
"│ . ┆ chr1 ┆ 3000898 ┆ chr1 ┆ 3002388 ┆ - ┆ + ┆ LL │\n",
"│ . ┆ chr1 ┆ 3001066 ┆ chr1 ┆ 3001229 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chr1 ┆ 3001303 ┆ chr1 ┆ 3001526 ┆ + ┆ - ┆ LL │\n",
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
"│ . ┆ chrM ┆ 16213 ┆ chrM ┆ 16244 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chrM ┆ 16217 ┆ chrM ┆ 16300 ┆ - ┆ - ┆ LL │\n",
"│ . ┆ chrM ┆ 16223 ┆ chrM ┆ 16288 ┆ - ┆ - ┆ LL │\n",
"│ . ┆ chrM ┆ 16243 ┆ chrM ┆ 16265 ┆ - ┆ - ┆ LL │\n",
"│ . ┆ chrM ┆ 16272 ┆ chrM ┆ 16295 ┆ - ┆ - ┆ LL │\n",
"└─────────┴────────┴─────────┴────────┴──────────┴─────────┴─────────┴────────────┘"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#%time sorted_df.collect()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "4295e534",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4min 8s, sys: 2min 13s, total: 6min 22s\n",
"Wall time: 4min 17s\n"
]
}
],
"source": [
"# Dataset size: 65_220_653 rows\n",
"# V1: 12min 13s\n",
"# V2: 4min 17s (wall time of the run recorded in this cell's output)\n",
"output_path = \"sortPolars2.parquet\"\n",
"%time sorted_df.sink_parquet(output_path, compression=\"snappy\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a9811098",
"metadata": {},
"outputs": [],
"source": [
"### QUICKLY SORT 1,10,2\n",
"# Sort pairs on the raw column values:\n",
"#   - lexicographic order for the chromosomes (chr1 < chr10 < chr2),\n",
"#   - numeric order for the positions,\n",
"#   - lexicographic order for the strands.\n",
"# The chain is not assigned back to filtered_df, so the frame prepared for the\n",
"# SORT 1,2,10 cells (with its chrom*_key columns) is left untouched and those\n",
"# cells can be re-run afterwards.\n",
"\n",
"(\n",
"    df.filter(pl.col(\"pos1\").is_not_null())\n",
"    .sort([\"chrom1\", \"chrom2\", \"pos1\", \"pos2\", \"strand1\", \"strand2\"])\n",
"    .collect()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "570664b2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (7, 1)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>chromosome</th></tr><tr><td>str</td></tr></thead><tbody><tr><td>&quot;chr1&quot;</td></tr><tr><td>&quot;chr2&quot;</td></tr><tr><td>&quot;chr4_GL456216_random&quot;</td></tr><tr><td>&quot;chr10&quot;</td></tr><tr><td>&quot;chr11&quot;</td></tr><tr><td>&quot;chrX&quot;</td></tr><tr><td>&quot;chrUn_GL456381&quot;</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (7, 1)\n",
"┌──────────────────────┐\n",
"│ chromosome │\n",
"│ --- │\n",
"│ str │\n",
"╞══════════════════════╡\n",
"│ chr1 │\n",
"│ chr2 │\n",
"│ chr4_GL456216_random │\n",
"│ chr10 │\n",
"│ chr11 │\n",
"│ chrX │\n",
"│ chrUn_GL456381 │\n",
"└──────────────────────┘"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"### TEST BLOCK\n",
"# Sorts a small list of chromosome names, including scaffold-style names.\n",
"\n",
"\n",
"def parse_chromosome(chrom):\n",
"    \"\"\"Split a chromosome name into a (primary, secondary, tertiary) sort key.\n",
"\n",
"    NOTE(review): no definition of parse_chromosome exists in this notebook, so\n",
"    the cell failed on a fresh kernel. This is a reconstruction that yields the\n",
"    order shown in the recorded output -- confirm against the original helper.\n",
"\n",
"    primary   -- chromosome number; X, Y, M map to 100, 101, 102; other names to 103\n",
"    secondary -- 0 for a bare chromosome, 1 when text follows an underscore\n",
"    tertiary  -- the text after the first underscore ('' if there is none)\n",
"    \"\"\"\n",
"    special = {\"X\": 100, \"Y\": 101, \"M\": 102}\n",
"    body = chrom[3:] if chrom.startswith(\"chr\") else chrom\n",
"    head, sep, tail = body.partition(\"_\")\n",
"    primary = int(head) if head.isdecimal() else special.get(head, 103)\n",
"    return primary, (1 if tail else 0), tail\n",
"\n",
"\n",
"# Dedicated names: the pipeline's `df` and `sorted_df` are not overwritten by this test.\n",
"test_df = pl.DataFrame({\n",
"    \"chromosome\": [\"chr1\", \"chrX\", \"chr4_GL456216_random\", \"chr10\", \"chrUn_GL456381\", \"chr2\", \"chr11\"]\n",
"})\n",
"\n",
"# Add new columns with parsed values\n",
"test_keyed_df = test_df.with_columns([\n",
"    pl.col(\"chromosome\").map_elements(lambda x: parse_chromosome(x)[0], return_dtype=pl.Int32).alias(\"primary\"),\n",
"    pl.col(\"chromosome\").map_elements(lambda x: parse_chromosome(x)[1], return_dtype=pl.Int32).alias(\"secondary\"),\n",
"    pl.col(\"chromosome\").map_elements(lambda x: parse_chromosome(x)[2], return_dtype=pl.Utf8).alias(\"tertiary\")\n",
"])\n",
"\n",
"# Sort by the extracted columns, then drop the helper columns\n",
"helper_columns = [\"primary\", \"secondary\", \"tertiary\"]\n",
"test_sorted_df = test_keyed_df.sort(helper_columns).drop(helper_columns)\n",
"\n",
"test_sorted_df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "main",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment