Created
September 23, 2024 17:33
-
-
Save ayaksvals/c9f29fe99e0a5192829ef9a29e09d7cc to your computer and use it in GitHub Desktop.
Sorting pairs.gz files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "d30d1c6f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import polars as pl\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "f15833ff", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 283 μs, sys: 1.04 ms, total: 1.32 ms\n", | |
"Wall time: 1.33 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"# add later properties, like: columns, parallel, use_statistics, low_memory, memory_map+use_pyarrow=True.\n", | |
"\n", | |
"%time df = pl.scan_parquet('/users/slavska.olesia/test.pq', hive_schema={\"read_id\": pl.String, \"chrom1\": pl.String, \"pos1\":pl.UInt32, \"chrom2\": pl.String, \"pos2\":pl.UInt32, \"strand1\":pl.String, \"strand2\":pl.String, \"pairs_type\":pl.String})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "2ca6e93f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 434 µs, sys: 0 ns, total: 434 µs\n", | |
"Wall time: 442 µs\n" | |
] | |
} | |
], | |
"source": [ | |
"### SORT 1,2,10\n", | |
"\n", | |
"filtered_df = df.filter(pl.col(\"pos1\").is_not_null())\n", | |
"%time filtered_df = filtered_df.with_columns([pl.col(\"chrom1\").map_elements(lambda x: chromosome_sort_key(x), return_dtype=pl.Int32).alias(\"chrom1_key\"),pl.col(\"chrom2\").map_elements(lambda x: chromosome_sort_key(x), return_dtype=pl.Int32).alias(\"chrom2_key\")])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "7e453593", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 67 µs, sys: 28 µs, total: 95 µs\n", | |
"Wall time: 98.7 µs\n" | |
] | |
} | |
], | |
"source": [ | |
"%time sorted_df = filtered_df.sort([\"chrom1_key\", \"chrom2_key\", \"pos1\", \"pos2\", \"strand1\", \"strand2\"])\n", | |
"\n", | |
"sorted_df = sorted_df.drop([\"chrom1_key\", \"chrom2_key\"])\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "9b238ac5", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1min 46s, sys: 38.1 s, total: 2min 24s\n", | |
"Wall time: 1min 32s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div><style>\n", | |
".dataframe > thead > tr,\n", | |
".dataframe > tbody > tr {\n", | |
" text-align: right;\n", | |
" white-space: pre-wrap;\n", | |
"}\n", | |
"</style>\n", | |
"<small>shape: (65_220_653, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>read_id</th><th>chrom1</th><th>pos1</th><th>chrom2</th><th>pos2</th><th>strand1</th><th>strand2</th><th>pairs_type</th></tr><tr><td>str</td><td>str</td><td>u32</td><td>str</td><td>u32</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>"."</td><td>"chr1"</td><td>3000228</td><td>"chr1"</td><td>87485253</td><td>"+"</td><td>"+"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chr1"</td><td>3000302</td><td>"chr1"</td><td>3019411</td><td>"-"</td><td>"+"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chr1"</td><td>3000898</td><td>"chr1"</td><td>3002388</td><td>"-"</td><td>"+"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chr1"</td><td>3001066</td><td>"chr1"</td><td>3001229</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chr1"</td><td>3001303</td><td>"chr1"</td><td>3001526</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td></tr><tr><td>"."</td><td>"chrM"</td><td>16213</td><td>"chrM"</td><td>16244</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrM"</td><td>16217</td><td>"chrM"</td><td>16300</td><td>"-"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrM"</td><td>16223</td><td>"chrM"</td><td>16288</td><td>"-"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrM"</td><td>16243</td><td>"chrM"</td><td>16265</td><td>"-"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrM"</td><td>16272</td><td>"chrM"</td><td>16295</td><td>"-"</td><td>"-"</td><td>"LL"</td></tr></tbody></table></div>" | |
], | |
"text/plain": [ | |
"shape: (65_220_653, 8)\n", | |
"┌─────────┬────────┬─────────┬────────┬──────────┬─────────┬─────────┬────────────┐\n", | |
"│ read_id ┆ chrom1 ┆ pos1 ┆ chrom2 ┆ pos2 ┆ strand1 ┆ strand2 ┆ pairs_type │\n", | |
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", | |
"│ str ┆ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ str ┆ str │\n", | |
"╞═════════╪════════╪═════════╪════════╪══════════╪═════════╪═════════╪════════════╡\n", | |
"│ . ┆ chr1 ┆ 3000228 ┆ chr1 ┆ 87485253 ┆ + ┆ + ┆ LL │\n", | |
"│ . ┆ chr1 ┆ 3000302 ┆ chr1 ┆ 3019411 ┆ - ┆ + ┆ LL │\n", | |
"│ . ┆ chr1 ┆ 3000898 ┆ chr1 ┆ 3002388 ┆ - ┆ + ┆ LL │\n", | |
"│ . ┆ chr1 ┆ 3001066 ┆ chr1 ┆ 3001229 ┆ + ┆ - ┆ LL │\n", | |
"│ . ┆ chr1 ┆ 3001303 ┆ chr1 ┆ 3001526 ┆ + ┆ - ┆ LL │\n", | |
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", | |
"│ . ┆ chrM ┆ 16213 ┆ chrM ┆ 16244 ┆ + ┆ - ┆ LL │\n", | |
"│ . ┆ chrM ┆ 16217 ┆ chrM ┆ 16300 ┆ - ┆ - ┆ LL │\n", | |
"│ . ┆ chrM ┆ 16223 ┆ chrM ┆ 16288 ┆ - ┆ - ┆ LL │\n", | |
"│ . ┆ chrM ┆ 16243 ┆ chrM ┆ 16265 ┆ - ┆ - ┆ LL │\n", | |
"│ . ┆ chrM ┆ 16272 ┆ chrM ┆ 16295 ┆ - ┆ - ┆ LL │\n", | |
"└─────────┴────────┴─────────┴────────┴──────────┴─────────┴─────────┴────────────┘" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time sorted_df.collect()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "4295e534", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 4min 8s, sys: 2min 13s, total: 6min 22s\n", | |
"Wall time: 4min 17s\n" | |
] | |
} | |
], | |
"source": [ | |
"# Input size: 65_220_653 rows\n", | |
"# V1 run: 12min 13s\n", | |
"# V2 run (wall time shown below): 4min 17s\n", | |
"%time sorted_df.sink_parquet(\"sortPolars2.parquet\", compression=\"snappy\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "a9811098", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div><style>\n", | |
".dataframe > thead > tr,\n", | |
".dataframe > tbody > tr {\n", | |
" text-align: right;\n", | |
" white-space: pre-wrap;\n", | |
"}\n", | |
"</style>\n", | |
"<small>shape: (210_349_083, 10)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>read_id</th><th>chrom1</th><th>pos1</th><th>chrom2</th><th>pos2</th><th>strand1</th><th>strand2</th><th>pair_type</th><th>mapq1</th><th>mapq2</th></tr><tr><td>str</td><td>str</td><td>i64</td><td>str</td><td>i64</td><td>str</td><td>str</td><td>str</td><td>i64</td><td>i64</td></tr></thead><tbody><tr><td>"."</td><td>"chr1"</td><td>3003235</td><td>"chr1"</td><td>3003446</td><td>"+"</td><td>"-"</td><td>"DD"</td><td>60</td><td>60</td></tr><tr><td>"."</td><td>"chr1"</td><td>3003235</td><td>"chr1"</td><td>3003446</td><td>"+"</td><td>"-"</td><td>"DD"</td><td>60</td><td>60</td></tr><tr><td>"."</td><td>"chr1"</td><td>3003256</td><td>"chr1"</td><td>3005828</td><td>"-"</td><td>"+"</td><td>"DD"</td><td>57</td><td>60</td></tr><tr><td>"."</td><td>"chr1"</td><td>3003264</td><td>"chr1"</td><td>3003342</td><td>"-"</td><td>"-"</td><td>"DD"</td><td>60</td><td>60</td></tr><tr><td>"."</td><td>"chr1"</td><td>3003279</td><td>"chr1"</td><td>88256867</td><td>"-"</td><td>"-"</td><td>"DD"</td><td>60</td><td>60</td></tr><tr><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td></tr><tr><td>"."</td><td>"chrY"</td><td>90829585</td><td>"chrY"</td><td>90829910</td><td>"+"</td><td>"-"</td><td>"DD"</td><td>33</td><td>10</td></tr><tr><td>"."</td><td>"chrY"</td><td>90829712</td><td>"chrY"</td><td>90829744</td><td>"-"</td><td>"+"</td><td>"DD"</td><td>7</td><td>21</td></tr><tr><td>"."</td><td>"chrY"</td><td>90830626</td><td>"chrY"</td><td>90830935</td><td>"+"</td><td>"+"</td><td>"DD"</td><td>18</td><td>49</td></tr><tr><td>"."</td><td>"chrY"</td><td>90830895</td><td>"chrY"</td><td>90831480</td><td>"+"</td><td>"-"</td><td>"DD"</td><td>60</td><td>9</td></tr><tr><td>"."</td><td>"chrY"</td><td>90831037</td><td>"chrY"</td><td>90831483</td><td>"-"</td><td>"-"</td><td>"DD"</td><td>56</td><td>8</td></tr></tbody></table></div>" | |
], | |
"text/plain": [ | |
"shape: (210_349_083, 10)\n", | |
"┌─────────┬────────┬──────────┬────────┬───┬─────────┬───────────┬───────┬───────┐\n", | |
"│ read_id ┆ chrom1 ┆ pos1 ┆ chrom2 ┆ … ┆ strand2 ┆ pair_type ┆ mapq1 ┆ mapq2 │\n", | |
"│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", | |
"│ str ┆ str ┆ i64 ┆ str ┆ ┆ str ┆ str ┆ i64 ┆ i64 │\n", | |
"╞═════════╪════════╪══════════╪════════╪═══╪═════════╪═══════════╪═══════╪═══════╡\n", | |
"│ . ┆ chr1 ┆ 3003235 ┆ chr1 ┆ … ┆ - ┆ DD ┆ 60 ┆ 60 │\n", | |
"│ . ┆ chr1 ┆ 3003235 ┆ chr1 ┆ … ┆ - ┆ DD ┆ 60 ┆ 60 │\n", | |
"│ . ┆ chr1 ┆ 3003256 ┆ chr1 ┆ … ┆ + ┆ DD ┆ 57 ┆ 60 │\n", | |
"│ . ┆ chr1 ┆ 3003264 ┆ chr1 ┆ … ┆ - ┆ DD ┆ 60 ┆ 60 │\n", | |
"│ . ┆ chr1 ┆ 3003279 ┆ chr1 ┆ … ┆ - ┆ DD ┆ 60 ┆ 60 │\n", | |
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", | |
"│ . ┆ chrY ┆ 90829585 ┆ chrY ┆ … ┆ - ┆ DD ┆ 33 ┆ 10 │\n", | |
"│ . ┆ chrY ┆ 90829712 ┆ chrY ┆ … ┆ + ┆ DD ┆ 7 ┆ 21 │\n", | |
"│ . ┆ chrY ┆ 90830626 ┆ chrY ┆ … ┆ + ┆ DD ┆ 18 ┆ 49 │\n", | |
"│ . ┆ chrY ┆ 90830895 ┆ chrY ┆ … ┆ - ┆ DD ┆ 60 ┆ 9 │\n", | |
"│ . ┆ chrY ┆ 90831037 ┆ chrY ┆ … ┆ - ┆ DD ┆ 56 ┆ 8 │\n", | |
"└─────────┴────────┴──────────┴────────┴───┴─────────┴───────────┴───────┴───────┘" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"### QUICKLY SORT 1,10,2\n", | |
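"\"\"\"Sort a pairs file:\n", | |
" lexicographic order for the chromosomes,\n", | |
" numeric order for the positions,\n", | |
" lexicographic order for the pair types.\n", | |
" NOTE(review): the sort below uses strand1/strand2 as its last keys, not the pair type - confirm which is intended.\n", | |
"\"\"\"\n", | |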
"\n", | |
"filtered_df = df.filter(pl.col(\"pos1\").is_not_null())\n", | |
"filtered_df.sort([\"chrom1\",\"chrom2\", \"pos1\", \"pos2\", \"strand1\", \"strand2\"]).collect()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "main", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.12.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment