Skip to content

Instantly share code, notes, and snippets.

@ayaksvals
Created September 23, 2024 17:33
Show Gist options
  • Save ayaksvals/c9f29fe99e0a5192829ef9a29e09d7cc to your computer and use it in GitHub Desktop.
Save ayaksvals/c9f29fe99e0a5192829ef9a29e09d7cc to your computer and use it in GitHub Desktop.
Sorting pairs.gz files
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d30d1c6f",
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f15833ff",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 283 μs, sys: 1.04 ms, total: 1.32 ms\n",
"Wall time: 1.33 ms\n"
]
}
],
"source": [
"# TODO: later, try additional read options such as columns, parallel, use_statistics, low_memory, and memory_map + use_pyarrow=True.\n",
"\n",
"%time df = pl.scan_parquet('/users/slavska.olesia/test.pq', hive_schema={\"read_id\": pl.String, \"chrom1\": pl.String, \"pos1\":pl.UInt32, \"chrom2\": pl.String, \"pos2\":pl.UInt32, \"strand1\":pl.String, \"strand2\":pl.String, \"pairs_type\":pl.String})"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "2ca6e93f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 434 µs, sys: 0 ns, total: 434 µs\n",
"Wall time: 442 µs\n"
]
}
],
"source": [
"### Natural chromosome sort: chr1, chr2, ..., chr10 (not lexicographic chr1, chr10, chr2)\n",
"\n",
"def chromosome_sort_key(chrom):\n",
"    \"\"\"Return an integer rank that orders chromosome names naturally.\n",
"\n",
"    Numbered chromosomes keep their number (chr2 -> 2, chr10 -> 10); X, Y and\n",
"    M/MT follow in that order. Every other name shares one trailing rank, so\n",
"    add chrom1/chrom2 as sort tie-breakers if the data contains such names.\n",
"\n",
"    NOTE(review): this helper was called but never defined in the notebook;\n",
"    the ranking is an assumption consistent with the chr1 ... chrM order of\n",
"    the saved output -- confirm against the original definition.\n",
"    \"\"\"\n",
"    name = chrom[3:] if chrom.startswith(\"chr\") else chrom\n",
"    if name.isdecimal():\n",
"        return int(name)\n",
"    return {\"X\": 1000, \"Y\": 1001, \"M\": 1002, \"MT\": 1002}.get(name, 1003)\n",
"\n",
"filtered_df = df.filter(pl.col(\"pos1\").is_not_null())\n",
"# Building the lazy expression is cheap; the keys are only computed on collect/sink.\n",
"%time filtered_df = filtered_df.with_columns([pl.col(\"chrom1\").map_elements(chromosome_sort_key, return_dtype=pl.Int32).alias(\"chrom1_key\"), pl.col(\"chrom2\").map_elements(chromosome_sort_key, return_dtype=pl.Int32).alias(\"chrom2_key\")])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "7e453593",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 67 µs, sys: 28 µs, total: 95 µs\n",
"Wall time: 98.7 µs\n"
]
}
],
"source": [
"%time sorted_df = filtered_df.sort([\"chrom1_key\", \"chrom2_key\", \"pos1\", \"pos2\", \"strand1\", \"strand2\"])\n",
"\n",
"sorted_df = sorted_df.drop([\"chrom1_key\", \"chrom2_key\"])\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "9b238ac5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1min 46s, sys: 38.1 s, total: 2min 24s\n",
"Wall time: 1min 32s\n"
]
},
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (65_220_653, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>read_id</th><th>chrom1</th><th>pos1</th><th>chrom2</th><th>pos2</th><th>strand1</th><th>strand2</th><th>pairs_type</th></tr><tr><td>str</td><td>str</td><td>u32</td><td>str</td><td>u32</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000228</td><td>&quot;chr1&quot;</td><td>87485253</td><td>&quot;+&quot;</td><td>&quot;+&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000302</td><td>&quot;chr1&quot;</td><td>3019411</td><td>&quot;-&quot;</td><td>&quot;+&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000898</td><td>&quot;chr1&quot;</td><td>3002388</td><td>&quot;-&quot;</td><td>&quot;+&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3001066</td><td>&quot;chr1&quot;</td><td>3001229</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3001303</td><td>&quot;chr1&quot;</td><td>3001526</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16213</td><td>&quot;chrM&quot;</td><td>16244</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16217</td><td>&quot;chrM&quot;</td><td>16300</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16223</td><td>&quot;chrM&quot;</td><td>16288</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16243</td><td>&quot;chrM&quot;</td
><td>16265</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrM&quot;</td><td>16272</td><td>&quot;chrM&quot;</td><td>16295</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (65_220_653, 8)\n",
"┌─────────┬────────┬─────────┬────────┬──────────┬─────────┬─────────┬────────────┐\n",
"│ read_id ┆ chrom1 ┆ pos1 ┆ chrom2 ┆ pos2 ┆ strand1 ┆ strand2 ┆ pairs_type │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ str ┆ str │\n",
"╞═════════╪════════╪═════════╪════════╪══════════╪═════════╪═════════╪════════════╡\n",
"│ . ┆ chr1 ┆ 3000228 ┆ chr1 ┆ 87485253 ┆ + ┆ + ┆ LL │\n",
"│ . ┆ chr1 ┆ 3000302 ┆ chr1 ┆ 3019411 ┆ - ┆ + ┆ LL │\n",
"│ . ┆ chr1 ┆ 3000898 ┆ chr1 ┆ 3002388 ┆ - ┆ + ┆ LL │\n",
"│ . ┆ chr1 ┆ 3001066 ┆ chr1 ┆ 3001229 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chr1 ┆ 3001303 ┆ chr1 ┆ 3001526 ┆ + ┆ - ┆ LL │\n",
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
"│ . ┆ chrM ┆ 16213 ┆ chrM ┆ 16244 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chrM ┆ 16217 ┆ chrM ┆ 16300 ┆ - ┆ - ┆ LL │\n",
"│ . ┆ chrM ┆ 16223 ┆ chrM ┆ 16288 ┆ - ┆ - ┆ LL │\n",
"│ . ┆ chrM ┆ 16243 ┆ chrM ┆ 16265 ┆ - ┆ - ┆ LL │\n",
"│ . ┆ chrM ┆ 16272 ┆ chrM ┆ 16295 ┆ - ┆ - ┆ LL │\n",
"└─────────┴────────┴─────────┴────────┴──────────┴─────────┴─────────┴────────────┘"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%time sorted_df.collect()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "4295e534",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4min 8s, sys: 2min 13s, total: 6min 22s\n",
"Wall time: 4min 17s\n"
]
}
],
"source": [
"# Input size: 65_220_653 rows\n",
"# V1: 12min 13s\n",
"# V2: 4min 17s\n",
"%time sorted_df.sink_parquet(\"sortPolars2.parquet\", compression=\"snappy\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a9811098",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (210_349_083, 10)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>read_id</th><th>chrom1</th><th>pos1</th><th>chrom2</th><th>pos2</th><th>strand1</th><th>strand2</th><th>pair_type</th><th>mapq1</th><th>mapq2</th></tr><tr><td>str</td><td>str</td><td>i64</td><td>str</td><td>i64</td><td>str</td><td>str</td><td>str</td><td>i64</td><td>i64</td></tr></thead><tbody><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3003235</td><td>&quot;chr1&quot;</td><td>3003446</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;DD&quot;</td><td>60</td><td>60</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3003235</td><td>&quot;chr1&quot;</td><td>3003446</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;DD&quot;</td><td>60</td><td>60</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3003256</td><td>&quot;chr1&quot;</td><td>3005828</td><td>&quot;-&quot;</td><td>&quot;+&quot;</td><td>&quot;DD&quot;</td><td>57</td><td>60</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3003264</td><td>&quot;chr1&quot;</td><td>3003342</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;DD&quot;</td><td>60</td><td>60</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3003279</td><td>&quot;chr1&quot;</td><td>88256867</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;DD&quot;</td><td>60</td><td>60</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>90829585</td><td>&quot;chrY&quot;</td><td>90829910</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;DD&quot;</td><td>33</td><td>10</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>90829712</td><td>&quot;chrY&quot;</td><td>90829744</td><td>&quot;-&quot;</td><td>&quot;+&quot;</td><td>&quot;DD&quot;</td><td>7</td><td>21</td></tr><tr><td>&quot
;.&quot;</td><td>&quot;chrY&quot;</td><td>90830626</td><td>&quot;chrY&quot;</td><td>90830935</td><td>&quot;+&quot;</td><td>&quot;+&quot;</td><td>&quot;DD&quot;</td><td>18</td><td>49</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>90830895</td><td>&quot;chrY&quot;</td><td>90831480</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;DD&quot;</td><td>60</td><td>9</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>90831037</td><td>&quot;chrY&quot;</td><td>90831483</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;DD&quot;</td><td>56</td><td>8</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (210_349_083, 10)\n",
"┌─────────┬────────┬──────────┬────────┬───┬─────────┬───────────┬───────┬───────┐\n",
"│ read_id ┆ chrom1 ┆ pos1 ┆ chrom2 ┆ … ┆ strand2 ┆ pair_type ┆ mapq1 ┆ mapq2 │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ i64 ┆ str ┆ ┆ str ┆ str ┆ i64 ┆ i64 │\n",
"╞═════════╪════════╪══════════╪════════╪═══╪═════════╪═══════════╪═══════╪═══════╡\n",
"│ . ┆ chr1 ┆ 3003235 ┆ chr1 ┆ … ┆ - ┆ DD ┆ 60 ┆ 60 │\n",
"│ . ┆ chr1 ┆ 3003235 ┆ chr1 ┆ … ┆ - ┆ DD ┆ 60 ┆ 60 │\n",
"│ . ┆ chr1 ┆ 3003256 ┆ chr1 ┆ … ┆ + ┆ DD ┆ 57 ┆ 60 │\n",
"│ . ┆ chr1 ┆ 3003264 ┆ chr1 ┆ … ┆ - ┆ DD ┆ 60 ┆ 60 │\n",
"│ . ┆ chr1 ┆ 3003279 ┆ chr1 ┆ … ┆ - ┆ DD ┆ 60 ┆ 60 │\n",
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
"│ . ┆ chrY ┆ 90829585 ┆ chrY ┆ … ┆ - ┆ DD ┆ 33 ┆ 10 │\n",
"│ . ┆ chrY ┆ 90829712 ┆ chrY ┆ … ┆ + ┆ DD ┆ 7 ┆ 21 │\n",
"│ . ┆ chrY ┆ 90830626 ┆ chrY ┆ … ┆ + ┆ DD ┆ 18 ┆ 49 │\n",
"│ . ┆ chrY ┆ 90830895 ┆ chrY ┆ … ┆ - ┆ DD ┆ 60 ┆ 9 │\n",
"│ . ┆ chrY ┆ 90831037 ┆ chrY ┆ … ┆ - ┆ DD ┆ 56 ┆ 8 │\n",
"└─────────┴────────┴──────────┴────────┴───┴─────────┴───────────┴───────┴───────┘"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"### Quick lexicographic sort: chr1, chr10, chr2, ...\n",
"\"\"\"Sort the pairs file by chrom1, chrom2, pos1, pos2, strand1, strand2:\n",
"    lexicographic order for chromosomes,\n",
"    numeric order for positions,\n",
"    lexicographic order for strands\n",
"    (pair types are not used as a sort key here).\n",
"\"\"\"\n",
"\n",
"filtered_df = df.filter(pl.col(\"pos1\").is_not_null())\n",
"filtered_df.sort([\"chrom1\",\"chrom2\", \"pos1\", \"pos2\", \"strand1\", \"strand2\"]).collect()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "main",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment