Created
August 13, 2024 08:16
-
-
Save ayaksvals/e7bb47a8e18eb5857ecee17b5d530f57 to your computer and use it in GitHub Desktop.
Easy way to convert pairs to parquet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "d30d1c6f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import bioframe\n", | |
"import pypairix\n", | |
"import dask.dataframe as dd\n", | |
"import dask.array as da\n", | |
"import polars as pl\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "489c6689", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<i>naive plan: (run <b>LazyFrame.explain(optimized=True)</b> to see the optimized plan)</i>\n", | |
" <p></p>\n", | |
" <div>Csv SCAN [NIPBL_R1.nodups.pairs.gz]<p></p>PROJECT */8 COLUMNS</div>" | |
], | |
"text/plain": [ | |
"<LazyFrame at 0x2ABFF19300D0>" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#Add in the future ENUM type for chr1 and chr2\n", | |
"\n", | |
"polarsDf=pl.scan_csv(\"NIPBL_R1.nodups.pairs.gz\", separator=\"\\t\", schema={\"read_id\": pl.String, \"chrom1\": pl.String, \"pos1\":pl.UInt32, \"chrom2\": pl.String, \"pos2\":pl.UInt32, \"strand1\":pl.String, \"strand2\":pl.String, \"pairs_type\":pl.String}, ignore_errors=True)\n", | |
"#polarsDf=polarsDf.collect()\n", | |
"#polarsDf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "1d911efd", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"116" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Count the header lines (those starting with '#') that were read in as rows\n", | |
"# df.filter(pl.col(\"read_id\").str.contains(\"#\")).height" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "bb9847b0", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"\"\"\"\n", | |
"Compression alternatives:\n", | |
" “zstd”: good compression performance\n", | |
" “lz4”: fast compression/decompression\n", | |
" “snappy”: more backwards compatibility guarantees(older parquet readers)\n", | |
"\n", | |
"row_group_size optimal?\n", | |
"\n", | |
"No compression Level, with => more time\n", | |
"lz4 777.02MB 1m 22s 10000\n", | |
"zstd 684.55MB 1m 23s no rows\n", | |
"snappy 773.80MB 1m 20 no rows\n", | |
"gzip 650.21MB 1m 44s n rows\n", | |
"\"\"\"\n", | |
"polarsDf.sink_parquet(\"pairsToPolarsSnappy.parquet\", compression=\"snappy\")\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "f15833ff", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<i>naive plan: (run <b>LazyFrame.explain(optimized=True)</b> to see the optimized plan)</i>\n", | |
" <p></p>\n", | |
" <div>Parquet SCAN [pairsToPolarsgzip.parquet]<p></p>PROJECT */8 COLUMNS</div>" | |
], | |
"text/plain": [ | |
"<LazyFrame at 0x2AC0091B2880>" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# add later properties, like: columns, parallel, use_statistics, low_memory, memory_map+use_pyarrow=True.\n", | |
"#df = polars.read_parquet('concatenated.parquet')\n", | |
"\n", | |
"#Lazy Df scan\n", | |
"df = pl.scan_parquet('pairsToPolarsgzip.parquet', hive_schema={\"read_id\": pl.String, \"chrom1\": pl.String, \"pos1\":pl.UInt32, \"chrom2\": pl.String, \"pos2\":pl.UInt32, \"strand1\":pl.String, \"strand2\":pl.String, \"pairs_type\":pl.String})\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "a9811098", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div><style>\n", | |
".dataframe > thead > tr,\n", | |
".dataframe > tbody > tr {\n", | |
" text-align: right;\n", | |
" white-space: pre-wrap;\n", | |
"}\n", | |
"</style>\n", | |
"<small>shape: (65_220_833, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>read_id</th><th>chrom1</th><th>pos1</th><th>chrom2</th><th>pos2</th><th>strand1</th><th>strand2</th><th>pairs_type</th></tr><tr><td>str</td><td>str</td><td>u32</td><td>str</td><td>u32</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>"#sorted: chr1-chr2-pos1-pos2"</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>"#shape: upper triangle"</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>"#genome_assembly: unknown"</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>"#chromsize: chr1 197195432"</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>"#chromsize: chr2 181748087"</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td></tr><tr><td>"."</td><td>"chrY"</td><td>2902279</td><td>"chrY"</td><td>2902428</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrY"</td><td>2902282</td><td>"chrY"</td><td>2902487</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrY"</td><td>2902286</td><td>"chrY"</td><td>2902430</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrY"</td><td>2902375</td><td>"chrY"</td><td>2902555</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrY"</td><td>2902405</td><td>"chrY"</td><td>2902560</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr></tbody></table></div>" | |
], | |
"text/plain": [ | |
"shape: (65_220_833, 8)\n", | |
"┌───────────────────────────┬────────┬─────────┬────────┬─────────┬─────────┬─────────┬────────────┐\n", | |
"│ read_id ┆ chrom1 ┆ pos1 ┆ chrom2 ┆ pos2 ┆ strand1 ┆ strand2 ┆ pairs_type │\n", | |
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", | |
"│ str ┆ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ str ┆ str │\n", | |
"╞═══════════════════════════╪════════╪═════════╪════════╪═════════╪═════════╪═════════╪════════════╡\n", | |
"│ #sorted: ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n", | |
"│ chr1-chr2-pos1-pos2 ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", | |
"│ #shape: upper triangle ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n", | |
"│ #genome_assembly: unknown ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n", | |
"│ #chromsize: chr1 ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n", | |
"│ 197195432 ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", | |
"│ #chromsize: chr2 ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n", | |
"│ 181748087 ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", | |
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", | |
"│ . ┆ chrY ┆ 2902279 ┆ chrY ┆ 2902428 ┆ + ┆ - ┆ LL │\n", | |
"│ . ┆ chrY ┆ 2902282 ┆ chrY ┆ 2902487 ┆ + ┆ - ┆ LL │\n", | |
"│ . ┆ chrY ┆ 2902286 ┆ chrY ┆ 2902430 ┆ + ┆ - ┆ LL │\n", | |
"│ . ┆ chrY ┆ 2902375 ┆ chrY ┆ 2902555 ┆ + ┆ - ┆ LL │\n", | |
"│ . ┆ chrY ┆ 2902405 ┆ chrY ┆ 2902560 ┆ + ┆ - ┆ LL │\n", | |
"└───────────────────────────┴────────┴─────────┴────────┴─────────┴─────────┴─────────┴────────────┘" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.collect()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "main", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.19" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment