Skip to content

Instantly share code, notes, and snippets.

@ayaksvals
Created August 13, 2024 08:16
Show Gist options
  • Save ayaksvals/e7bb47a8e18eb5857ecee17b5d530f57 to your computer and use it in GitHub Desktop.
Save ayaksvals/e7bb47a8e18eb5857ecee17b5d530f57 to your computer and use it in GitHub Desktop.
Easy way to convert pairs to parquet
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "d30d1c6f",
"metadata": {},
"outputs": [],
"source": [
"import bioframe\n",
"import pypairix\n",
"import dask.dataframe as dd\n",
"import dask.array as da\n",
"import polars as pl\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "489c6689",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<i>naive plan: (run <b>LazyFrame.explain(optimized=True)</b> to see the optimized plan)</i>\n",
" <p></p>\n",
" <div>Csv SCAN [NIPBL_R1.nodups.pairs.gz]<p></p>PROJECT */8 COLUMNS</div>"
],
"text/plain": [
"<LazyFrame at 0x2ABFF19300D0>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Add in the future ENUM type for chr1 and chr2\n",
"\n",
"polarsDf=pl.scan_csv(\"NIPBL_R1.nodups.pairs.gz\", separator=\"\\t\", schema={\"read_id\": pl.String, \"chrom1\": pl.String, \"pos1\":pl.UInt32, \"chrom2\": pl.String, \"pos2\":pl.UInt32, \"strand1\":pl.String, \"strand2\":pl.String, \"pairs_type\":pl.String}, ignore_errors=True)\n",
"#polarsDf=polarsDf.collect()\n",
"#polarsDf"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1d911efd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"116"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To find out all the header columns and amount of them\n",
"# df.filter(pl.col(\"read_id\").str.contains(\"#\")).height"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "bb9847b0",
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"Compression alternatives:\n",
" “zstd”: good compression performance\n",
" “lz4”: fast compression/decompression\n",
" “snappy”: more backwards compatibility guarantees(older parquet readers)\n",
"\n",
"row_group_size optimal?\n",
"\n",
"No compression Level, with => more time\n",
"lz4 777.02MB 1m 22s 10000\n",
"zstd 684.55MB 1m 23s no rows\n",
"snappy 773.80MB 1m 20 no rows\n",
"gzip 650.21MB 1m 44s n rows\n",
"\"\"\"\n",
"polarsDf.sink_parquet(\"pairsToPolarsSnappy.parquet\", compression=\"snappy\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "f15833ff",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<i>naive plan: (run <b>LazyFrame.explain(optimized=True)</b> to see the optimized plan)</i>\n",
" <p></p>\n",
" <div>Parquet SCAN [pairsToPolarsgzip.parquet]<p></p>PROJECT */8 COLUMNS</div>"
],
"text/plain": [
"<LazyFrame at 0x2AC0091B2880>"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# add later properties, like: columns, parallel, use_statistics, low_memory, memory_map+use_pyarrow=True.\n",
"#df = polars.read_parquet('concatenated.parquet')\n",
"\n",
"#Lazy Df scan\n",
"df = pl.scan_parquet('pairsToPolarsgzip.parquet', hive_schema={\"read_id\": pl.String, \"chrom1\": pl.String, \"pos1\":pl.UInt32, \"chrom2\": pl.String, \"pos2\":pl.UInt32, \"strand1\":pl.String, \"strand2\":pl.String, \"pairs_type\":pl.String})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "a9811098",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (65_220_833, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>read_id</th><th>chrom1</th><th>pos1</th><th>chrom2</th><th>pos2</th><th>strand1</th><th>strand2</th><th>pairs_type</th></tr><tr><td>str</td><td>str</td><td>u32</td><td>str</td><td>u32</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>&quot;#sorted: chr1-chr2-pos1-pos2&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>&quot;#shape: upper triangle&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>&quot;#genome_assembly: unknown&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>&quot;#chromsize: chr1 197195432&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>&quot;#chromsize: chr2 181748087&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>2902279</td><td>&quot;chrY&quot;</td><td>2902428</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>2902282</td><td>&quot;chrY&quot;</td><td>2902487</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>2902286</td><td>&quot;chrY&quot;</td><td>2902430</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>2902375</td><td>&quot;chrY&quot;</td><td>2902555</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>2902405</td><td>&quot;chrY&quot;</td><td>2902560</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (65_220_833, 8)\n",
"┌───────────────────────────┬────────┬─────────┬────────┬─────────┬─────────┬─────────┬────────────┐\n",
"│ read_id ┆ chrom1 ┆ pos1 ┆ chrom2 ┆ pos2 ┆ strand1 ┆ strand2 ┆ pairs_type │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ str ┆ str │\n",
"╞═══════════════════════════╪════════╪═════════╪════════╪═════════╪═════════╪═════════╪════════════╡\n",
"│ #sorted: ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n",
"│ chr1-chr2-pos1-pos2 ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ #shape: upper triangle ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n",
"│ #genome_assembly: unknown ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n",
"│ #chromsize: chr1 ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n",
"│ 197195432 ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ #chromsize: chr2 ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n",
"│ 181748087 ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
"│ . ┆ chrY ┆ 2902279 ┆ chrY ┆ 2902428 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chrY ┆ 2902282 ┆ chrY ┆ 2902487 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chrY ┆ 2902286 ┆ chrY ┆ 2902430 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chrY ┆ 2902375 ┆ chrY ┆ 2902555 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chrY ┆ 2902405 ┆ chrY ┆ 2902560 ┆ + ┆ - ┆ LL │\n",
"└───────────────────────────┴────────┴─────────┴────────┴─────────┴─────────┴─────────┴────────────┘"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.collect()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "main",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment