Skip to content

Instantly share code, notes, and snippets.

@ayaksvals
Created August 13, 2024 10:23
Show Gist options
  • Save ayaksvals/94f9680c96337b7aefdfb125bd440336 to your computer and use it in GitHub Desktop.
Save ayaksvals/94f9680c96337b7aefdfb125bd440336 to your computer and use it in GitHub Desktop.
Sort parquet
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d30d1c6f",
"metadata": {},
"outputs": [],
"source": [
"import bioframe\n",
"import pypairix\n",
"import dask.dataframe as dd\n",
"import dask.array as da\n",
"import polars as pl\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f15833ff",
"metadata": {},
"outputs": [],
"source": [
"# TODO: expose further reader options later, e.g. columns, parallel, use_statistics,\n",
"# low_memory, memory_map + use_pyarrow=True.\n",
"\n",
"# Dtypes the pairs columns are expected to have once loaded.\n",
"PAIRS_SCHEMA = {\n",
"    \"read_id\": pl.String,\n",
"    \"chrom1\": pl.String,\n",
"    \"pos1\": pl.UInt32,\n",
"    \"chrom2\": pl.String,\n",
"    \"pos2\": pl.UInt32,\n",
"    \"strand1\": pl.String,\n",
"    \"strand2\": pl.String,\n",
"    \"pairs_type\": pl.String,\n",
"}\n",
"\n",
"# `hive_schema` only describes Hive partition columns (the key=value parts of a\n",
"# partitioned dataset path), so it does not set the dtypes of columns stored inside\n",
"# this single file. Enforce the expected dtypes with an explicit lazy cast instead;\n",
"# the cast is strict, so a value that does not fit its target dtype raises at collect time.\n",
"df = pl.scan_parquet('pairsToPolarsgzip.parquet').cast(PAIRS_SCHEMA)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a9811098",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (65_220_657, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>read_id</th><th>chrom1</th><th>pos1</th><th>chrom2</th><th>pos2</th><th>strand1</th><th>strand2</th><th>pairs_type</th></tr><tr><td>str</td><td>str</td><td>u32</td><td>str</td><td>u32</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000037</td><td>&quot;chr1&quot;</td><td>3139797</td><td>&quot;-&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000088</td><td>&quot;chr1&quot;</td><td>42957984</td><td>&quot;-&quot;</td><td>&quot;+&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000112</td><td>&quot;chr1&quot;</td><td>5448692</td><td>&quot;-&quot;</td><td>&quot;+&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000227</td><td>&quot;chr1&quot;</td><td>27069655</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chr1&quot;</td><td>3000228</td><td>&quot;chr1&quot;</td><td>87485253</td><td>&quot;+&quot;</td><td>&quot;+&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>2902279</td><td>&quot;chrY&quot;</td><td>2902428</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>2902282</td><td>&quot;chrY&quot;</td><td>2902487</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>2902286</td><td>&quot;chrY&quot;</td><td>2902430</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>2902375</td><td>&quot;chrY&quot;</td><td>2902555</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>2902405</td><td>&quot;chrY&quot;</td><td>2902560</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (65_220_657, 8)\n",
"┌─────────┬────────┬─────────┬────────┬──────────┬─────────┬─────────┬────────────┐\n",
"│ read_id ┆ chrom1 ┆ pos1 ┆ chrom2 ┆ pos2 ┆ strand1 ┆ strand2 ┆ pairs_type │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ str ┆ str │\n",
"╞═════════╪════════╪═════════╪════════╪══════════╪═════════╪═════════╪════════════╡\n",
"│ . ┆ chr1 ┆ 3000037 ┆ chr1 ┆ 3139797 ┆ - ┆ - ┆ LL │\n",
"│ . ┆ chr1 ┆ 3000088 ┆ chr1 ┆ 42957984 ┆ - ┆ + ┆ LL │\n",
"│ . ┆ chr1 ┆ 3000112 ┆ chr1 ┆ 5448692 ┆ - ┆ + ┆ LL │\n",
"│ . ┆ chr1 ┆ 3000227 ┆ chr1 ┆ 27069655 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chr1 ┆ 3000228 ┆ chr1 ┆ 87485253 ┆ + ┆ + ┆ LL │\n",
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
"│ . ┆ chrY ┆ 2902279 ┆ chrY ┆ 2902428 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chrY ┆ 2902282 ┆ chrY ┆ 2902487 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chrY ┆ 2902286 ┆ chrY ┆ 2902430 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chrY ┆ 2902375 ┆ chrY ┆ 2902555 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chrY ┆ 2902405 ┆ chrY ┆ 2902560 ┆ + ┆ - ┆ LL │\n",
"└─────────┴────────┴─────────┴────────┴──────────┴─────────┴─────────┴────────────┘"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\"Sort the pairs table.\n",
"\n",
"Sort keys, in priority order:\n",
"  * chrom1, chrom2    -- lexicographic (string) order,\n",
"  * pos1, pos2        -- numeric order,\n",
"  * strand1, strand2  -- lexicographic order,\n",
"  * pairs_type        -- lexicographic order.\n",
"\n",
"Rows whose pos1 is null are dropped before sorting.\n",
"\"\"\"\n",
"# pairs_type is the last key: it only breaks ties left by the other keys,\n",
"# so the ordering produced by the preceding keys is unchanged.\n",
"SORT_KEYS = [\"chrom1\", \"chrom2\", \"pos1\", \"pos2\", \"strand1\", \"strand2\", \"pairs_type\"]\n",
"\n",
"filtered_df = df.filter(pl.col(\"pos1\").is_not_null())\n",
"\n",
"# The plan stays lazy until collect(); the bare last expression is the cell's displayed result.\n",
"filtered_df.sort(SORT_KEYS).collect()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "main",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment