{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import duckdb" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "con = duckdb.connect(\":memory:\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<duckdb.duckdb.DuckDBPyConnection at 0x2b02a82909f0>" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute(\"PRAGMA temp_directory='temp_dir';\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<duckdb.duckdb.DuckDBPyConnection at 0x2b02a82909f0>" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute(\"PRAGMA memory_limit='55GB';\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<duckdb.duckdb.DuckDBPyConnection at 0x2b02a82909f0>" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute(\"SET enable_progress_bar = true;\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 275 ms, sys: 43.1 ms, total: 318 ms\n", "Wall time: 324 ms\n" ] } ], "source": [ "%%time\n", "\n", "input_file = 'large_csv.csv'\n", "output_file = 'output.parquet'\n", "\n", "pairs = con.read_csv(\n", " input_file, \n", " sep=\"\\t\", \n", " header=False, \n", " parallel=True, \n", " skiprows=10000,\n", " names=[\"read_id\", \"chrom1\", \"pos1\", \"chrom2\", \"pos2\", \"strand1\", \"strand2\", \"pair_type\", \"mapq1\", \"mapq2\"]\n", ")\n", "\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.75 ms, sys: 0 ns, total: 1.75 ms\n", "Wall time: 1.78 ms\n" ] } ], "source": [ "%%time\n", "rel = pairs.order(\"chrom1, chrom2, pos1, pos2\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fdeae9be975f4976a0440b9ec26ce3f4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 18min 28s, sys: 21min 11s, total: 39min 40s\n", "Wall time: 6min 24s\n" ] } ], "source": [ "%%time\n", "rel.write_parquet(\"test.pq\", row_group_size=100_000_000, compression=\"zstd\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 64G out of memory\n", "# 120 OOM\n", "# with hive_partitioning\n", "# pragma memory_limit=64GB\n", "# SET memory_limit=64GB Kernel crashed\n", "\n", "#30min without any sort\n", "\n", "#4.30min \n", "#3.20, 3.55\n", "\n", "# 8 cores: 30 sec small file\n", "# 6 sec 1 line of giant\n", "\n", "\n", "# 2.5 hours for 7.5G file without any restrictions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# PREVIOUS VERSIONS" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "\n", "\n", "\n", "\n", "\n", "\n", "input_file = 'giant.parquet'\n", "output_file = 'giant_sorted_duckdb.parquet'\n", "schema = \"\"\"\n", " read_id: VARCHAR,\n", " chrom1: VARCHAR,\n", " pos1: BIGINT,\n", " chrom2: VARCHAR,\n", " pos2: BIGINT,\n", " strand1: VARCHAR,\n", " strand2: VARCHAR,\n", " pairs_type: VARCHAR\n", "\"\"\"\n", "con.execute(f\"\"\"\n", " COPY (\n", " SELECT *\n", " FROM read_parquet('{input_file}')\n", " ORDER BY chrom1, chrom2, pos1, pos2\n", " \n", " ) TO '{output_file}' (FORMAT PARQUET);\n", "\"\"\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "\n", "input_file = 'giant.parquet'\n", "output_file = 'giant_sorted_duckdb.parquet'\n", "schema = \"\"\"\n", " read_id: VARCHAR,\n", " chrom1: VARCHAR,\n", " pos1: BIGINT,\n", " chrom2: VARCHAR,\n", " pos2: BIGINT,\n", " strand1: VARCHAR,\n", " strand2: VARCHAR,\n", " pairs_type: VARCHAR\n", "\"\"\"\n", "con.execute(f\"\"\"\n", " COPY (\n", " SELECT CAST(read_id AS VARCHAR), CAST(chrom1 AS VARCHAR), \n", " CAST(pos1 AS BIGINT), CAST(chrom2 AS VARCHAR), \n", " CAST(pos2 AS BIGINT), CAST(strand1 AS VARCHAR),\n", " CAST(strand2 AS VARCHAR), CAST(pairs_type AS VARCHAR)\n", " FROM read_parquet('{input_file}')\n", " WHERE read_id NOT LIKE '#%' -- !!!!!!!!!!!!!! Filter out lines that start with '#samheader'\n", " ORDER BY chrom1, chrom2, pos1, pos2\n", " ) TO '{output_file}' (FORMAT PARQUET);\n", "\"\"\")\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "\n", "input_file = 'giant.csv'\n", "output_file = 'giant.parquet'\n", "schema={\n", " 'read_id': 'VARCHAR',\n", " 'chrom1': 'VARCHAR',\n", " 'pos1': 'INTEGER',\n", " 'chrom2': 'VARCHAR',\n", " 'pos2': 'INTEGER',\n", " 'strand1': 'VARCHAR',\n", " 'strand2': 'VARCHAR',\n", " 'pairs_type': 'VARCHAR'\n", "}\n", "\n", "con.execute(f\"\"\"\n", " COPY (\n", " SELECT *\n", " FROM read_csv('{input_file}', delim='\\t', columns = {schema}, header=true, auto_detect=false, skip=1000)\n", " ORDER BY chrom1, chrom2, pos1, pos2\n", " \n", " ) TO '{output_file}' (FORMAT PARQUET);\n", "\"\"\")\n" ] } ], "metadata": { "kernelspec": { "display_name": "main", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 2 }