Skip to content

Instantly share code, notes, and snippets.

@ayaksvals
Last active September 23, 2024 16:35
Show Gist options
  • Save ayaksvals/c43aa98953a0174b7cd7a0315555df65 to your computer and use it in GitHub Desktop.
Save ayaksvals/c43aa98953a0174b7cd7a0315555df65 to your computer and use it in GitHub Desktop.
Converter from pairs.gz to Parquet. Contains Polars, DuckDB, and Dask versions.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import duckdb\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"con = duckdb.connect(\":memory:\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x2b1e4a25d5f0>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute(\"PRAGMA temp_directory='/users/slavska.olesia/scratch/slavska.olesia';\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x2b1e4a25d5f0>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute(\"PRAGMA memory_limit='55GB';\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x2b1e4a25d5f0>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute(\"SET enable_progress_bar = true;\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 257 ms, sys: 66.2 ms, total: 323 ms\n",
"Wall time: 354 ms\n"
]
}
],
"source": [
"%%time\n",
"\n",
"import gzip\n",
"\n",
"input_file = '/users/slavska.olesia/projects/lesia/Micro-C_RAD21_IAA_rep4.mm10.dups.pairs.gz'\n",
"\n",
"# The .pairs header is a variable-length run of '#'-prefixed lines, so count it\n",
"# here instead of hard-coding how many rows to skip: a fixed number either\n",
"# drops real data rows or leaves header lines in the table.\n",
"with gzip.open(input_file, \"rt\") as fh:\n",
"    n_header_lines = 0\n",
"    for line in fh:\n",
"        if not line.startswith(\"#\"):\n",
"            break\n",
"        n_header_lines += 1\n",
"\n",
"# Lazily define the relation; nothing is materialised until it is written out.\n",
"pairs = con.read_csv(\n",
"    input_file,\n",
"    sep=\"\\t\",\n",
"    header=False,\n",
"    parallel=True,\n",
"    skiprows=n_header_lines,\n",
"    names=[\"read_id\", \"chrom1\", \"pos1\", \"chrom2\", \"pos2\", \"strand1\", \"strand2\", \"pair_type\", \"mapq1\", \"mapq2\"]\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b7424b19220a429d81f988a52a355ea1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 7min 25s, sys: 2min 15s, total: 9min 41s\n",
"Wall time: 1min 44s\n"
]
}
],
"source": [
"%%time\n",
"pairs.write_parquet(\"test.pq\", row_group_size=1_000_000, compression=\"zstd\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "main",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment