Last active
September 8, 2024 06:45
-
-
Save simicd/f0e8fcd277bb3fa932369551b97d5b07 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Apache Arrow" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 1 Compare performance of csv, Parquet and Arrow" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pyarrow.parquet as pq\n", | |
"import pyarrow as pa\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import os\n", | |
"import psutil" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 1.1 Load and prepare data " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"## Read Palmer Station Penguin dataset from GitHub\n", | |
"df = pd.read_csv(\"https://raw.githubusercontent.com/allisonhorst/\"\n", | |
" \"palmerpenguins/47a3476d2147080e7ceccef4cf70105c808f2cbf/\"\n", | |
" \"data-raw/penguins_raw.csv\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Increase dataset to 1m rows and reset index\n", | |
"df = df.sample(1_000_000, replace=True).reset_index(drop=True)\n", | |
"\n", | |
"\n", | |
"# Update sample number (0 to 999'999)\n", | |
"df[\"Sample Number\"] = df.index\n", | |
"# Add some random variation to numeric columns\n", | |
"df[[\"Culmen Length (mm)\", \"Culmen Depth (mm)\", \n", | |
" \"Flipper Length (mm)\", \"Body Mass (g)\"]] = df[[\"Culmen Length (mm)\", \"Culmen Depth (mm)\", \n", | |
" \"Flipper Length (mm)\", \"Body Mass (g)\"]] \\\n", | |
" + np.random.rand(df.shape[0], 4)\n", | |
"\n", | |
"# Create dataframe where missing numeric values are filled with zero\n", | |
"df_nonan = df.copy()\n", | |
"df_nonan[[\"Culmen Length (mm)\", \"Culmen Depth (mm)\", \n", | |
" \"Flipper Length (mm)\", \"Body Mass (g)\"]] = df[[\"Culmen Length (mm)\", \"Culmen Depth (mm)\", \n", | |
" \"Flipper Length (mm)\", \"Body Mass (g)\"]].fillna(0)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 1.2 Write to disk " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pyarrow\\pandas_compat.py:383: FutureWarning: RangeIndex._start is deprecated and will be removed in a future version. Use RangeIndex.start instead\n", | |
" 'start': level._start,\n", | |
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pyarrow\\pandas_compat.py:384: FutureWarning: RangeIndex._stop is deprecated and will be removed in a future version. Use RangeIndex.stop instead\n", | |
" 'stop': level._stop,\n", | |
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pyarrow\\pandas_compat.py:385: FutureWarning: RangeIndex._step is deprecated and will be removed in a future version. Use RangeIndex.step instead\n", | |
" 'step': level._step\n" | |
] | |
} | |
], | |
"source": [ | |
"# Write to csv\n", | |
"df.to_csv(\"penguin-dataset.csv\")\n", | |
"\n", | |
"# Write to parquet\n", | |
"df.to_parquet(\"penguin-dataset.parquet\")\n", | |
"\n", | |
"# Write to Arrow\n", | |
"# Convert from pandas to Arrow\n", | |
"table = pa.Table.from_pandas(df)\n", | |
"# Write out to file\n", | |
"with pa.OSFile('penguin-dataset.arrow', 'wb') as sink:\n", | |
" with pa.RecordBatchFileWriter(sink, table.schema) as writer:\n", | |
" writer.write_table(table)\n", | |
"\n", | |
"# Convert from no-NaN pandas to Arrow\n", | |
"table_nonan = pa.Table.from_pandas(df_nonan)\n", | |
"# Write out to file\n", | |
"with pa.OSFile('penguin-dataset-nonan.arrow', 'wb') as sink:\n", | |
" with pa.RecordBatchFileWriter(sink, table_nonan.schema) as writer:\n", | |
" writer.write_table(table_nonan)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 1.3 Reading time - calculate average of numeric column" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### 1.3.1 Read csv and calculate mean" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"3.63 s ± 159 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"pd.read_csv(\"penguin-dataset.csv\")[\"Flipper Length (mm)\"].mean()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### 1.3.2 Read parquet and calculate mean" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"56.0 ms ± 8.72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"pd.read_parquet(\"penguin-dataset.parquet\", columns=[\"Flipper Length (mm)\"]).mean()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### 1.3.3 Read Arrow using file API" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"152 ms ± 17.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"with pa.OSFile('penguin-dataset.arrow', 'rb') as source:\n", | |
" table = pa.ipc.open_file(source).read_all().column(\"Flipper Length (mm)\")\n", | |
"result = table.to_pandas().mean()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### 1.3.4 Read Arrow with memory-mapped API with missing values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"17.9 ms ± 2.4 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"source = pa.memory_map('penguin-dataset.arrow', 'r')\n", | |
"table = pa.ipc.RecordBatchFileReader(source).read_all().column(\"Flipper Length (mm)\")\n", | |
"result = table.to_pandas().mean()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### 1.3.5 Read Arrow with memory-mapped API without missing values (zero-copy)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"6.15 ms ± 491 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"source = pa.memory_map('penguin-dataset-nonan.arrow', 'r')\n", | |
"table = pa.ipc.RecordBatchFileReader(source).read_all().column(\"Flipper Length (mm)\")\n", | |
"result = table.to_pandas().mean()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 1.4 Memory consumption - read column" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Measure initial memory consumption\n", | |
"memory_init = psutil.Process(os.getpid()).memory_info().rss >> 20" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### 1.4.1 Read csv" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"col_csv = pd.read_csv(\"penguin-dataset.csv\")[\"Flipper Length (mm)\"]\n", | |
"memory_post_csv = psutil.Process(os.getpid()).memory_info().rss >> 20" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### 1.4.2 Read parquet" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"col_parquet = pd.read_parquet(\"penguin-dataset.parquet\", columns=[\"Flipper Length (mm)\"])\n", | |
"memory_post_parquet = psutil.Process(os.getpid()).memory_info().rss >> 20" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### 1.4.3 Read Arrow using file API" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with pa.OSFile('penguin-dataset.arrow', 'rb') as source:\n", | |
" col_arrow_file = pa.ipc.open_file(source).read_all().column(\"Flipper Length (mm)\").to_pandas()\n", | |
"memory_post_arrowos = psutil.Process(os.getpid()).memory_info().rss >> 20" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### 1.4.4 Read Arrow with memory-mapped API with missing values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"source = pa.memory_map('penguin-dataset.arrow', 'r')\n", | |
"table_mmap = pa.ipc.RecordBatchFileReader(source).read_all().column(\"Flipper Length (mm)\")\n", | |
"col_arrow_mapped = table_mmap.to_pandas()\n", | |
"memory_post_arrowmmap = psutil.Process(os.getpid()).memory_info().rss >> 20" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### 1.4.5 Read Arrow with memory-mapped API without missing values (zero-copy)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"source = pa.memory_map('penguin-dataset-nonan.arrow', 'r')\n", | |
"table_mmap_zc = pa.ipc.RecordBatchFileReader(source).read_all().column(\"Flipper Length (mm)\")\n", | |
"col_arrow_mapped_zc = table_mmap_zc.to_pandas()\n", | |
"memory_post_arrowmmap_zc = psutil.Process(os.getpid()).memory_info().rss >> 20" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### 1.4.6 Display memory consupmtion" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"csv: 48\n", | |
"Parquet: 9\n", | |
"Arrow file API: 189\n", | |
"Arrow memory-mapped API with NaNs: 15\n", | |
"Arrow memory-mapped API (zero-copy): 0\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# Print memory consumption\n", | |
"print(f\"csv: {memory_post_csv - memory_init}\\n\"\n", | |
" f\"Parquet: {memory_post_parquet - memory_post_csv}\\n\"\n", | |
" f\"Arrow file API: {memory_post_arrowos - memory_post_parquet}\\n\"\n", | |
" f\"Arrow memory-mapped API with NaNs: {memory_post_arrowmmap - memory_post_arrowos}\\n\"\n", | |
" f\"Arrow memory-mapped API (zero-copy): {memory_post_arrowmmap_zc - memory_post_arrowmmap}\\n\")" | |
] | |
} | |
], | |
"metadata": { | |
"hide_input": false, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.1" | |
}, | |
"toc": { | |
"base_numbering": 1, | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": true, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment