Skip to content

Instantly share code, notes, and snippets.

@simicd
Last active September 8, 2024 06:45
Show Gist options
  • Save simicd/f0e8fcd277bb3fa932369551b97d5b07 to your computer and use it in GitHub Desktop.
Save simicd/f0e8fcd277bb3fa932369551b97d5b07 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Apache Arrow"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1 Compare performance of csv, Parquet and Arrow"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pyarrow.parquet as pq\n",
"import pyarrow as pa\n",
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import psutil"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.1 Load and prepare data "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"## Read Palmer Station Penguin dataset from GitHub\n",
"df = pd.read_csv(\"https://raw.githubusercontent.com/allisonhorst/\"\n",
" \"palmerpenguins/47a3476d2147080e7ceccef4cf70105c808f2cbf/\"\n",
" \"data-raw/penguins_raw.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Increase dataset to 1m rows and reset index\n",
"df = df.sample(1_000_000, replace=True).reset_index(drop=True)\n",
"\n",
"\n",
"# Update sample number (0 to 999'999)\n",
"df[\"Sample Number\"] = df.index\n",
"# Add some random variation to numeric columns\n",
"df[[\"Culmen Length (mm)\", \"Culmen Depth (mm)\", \n",
" \"Flipper Length (mm)\", \"Body Mass (g)\"]] = df[[\"Culmen Length (mm)\", \"Culmen Depth (mm)\", \n",
" \"Flipper Length (mm)\", \"Body Mass (g)\"]] \\\n",
" + np.random.rand(df.shape[0], 4)\n",
"\n",
"# Create dataframe where missing numeric values are filled with zero\n",
"df_nonan = df.copy()\n",
"df_nonan[[\"Culmen Length (mm)\", \"Culmen Depth (mm)\", \n",
" \"Flipper Length (mm)\", \"Body Mass (g)\"]] = df[[\"Culmen Length (mm)\", \"Culmen Depth (mm)\", \n",
" \"Flipper Length (mm)\", \"Body Mass (g)\"]].fillna(0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.2 Write to disk "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pyarrow\\pandas_compat.py:383: FutureWarning: RangeIndex._start is deprecated and will be removed in a future version. Use RangeIndex.start instead\n",
" 'start': level._start,\n",
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pyarrow\\pandas_compat.py:384: FutureWarning: RangeIndex._stop is deprecated and will be removed in a future version. Use RangeIndex.stop instead\n",
" 'stop': level._stop,\n",
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pyarrow\\pandas_compat.py:385: FutureWarning: RangeIndex._step is deprecated and will be removed in a future version. Use RangeIndex.step instead\n",
" 'step': level._step\n"
]
}
],
"source": [
"# Write to csv\n",
"df.to_csv(\"penguin-dataset.csv\")\n",
"\n",
"# Write to parquet\n",
"df.to_parquet(\"penguin-dataset.parquet\")\n",
"\n",
"# Write to Arrow\n",
"# Convert from pandas to Arrow\n",
"table = pa.Table.from_pandas(df)\n",
"# Write out to file\n",
"with pa.OSFile('penguin-dataset.arrow', 'wb') as sink:\n",
" with pa.RecordBatchFileWriter(sink, table.schema) as writer:\n",
" writer.write_table(table)\n",
"\n",
"# Convert from no-NaN pandas to Arrow\n",
"table_nonan = pa.Table.from_pandas(df_nonan)\n",
"# Write out to file\n",
"with pa.OSFile('penguin-dataset-nonan.arrow', 'wb') as sink:\n",
" with pa.RecordBatchFileWriter(sink, table_nonan.schema) as writer:\n",
" writer.write_table(table_nonan)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.3 Reading time - calculate average of numeric column"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1.3.1 Read csv and calculate mean"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.63 s ± 159 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"pd.read_csv(\"penguin-dataset.csv\")[\"Flipper Length (mm)\"].mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1.3.2 Read parquet and calculate mean"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"56.0 ms ± 8.72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"pd.read_parquet(\"penguin-dataset.parquet\", columns=[\"Flipper Length (mm)\"]).mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1.3.3 Read Arrow using file API"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"152 ms ± 17.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"with pa.OSFile('penguin-dataset.arrow', 'rb') as source:\n",
" table = pa.ipc.open_file(source).read_all().column(\"Flipper Length (mm)\")\n",
"result = table.to_pandas().mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1.3.4 Read Arrow with memory-mapped API with missing values"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"17.9 ms ± 2.4 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"source = pa.memory_map('penguin-dataset.arrow', 'r')\n",
"table = pa.ipc.RecordBatchFileReader(source).read_all().column(\"Flipper Length (mm)\")\n",
"result = table.to_pandas().mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1.3.5 Read Arrow with memory-mapped API without missing values (zero-copy)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6.15 ms ± 491 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"source = pa.memory_map('penguin-dataset-nonan.arrow', 'r')\n",
"table = pa.ipc.RecordBatchFileReader(source).read_all().column(\"Flipper Length (mm)\")\n",
"result = table.to_pandas().mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.4 Memory consumption - read column"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Measure initial memory consumption\n",
"memory_init = psutil.Process(os.getpid()).memory_info().rss >> 20"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1.4.1 Read csv"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"col_csv = pd.read_csv(\"penguin-dataset.csv\")[\"Flipper Length (mm)\"]\n",
"memory_post_csv = psutil.Process(os.getpid()).memory_info().rss >> 20"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1.4.2 Read parquet"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"col_parquet = pd.read_parquet(\"penguin-dataset.parquet\", columns=[\"Flipper Length (mm)\"])\n",
"memory_post_parquet = psutil.Process(os.getpid()).memory_info().rss >> 20"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1.4.3 Read Arrow using file API"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"with pa.OSFile('penguin-dataset.arrow', 'rb') as source:\n",
" col_arrow_file = pa.ipc.open_file(source).read_all().column(\"Flipper Length (mm)\").to_pandas()\n",
"memory_post_arrowos = psutil.Process(os.getpid()).memory_info().rss >> 20"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1.4.4 Read Arrow with memory-mapped API with missing values"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"source = pa.memory_map('penguin-dataset.arrow', 'r')\n",
"table_mmap = pa.ipc.RecordBatchFileReader(source).read_all().column(\"Flipper Length (mm)\")\n",
"col_arrow_mapped = table_mmap.to_pandas()\n",
"memory_post_arrowmmap = psutil.Process(os.getpid()).memory_info().rss >> 20"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1.4.5 Read Arrow with memory-mapped API without missing values (zero-copy)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"source = pa.memory_map('penguin-dataset-nonan.arrow', 'r')\n",
"table_mmap_zc = pa.ipc.RecordBatchFileReader(source).read_all().column(\"Flipper Length (mm)\")\n",
"col_arrow_mapped_zc = table_mmap_zc.to_pandas()\n",
"memory_post_arrowmmap_zc = psutil.Process(os.getpid()).memory_info().rss >> 20"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1.4.6 Display memory consupmtion"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"csv: 48\n",
"Parquet: 9\n",
"Arrow file API: 189\n",
"Arrow memory-mapped API with NaNs: 15\n",
"Arrow memory-mapped API (zero-copy): 0\n",
"\n"
]
}
],
"source": [
"# Print memory consumption\n",
"print(f\"csv: {memory_post_csv - memory_init}\\n\"\n",
" f\"Parquet: {memory_post_parquet - memory_post_csv}\\n\"\n",
" f\"Arrow file API: {memory_post_arrowos - memory_post_parquet}\\n\"\n",
" f\"Arrow memory-mapped API with NaNs: {memory_post_arrowmmap - memory_post_arrowos}\\n\"\n",
" f\"Arrow memory-mapped API (zero-copy): {memory_post_arrowmmap_zc - memory_post_arrowmmap}\\n\")"
]
}
],
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": true,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment