Last active
January 30, 2019 20:52
-
-
Save cchwala/568562d9f5eb54de682ac2686372279b to your computer and use it in GitHub Desktop.
pandas_limit_nan_interpolation_to_max_gap_length.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Testing the performance of different implementations for limiting `DataFrame.interpolate` to only do something for NaN-gaps shorter then a defined limit\n", | |
"\n", | |
"The code stems from [this stackoverflow thread](https://stackoverflow.com/questions/48933165/pandas-dataframe-interpolating-in-sections-delimited-by-indexes)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def make_data():\n", | |
" dictx = {'col1':[1,'nan','nan','nan',5,'nan',7,'nan',9,'nan','nan','nan',13]*1000,\n", | |
" 'col2':[20,'nan','nan','nan',22,'nan',25,'nan',30,'nan','nan','nan',25]*1000,\n", | |
" 'col3':[15,'nan','nan','nan',10,'nan',14,'nan',13,'nan','nan','nan',9]*1000}\n", | |
" return pd.DataFrame(dictx).astype(float)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Method 1 " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = make_data()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 624 ms, sys: 11.2 ms, total: 635 ms\n", | |
"Wall time: 651 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"limit = 2\n", | |
"notnull = pd.notnull(df).all(axis=1)\n", | |
"# assign group numbers to the rows of df. Each group starts with a non-null row,\n", | |
"# followed by null rows\n", | |
"group = notnull.cumsum()\n", | |
"# find the index of groups having length > limit\n", | |
"ignore = (df.groupby(group).filter(lambda grp: len(grp)>limit)).index\n", | |
"# only ignore rows which are null\n", | |
"ignore = df.loc[~notnull].index.intersection(ignore)\n", | |
"keep = df.index.difference(ignore)\n", | |
"# interpolate only the kept rows\n", | |
"df.loc[keep] = df.loc[keep].interpolate()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Method 2 (this is 4 times slower)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df2 = make_data()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 2.46 s, sys: 83.3 ms, total: 2.55 s\n", | |
"Wall time: 2.54 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"def interp(df, limit):\n", | |
" d = df.notna().rolling(limit + 1).agg(any).fillna(1)\n", | |
" d = pd.concat({\n", | |
" i: d.shift(-i).fillna(1)\n", | |
" for i in range(limit + 1)\n", | |
" }).prod(level=1)\n", | |
"\n", | |
" return df.interpolate(limit=limit).where(d.astype(bool))\n", | |
"\n", | |
"df2 = df2.pipe(interp, 2)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Check if both generate the same result " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pd.testing.assert_frame_equal(df, df2)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# The pure `DataFrame.interpolate()` is 50 times faster than method 1... " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = make_data()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 11 ms, sys: 1.89 ms, total: 12.9 ms\n", | |
"Wall time: 11.7 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"foo = df.interpolate(limit=2)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# My new Method 3 \n", | |
"using `cumsum()` of NaNs and `diff()` to calculate the gap width" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def interpolate_nan_max_gap_series(s, maxgap): \n", | |
" df = pd.DataFrame(s)\n", | |
" \n", | |
" # Calculate NaN gap width\n", | |
" df['nan_gap_width'] = (s\n", | |
" .isnull()\n", | |
" .cumsum()\n", | |
" .loc[~s.isnull()]\n", | |
" .diff()\n", | |
" )\n", | |
" df['nan_gap_width'] = df['nan_gap_width'].fillna(method='bfill')\n", | |
" \n", | |
" # Interpolate all NaNs but fill back in those in the\n", | |
" # gaps that are longer than the allowed limite\n", | |
" nan_ix = s.isnull()\n", | |
" s = s.interpolate()\n", | |
" s[(df['nan_gap_width'] > maxgap) & nan_ix] = pd.np.nan\n", | |
" \n", | |
" return s\n", | |
"\n", | |
"def interpolate_nan_max_gap_frame(df, maxgap):\n", | |
" for column_name in df:\n", | |
" df[column_name] = interpolate_nan_max_gap_series(df[column_name], maxgap=maxgap)\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df3 = make_data()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 9.42 ms, sys: 1.82 ms, total: 11.2 ms\n", | |
"Wall time: 10.1 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"res = interpolate_nan_max_gap_series(df3.col1, maxgap=2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df3 = make_data()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 26.1 ms, sys: 2.27 ms, total: 28.4 ms\n", | |
"Wall time: 27.1 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"res = interpolate_nan_max_gap_frame(df3, maxgap=2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df3 = make_data()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 4.05 ms, sys: 1.22 ms, total: 5.26 ms\n", | |
"Wall time: 4.32 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"res = df3.col1.interpolate(limit=3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df3 = make_data()\n", | |
"res = interpolate_nan_max_gap_frame(df3, maxgap=2)\n", | |
"pd.testing.assert_frame_equal(df2, res)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.15" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment