cchwala · January 30, 2019 20:52
diff --git a/pandas_limit_nan_interpolation_to_max_gap_length.ipynb b/pandas_limit_nan_interpolation_to_max_gap_length.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Testing the performance of different implementations for limiting `DataFrame.interpolate` to only do something for NaN-gaps shorter then a defined limit\n",
    "\n",
    "The code stems from [this stackoverflow thread](https://stackoverflow.com/questions/48933165/pandas-dataframe-interpolating-in-sections-delimited-by-indexes)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_data():\n",
    "    dictx = {'col1':[1,'nan','nan','nan',5,'nan',7,'nan',9,'nan','nan','nan',13]*1000,\n",
    "    'col2':[20,'nan','nan','nan',22,'nan',25,'nan',30,'nan','nan','nan',25]*1000,\n",
    "    'col3':[15,'nan','nan','nan',10,'nan',14,'nan',13,'nan','nan','nan',9]*1000}\n",
    "    return pd.DataFrame(dictx).astype(float)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Method 1 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = make_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 624 ms, sys: 11.2 ms, total: 635 ms\n",
      "Wall time: 651 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "limit = 2\n",
    "notnull = pd.notnull(df).all(axis=1)\n",
    "# assign group numbers to the rows of df. Each group starts with a non-null row,\n",
    "# followed by null rows\n",
    "group = notnull.cumsum()\n",
    "# find the index of groups having length > limit\n",
    "ignore = (df.groupby(group).filter(lambda grp: len(grp)>limit)).index\n",
    "# only ignore rows which are null\n",
    "ignore = df.loc[~notnull].index.intersection(ignore)\n",
    "keep = df.index.difference(ignore)\n",
    "# interpolate only the kept rows\n",
    "df.loc[keep] = df.loc[keep].interpolate()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Method 2 (this is 4 times slower)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2 = make_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.46 s, sys: 83.3 ms, total: 2.55 s\n",
      "Wall time: 2.54 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "def interp(df, limit):\n",
    "    d = df.notna().rolling(limit + 1).agg(any).fillna(1)\n",
    "    d = pd.concat({\n",
    "        i: d.shift(-i).fillna(1)\n",
    "        for i in range(limit + 1)\n",
    "    }).prod(level=1)\n",
    "\n",
    "    return df.interpolate(limit=limit).where(d.astype(bool))\n",
    "\n",
    "df2 = df2.pipe(interp, 2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Check if both generate the same result "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.testing.assert_frame_equal(df, df2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# The pure `DataFrame.interpolate()` is 50 times faster than method 1... "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = make_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 11 ms, sys: 1.89 ms, total: 12.9 ms\n",
      "Wall time: 11.7 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "foo = df.interpolate(limit=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# My new Method 3 \n",
    "using `cumsum()` of NaNs and `diff()` to calculate the gap width"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def interpolate_nan_max_gap_series(s, maxgap):        \n",
    "    df = pd.DataFrame(s)\n",
    "    \n",
    "    # Calculate NaN gap width\n",
    "    df['nan_gap_width'] = (s\n",
    "                           .isnull()\n",
    "                           .cumsum()\n",
    "                           .loc[~s.isnull()]\n",
    "                           .diff()\n",
    "                          )\n",
    "    df['nan_gap_width'] = df['nan_gap_width'].fillna(method='bfill')\n",
    "                          \n",
    "    # Interpolate all NaNs but fill back in those in the\n",
    "    # gaps that are longer than the allowed limite\n",
    "    nan_ix = s.isnull()\n",
    "    s = s.interpolate()\n",
    "    s[(df['nan_gap_width'] > maxgap) & nan_ix] = pd.np.nan\n",
    "        \n",
    "    return s\n",
    "\n",
    "def interpolate_nan_max_gap_frame(df, maxgap):\n",
    "    for column_name in df:\n",
    "        df[column_name] = interpolate_nan_max_gap_series(df[column_name], maxgap=maxgap)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "df3 = make_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 9.42 ms, sys: 1.82 ms, total: 11.2 ms\n",
      "Wall time: 10.1 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "res = interpolate_nan_max_gap_series(df3.col1, maxgap=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "df3 = make_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 26.1 ms, sys: 2.27 ms, total: 28.4 ms\n",
      "Wall time: 27.1 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "res = interpolate_nan_max_gap_frame(df3, maxgap=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "df3 = make_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.05 ms, sys: 1.22 ms, total: 5.26 ms\n",
      "Wall time: 4.32 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "res = df3.col1.interpolate(limit=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "df3 = make_data()\n",
    "res = interpolate_nan_max_gap_frame(df3, maxgap=2)\n",
    "pd.testing.assert_frame_equal(df2, res)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Testing the performance of different implementations for limiting `DataFrame.interpolate` to only do something for NaN-gaps shorter then a defined limit\n",
	"\n",
	"The code stems from [this stackoverflow thread](https://stackoverflow.com/questions/48933165/pandas-dataframe-interpolating-in-sections-delimited-by-indexes)."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"def make_data():\n",
	" dictx = {'col1':[1,'nan','nan','nan',5,'nan',7,'nan',9,'nan','nan','nan',13]*1000,\n",
	" 'col2':[20,'nan','nan','nan',22,'nan',25,'nan',30,'nan','nan','nan',25]*1000,\n",
	" 'col3':[15,'nan','nan','nan',10,'nan',14,'nan',13,'nan','nan','nan',9]*1000}\n",
	" return pd.DataFrame(dictx).astype(float)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Method 1 "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"df = make_data()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 624 ms, sys: 11.2 ms, total: 635 ms\n",
	"Wall time: 651 ms\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"limit = 2\n",
	"notnull = pd.notnull(df).all(axis=1)\n",
	"# assign group numbers to the rows of df. Each group starts with a non-null row,\n",
	"# followed by null rows\n",
	"group = notnull.cumsum()\n",
	"# find the index of groups having length > limit\n",
	"ignore = (df.groupby(group).filter(lambda grp: len(grp)>limit)).index\n",
	"# only ignore rows which are null\n",
	"ignore = df.loc[~notnull].index.intersection(ignore)\n",
	"keep = df.index.difference(ignore)\n",
	"# interpolate only the kept rows\n",
	"df.loc[keep] = df.loc[keep].interpolate()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Method 2 (this is 4 times slower)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"df2 = make_data()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 2.46 s, sys: 83.3 ms, total: 2.55 s\n",
	"Wall time: 2.54 s\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"def interp(df, limit):\n",
	" d = df.notna().rolling(limit + 1).agg(any).fillna(1)\n",
	" d = pd.concat({\n",
	" i: d.shift(-i).fillna(1)\n",
	" for i in range(limit + 1)\n",
	" }).prod(level=1)\n",
	"\n",
	" return df.interpolate(limit=limit).where(d.astype(bool))\n",
	"\n",
	"df2 = df2.pipe(interp, 2)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Check if both generate the same result "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"pd.testing.assert_frame_equal(df, df2)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# The pure `DataFrame.interpolate()` is 50 times faster than method 1... "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"df = make_data()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 11 ms, sys: 1.89 ms, total: 12.9 ms\n",
	"Wall time: 11.7 ms\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"foo = df.interpolate(limit=2)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# My new Method 3 \n",
	"using `cumsum()` of NaNs and `diff()` to calculate the gap width"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [],
	"source": [
	"def interpolate_nan_max_gap_series(s, maxgap): \n",
	" df = pd.DataFrame(s)\n",
	" \n",
	" # Calculate NaN gap width\n",
	" df['nan_gap_width'] = (s\n",
	" .isnull()\n",
	" .cumsum()\n",
	" .loc[~s.isnull()]\n",
	" .diff()\n",
	" )\n",
	" df['nan_gap_width'] = df['nan_gap_width'].fillna(method='bfill')\n",
	" \n",
	" # Interpolate all NaNs but fill back in those in the\n",
	" # gaps that are longer than the allowed limite\n",
	" nan_ix = s.isnull()\n",
	" s = s.interpolate()\n",
	" s[(df['nan_gap_width'] > maxgap) & nan_ix] = pd.np.nan\n",
	" \n",
	" return s\n",
	"\n",
	"def interpolate_nan_max_gap_frame(df, maxgap):\n",
	" for column_name in df:\n",
	" df[column_name] = interpolate_nan_max_gap_series(df[column_name], maxgap=maxgap)\n",
	" return df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [],
	"source": [
	"df3 = make_data()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 9.42 ms, sys: 1.82 ms, total: 11.2 ms\n",
	"Wall time: 10.1 ms\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"res = interpolate_nan_max_gap_series(df3.col1, maxgap=2)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [],
	"source": [
	"df3 = make_data()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 26.1 ms, sys: 2.27 ms, total: 28.4 ms\n",
	"Wall time: 27.1 ms\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"res = interpolate_nan_max_gap_frame(df3, maxgap=2)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [],
	"source": [
	"df3 = make_data()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 4.05 ms, sys: 1.22 ms, total: 5.26 ms\n",
	"Wall time: 4.32 ms\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"res = df3.col1.interpolate(limit=3)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [],
	"source": [
	"df3 = make_data()\n",
	"res = interpolate_nan_max_gap_frame(df3, maxgap=2)\n",
	"pd.testing.assert_frame_equal(df2, res)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.15"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}