Skip to content

Instantly share code, notes, and snippets.

@jiffyclub
Last active October 27, 2020 17:02
Show Gist options
  • Save jiffyclub/ac2e7506428d5e1d587b to your computer and use it in GitHub Desktop.
Save jiffyclub/ac2e7506428d5e1d587b to your computer and use it in GitHub Desktop.
Example of a function to compare two DataFrames independent of row/column ordering and with handling of null values.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:9c26f46f87352d6bedc804325404deca8e9cf8b7e2e0c151b7a2635f27e6d447"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"import numpy.testing as npt\n",
"import pandas as pd"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def assert_frames_equal(actual, expected, use_close=False):\n",
" \"\"\"\n",
" Compare DataFrame items by index and column and\n",
" raise AssertionError if any item is not equal.\n",
"\n",
" Ordering is unimportant, items are compared only by label.\n",
" NaN and infinite values are supported.\n",
" \n",
" Parameters\n",
" ----------\n",
" actual : pandas.DataFrame\n",
" expected : pandas.DataFrame\n",
" use_close : bool, optional\n",
" If True, use numpy.testing.assert_allclose instead of\n",
" numpy.testing.assert_equal.\n",
"\n",
" \"\"\"\n",
" if use_close:\n",
" comp = npt.assert_allclose\n",
" else:\n",
" comp = npt.assert_equal\n",
"\n",
" assert (isinstance(actual, pd.DataFrame) and\n",
" isinstance(expected, pd.DataFrame)), \\\n",
" 'Inputs must both be pandas DataFrames.'\n",
"\n",
" for i, exp_row in expected.iterrows():\n",
" assert i in actual.index, 'Expected row {!r} not found.'.format(i)\n",
"\n",
" act_row = actual.loc[i]\n",
"\n",
" for j, exp_item in exp_row.iteritems():\n",
" assert j in act_row.index, \\\n",
" 'Expected column {!r} not found.'.format(j)\n",
"\n",
" act_item = act_row[j]\n",
"\n",
" try:\n",
" comp(act_item, exp_item)\n",
" except AssertionError as e:\n",
" raise AssertionError(\n",
" e.message + '\\n\\nColumn: {!r}\\nRow: {!r}'.format(j, i))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 53
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"expected = pd.DataFrame({'a': [1, np.nan, 3],\n",
" 'b': [np.nan, 5, 6]},\n",
" index=['x', 'y', 'z'])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 54
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"actual = pd.DataFrame([[4, 1],\n",
" [6, 3],\n",
" [5, np.nan]],\n",
" index=['x', 'z', 'y'],\n",
" columns=['b', 'a'])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 55
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sanity check: a frame compared against itself should pass silently.\n",
"assert_frames_equal(actual=actual, expected=actual)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 56
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Demonstration of a failure: at row 'x', column 'b', `actual` holds 4\n",
"# while `expected` holds NaN, so this call is meant to fail.\n",
"assert_frames_equal(actual=actual, expected=expected)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "AssertionError",
"evalue": "\nItems are not equal:\n ACTUAL: 4.0\n DESIRED: nan\n\nColumn: 'b'\nRow: 'x'",
"output_type": "pyerr",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-57-2fa991ae8dd6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0massert_frames_equal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mactual\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexpected\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-53-fedbc359fc19>\u001b[0m in \u001b[0;36massert_frames_equal\u001b[0;34m(actual, expected)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mAssertionError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m raise AssertionError(e.message + \n\u001b[0;32m---> 26\u001b[0;31m '\\n\\nColumn: {!r}\\nRow: {!r}'.format(j, i))\n\u001b[0m",
"\u001b[0;31mAssertionError\u001b[0m: \nItems are not equal:\n ACTUAL: 4.0\n DESIRED: nan\n\nColumn: 'b'\nRow: 'x'"
]
}
],
"prompt_number": 57
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
@pjk1193
Copy link

pjk1193 commented Feb 5, 2018

Hi, Awesome code.
When trying to run the code, if there are multiple errors, only the first error is given.
Is this intended?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment