phobson · December 15, 2015 20:42
diff --git a/no_dups.ipynb b/no_dups.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Merging DFs to avoid duplicates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas\n",
    "\n",
    "def addSecondColumnLevel(df, levelval, levelname):\n",
    "    \"\"\" Add a second level to the column-index of a dataframe.\n",
    "    \"\"\"\n",
    "\n",
    "    if isinstance(df.columns, pandas.MultiIndex):\n",
    "        raise ValueError('Dataframe already has MultiIndex on columns')\n",
    "\n",
    "    origlevel = 'quantity'\n",
    "    if df.columns.names[0] is not None:\n",
    "        origlevel = df.columns.names[0]\n",
    "\n",
    "    # define the index\n",
    "    colarray = [[levelval]*len(df.columns), df.columns]\n",
    "    colindex = pandas.MultiIndex.from_arrays(colarray)\n",
    "\n",
    "    # copy the dataframe and redefine the columns\n",
    "    newdf = df.copy()\n",
    "    newdf.columns = colindex\n",
    "    newdf.columns.names = [levelname, origlevel]\n",
    "    return newdf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define the existing destination data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th>db</th>\n",
       "      <th colspan=\"2\" halign=\"left\">destination</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>quantity</th>\n",
       "      <th>adult</th>\n",
       "      <th>bebbeh</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pk_col</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>dog</td>\n",
       "      <td>puppy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>cat</td>\n",
       "      <td>kitten</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>bear</td>\n",
       "      <td>cub</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d</th>\n",
       "      <td>sheep</td>\n",
       "      <td>lamb</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "db       destination        \n",
       "quantity       adult  bebbeh\n",
       "pk_col                      \n",
       "a                dog   puppy\n",
       "b                cat  kitten\n",
       "c               bear     cub\n",
       "d              sheep    lamb"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data1 = {\n",
    "    'pk_col': list('abcd'),\n",
    "    'adult': ['dog', 'cat', 'bear', 'sheep'],\n",
    "    'bebbeh': ['puppy', 'kitten', 'cub', 'lamb'],\n",
    "}\n",
    "df1 = (\n",
    "    pandas.DataFrame(data1)\n",
    "        .set_index(['pk_col']) \n",
    "        .pipe(addSecondColumnLevel, 'destination', 'db')\n",
    ")\n",
    "df1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define the most recent source data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th>db</th>\n",
       "      <th colspan=\"2\" halign=\"left\">source</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>quantity</th>\n",
       "      <th>adult</th>\n",
       "      <th>bebbeh</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pk_col</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>a</th>\n",
       "      <td>dog</td>\n",
       "      <td>puppy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>cat</td>\n",
       "      <td>kitten</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c</th>\n",
       "      <td>bear</td>\n",
       "      <td>cub</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d</th>\n",
       "      <td>sheep</td>\n",
       "      <td>lamb</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>e</th>\n",
       "      <td>chicken</td>\n",
       "      <td>chick</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>f</th>\n",
       "      <td>cow</td>\n",
       "      <td>calf</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "db         source        \n",
       "quantity    adult  bebbeh\n",
       "pk_col                   \n",
       "a             dog   puppy\n",
       "b             cat  kitten\n",
       "c            bear     cub\n",
       "d           sheep    lamb\n",
       "e         chicken   chick\n",
       "f             cow    calf"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data2 = {\n",
    "    'pk_col': list('abcdef'),\n",
    "    'adult': ['dog', 'cat', 'bear', 'sheep', 'chicken', 'cow'],\n",
    "    'bebbeh': ['puppy', 'kitten', 'cub', 'lamb', 'chick', 'calf'],\n",
    "}\n",
    "df2 = (\n",
    "    pandas.DataFrame(data2)\n",
    "        .set_index(['pk_col'])\n",
    "        .pipe(addSecondColumnLevel, 'source', 'db')\n",
    ")\n",
    "df2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Join the dataframes, keep only the rows not in the `destination`, & upload\n",
    "Rows not in the `destination` will be null"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pk_col</th>\n",
       "      <th>adult</th>\n",
       "      <th>bebbeh</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>e</td>\n",
       "      <td>chicken</td>\n",
       "      <td>chick</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>f</td>\n",
       "      <td>cow</td>\n",
       "      <td>calf</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  pk_col    adult bebbeh\n",
       "0      e  chicken  chick\n",
       "1      f      cow   calf"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def get_new_values(df, dstcol='destination', srccol='source'):\n",
    "    not_in_destination = pandas.isnull(df[dstcol])\n",
    "    new_df = (\n",
    "        df.copy()               # make a copy\n",
    "          [not_in_destination]  # nulls out rows where `dest` == `src`\n",
    "          [srccol]              # select only the `source` columns\n",
    "          .dropna(how='all')    # drop null rows\n",
    "    )\n",
    "    new_df.columns.names = [None]\n",
    "    return new_df\n",
    "\n",
    "(\n",
    "    df1.join(df2, how='right')\n",
    "       .pipe(get_new_values)\n",
    "       .reset_index()\n",
    "       #.to_sql('<TABLENAME>', <ENGINE>, if_exists='append', index=False)\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Merging DFs to avoid duplicates"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pandas\n",
	"\n",
	"def addSecondColumnLevel(df, levelval, levelname):\n",
	" \"\"\" Add a second level to the column-index of a dataframe.\n",
	" \"\"\"\n",
	"\n",
	" if isinstance(df.columns, pandas.MultiIndex):\n",
	" raise ValueError('Dataframe already has MultiIndex on columns')\n",
	"\n",
	" origlevel = 'quantity'\n",
	" if df.columns.names[0] is not None:\n",
	" origlevel = df.columns.names[0]\n",
	"\n",
	" # define the index\n",
	" colarray = [[levelval]*len(df.columns), df.columns]\n",
	" colindex = pandas.MultiIndex.from_arrays(colarray)\n",
	"\n",
	" # copy the dataframe and redefine the columns\n",
	" newdf = df.copy()\n",
	" newdf.columns = colindex\n",
	" newdf.columns.names = [levelname, origlevel]\n",
	" return newdf"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Define the existing destination data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr>\n",
	" <th>db</th>\n",
	" <th colspan=\"2\" halign=\"left\">destination</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>quantity</th>\n",
	" <th>adult</th>\n",
	" <th>bebbeh</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>pk_col</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>a</th>\n",
	" <td>dog</td>\n",
	" <td>puppy</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>b</th>\n",
	" <td>cat</td>\n",
	" <td>kitten</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>c</th>\n",
	" <td>bear</td>\n",
	" <td>cub</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>d</th>\n",
	" <td>sheep</td>\n",
	" <td>lamb</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	"db destination \n",
	"quantity adult bebbeh\n",
	"pk_col \n",
	"a dog puppy\n",
	"b cat kitten\n",
	"c bear cub\n",
	"d sheep lamb"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"data1 = {\n",
	" 'pk_col': list('abcd'),\n",
	" 'adult': ['dog', 'cat', 'bear', 'sheep'],\n",
	" 'bebbeh': ['puppy', 'kitten', 'cub', 'lamb'],\n",
	"}\n",
	"df1 = (\n",
	" pandas.DataFrame(data1)\n",
	" .set_index(['pk_col']) \n",
	" .pipe(addSecondColumnLevel, 'destination', 'db')\n",
	")\n",
	"df1"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Define the most recent source data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr>\n",
	" <th>db</th>\n",
	" <th colspan=\"2\" halign=\"left\">source</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>quantity</th>\n",
	" <th>adult</th>\n",
	" <th>bebbeh</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>pk_col</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>a</th>\n",
	" <td>dog</td>\n",
	" <td>puppy</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>b</th>\n",
	" <td>cat</td>\n",
	" <td>kitten</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>c</th>\n",
	" <td>bear</td>\n",
	" <td>cub</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>d</th>\n",
	" <td>sheep</td>\n",
	" <td>lamb</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>e</th>\n",
	" <td>chicken</td>\n",
	" <td>chick</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>f</th>\n",
	" <td>cow</td>\n",
	" <td>calf</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	"db source \n",
	"quantity adult bebbeh\n",
	"pk_col \n",
	"a dog puppy\n",
	"b cat kitten\n",
	"c bear cub\n",
	"d sheep lamb\n",
	"e chicken chick\n",
	"f cow calf"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"data2 = {\n",
	" 'pk_col': list('abcdef'),\n",
	" 'adult': ['dog', 'cat', 'bear', 'sheep', 'chicken', 'cow'],\n",
	" 'bebbeh': ['puppy', 'kitten', 'cub', 'lamb', 'chick', 'calf'],\n",
	"}\n",
	"df2 = (\n",
	" pandas.DataFrame(data2)\n",
	" .set_index(['pk_col'])\n",
	" .pipe(addSecondColumnLevel, 'source', 'db')\n",
	")\n",
	"df2"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Join the dataframes, keep only the rows not in the `destination`, & upload\n",
	"Rows not in the `destination` will be null"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>pk_col</th>\n",
	" <th>adult</th>\n",
	" <th>bebbeh</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>e</td>\n",
	" <td>chicken</td>\n",
	" <td>chick</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>f</td>\n",
	" <td>cow</td>\n",
	" <td>calf</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" pk_col adult bebbeh\n",
	"0 e chicken chick\n",
	"1 f cow calf"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"def get_new_values(df, dstcol='destination', srccol='source'):\n",
	" not_in_destination = pandas.isnull(df[dstcol])\n",
	" new_df = (\n",
	" df.copy() # make a copy\n",
	" [not_in_destination] # nulls out rows where `dest` == `src`\n",
	" [srccol] # select only the `source` columns\n",
	" .dropna(how='all') # drop null rows\n",
	" )\n",
	" new_df.columns.names = [None]\n",
	" return new_df\n",
	"\n",
	"(\n",
	" df1.join(df2, how='right')\n",
	" .pipe(get_new_values)\n",
	" .reset_index()\n",
	" #.to_sql('<TABLENAME>', <ENGINE>, if_exists='append', index=False)\n",
	")"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.4.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}