Created
December 15, 2015 20:42
-
-
Save phobson/4603e138982870db9811 to your computer and use it in GitHub Desktop.
upload only new values from one db to another
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Merging DFs to avoid duplicates" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas\n", | |
"\n", | |
"def addSecondColumnLevel(df, levelval, levelname):\n", | |
" \"\"\" Add a second level to the column-index of a dataframe.\n", | |
" \"\"\"\n", | |
"\n", | |
" if isinstance(df.columns, pandas.MultiIndex):\n", | |
" raise ValueError('Dataframe already has MultiIndex on columns')\n", | |
"\n", | |
" origlevel = 'quantity'\n", | |
" if df.columns.names[0] is not None:\n", | |
" origlevel = df.columns.names[0]\n", | |
"\n", | |
" # define the index\n", | |
" colarray = [[levelval]*len(df.columns), df.columns]\n", | |
" colindex = pandas.MultiIndex.from_arrays(colarray)\n", | |
"\n", | |
" # copy the dataframe and redefine the columns\n", | |
" newdf = df.copy()\n", | |
" newdf.columns = colindex\n", | |
" newdf.columns.names = [levelname, origlevel]\n", | |
" return newdf" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Define the existing destination data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" <th>db</th>\n", | |
" <th colspan=\"2\" halign=\"left\">destination</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>quantity</th>\n", | |
" <th>adult</th>\n", | |
" <th>bebbeh</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>pk_col</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>a</th>\n", | |
" <td>dog</td>\n", | |
" <td>puppy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>b</th>\n", | |
" <td>cat</td>\n", | |
" <td>kitten</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>c</th>\n", | |
" <td>bear</td>\n", | |
" <td>cub</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>d</th>\n", | |
" <td>sheep</td>\n", | |
" <td>lamb</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
"db destination \n", | |
"quantity adult bebbeh\n", | |
"pk_col \n", | |
"a dog puppy\n", | |
"b cat kitten\n", | |
"c bear cub\n", | |
"d sheep lamb" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data1 = {\n", | |
" 'pk_col': list('abcd'),\n", | |
" 'adult': ['dog', 'cat', 'bear', 'sheep'],\n", | |
" 'bebbeh': ['puppy', 'kitten', 'cub', 'lamb'],\n", | |
"}\n", | |
"df1 = (\n", | |
" pandas.DataFrame(data1)\n", | |
" .set_index(['pk_col']) \n", | |
" .pipe(addSecondColumnLevel, 'destination', 'db')\n", | |
")\n", | |
"df1" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Define the most recent source data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" <th>db</th>\n", | |
" <th colspan=\"2\" halign=\"left\">source</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>quantity</th>\n", | |
" <th>adult</th>\n", | |
" <th>bebbeh</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>pk_col</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>a</th>\n", | |
" <td>dog</td>\n", | |
" <td>puppy</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>b</th>\n", | |
" <td>cat</td>\n", | |
" <td>kitten</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>c</th>\n", | |
" <td>bear</td>\n", | |
" <td>cub</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>d</th>\n", | |
" <td>sheep</td>\n", | |
" <td>lamb</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>e</th>\n", | |
" <td>chicken</td>\n", | |
" <td>chick</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>f</th>\n", | |
" <td>cow</td>\n", | |
" <td>calf</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
"db source \n", | |
"quantity adult bebbeh\n", | |
"pk_col \n", | |
"a dog puppy\n", | |
"b cat kitten\n", | |
"c bear cub\n", | |
"d sheep lamb\n", | |
"e chicken chick\n", | |
"f cow calf" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data2 = {\n", | |
" 'pk_col': list('abcdef'),\n", | |
" 'adult': ['dog', 'cat', 'bear', 'sheep', 'chicken', 'cow'],\n", | |
" 'bebbeh': ['puppy', 'kitten', 'cub', 'lamb', 'chick', 'calf'],\n", | |
"}\n", | |
"df2 = (\n", | |
" pandas.DataFrame(data2)\n", | |
" .set_index(['pk_col'])\n", | |
" .pipe(addSecondColumnLevel, 'source', 'db')\n", | |
")\n", | |
"df2" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Join the dataframes, keep only the rows not in the `destination`, & upload\n", | |
"Rows not in the `destination` will be null" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>pk_col</th>\n", | |
" <th>adult</th>\n", | |
" <th>bebbeh</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>e</td>\n", | |
" <td>chicken</td>\n", | |
" <td>chick</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>f</td>\n", | |
" <td>cow</td>\n", | |
" <td>calf</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" pk_col adult bebbeh\n", | |
"0 e chicken chick\n", | |
"1 f cow calf" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"def get_new_values(df, dstcol='destination', srccol='source'):\n", | |
" not_in_destination = pandas.isnull(df[dstcol])\n", | |
" new_df = (\n", | |
" df.copy() # make a copy\n", | |
" [not_in_destination] # nulls out rows where `dest` == `src`\n", | |
" [srccol] # select only the `source` columns\n", | |
" .dropna(how='all') # drop null rows\n", | |
" )\n", | |
" new_df.columns.names = [None]\n", | |
" return new_df\n", | |
"\n", | |
"(\n", | |
" df1.join(df2, how='right')\n", | |
" .pipe(get_new_values)\n", | |
" .reset_index()\n", | |
" #.to_sql('<TABLENAME>', <ENGINE>, if_exists='append', index=False)\n", | |
")" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.4.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment